Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 #include "intltest.h"
     14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     15 
     16 #include "unicode/regex.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/ucnv.h"
     19 #include "unicode/ustring.h"
     20 #include "regextst.h"
     21 #include "uvector.h"
     22 #include "util.h"
     23 #include <stdlib.h>
     24 #include <string.h>
     25 #include <stdio.h>
     26 #include "cstring.h"
     27 #include "uinvchar.h"
     28 
     29 #define SUPPORT_MUTATING_INPUT_STRING   0
     30 
     31 //---------------------------------------------------------------------------
     32 //
     33 //  Test class boilerplate
     34 //
     35 //---------------------------------------------------------------------------
     36 RegexTest::RegexTest()
     37 {
     38 }
     39 
     40 
     41 RegexTest::~RegexTest()
     42 {
     43 }
     44 
     45 
     46 
     47 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     48 {
     49     if (exec) logln("TestSuite RegexTest: ");
     50     switch (index) {
     51 
     52         case 0: name = "Basic";
     53             if (exec) Basic();
     54             break;
     55         case 1: name = "API_Match";
     56             if (exec) API_Match();
     57             break;
     58         case 2: name = "API_Replace";
     59             if (exec) API_Replace();
     60             break;
     61         case 3: name = "API_Pattern";
     62             if (exec) API_Pattern();
     63             break;
     64         case 4:
     65 #if !UCONFIG_NO_FILE_IO
     66             name = "Extended";
     67             if (exec) Extended();
     68 #else
     69             name = "skip";
     70 #endif
     71             break;
     72         case 5: name = "Errors";
     73             if (exec) Errors();
     74             break;
     75         case 6: name = "PerlTests";
     76             if (exec) PerlTests();
     77             break;
     78         case 7: name = "Callbacks";
     79             if (exec) Callbacks();
     80             break;
     81         case 8: name = "FindProgressCallbacks";
     82             if (exec) FindProgressCallbacks();
     83             break;
     84         case 9: name = "Bug 6149";
     85              if (exec) Bug6149();
     86              break;
     87         case 10: name = "UTextBasic";
     88           if (exec) UTextBasic();
     89           break;
     90         case 11: name = "API_Match_UTF8";
     91           if (exec) API_Match_UTF8();
     92           break;
     93         case 12: name = "API_Replace_UTF8";
     94           if (exec) API_Replace_UTF8();
     95           break;
     96         case 13: name = "API_Pattern_UTF8";
     97           if (exec) API_Pattern_UTF8();
     98           break;
     99         case 14: name = "PerlTestsUTF8";
    100           if (exec) PerlTestsUTF8();
    101           break;
    102         case 15: name = "PreAllocatedUTextCAPI";
    103           if (exec) PreAllocatedUTextCAPI();
    104           break;
    105         case 16: name = "Bug 7651";
    106              if (exec) Bug7651();
    107              break;
    108         case 17: name = "Bug 7740";
    109             if (exec) Bug7740();
    110             break;
    111 
    112         default: name = "";
    113             break; //needed to end loop
    114     }
    115 }
    116 
    117 
    118 /**
    119  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    120  * into ASCII.
    121  * @see utext_openUTF8
    122  */
    123 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    124 
    125 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    126 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    127   return utext_openUTF8(ut, inv, length, status);
    128 #else
    129   char buf[1024];
    130 
    131   uprv_aestrncpy((uint8_t*)buf, (const uint8_t*)inv, length);
    132 
    133   return utext_openUTF8(ut, buf, length, status);
    134 #endif
    135 }
    136 
    137 //---------------------------------------------------------------------------
    138 //
    139 //   Error Checking / Reporting macros used in all of the tests.
    140 //
    141 //---------------------------------------------------------------------------
    142 
    143 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    144   int64_t oldIndex = utext_getNativeIndex(text);
    145   utext_setNativeIndex(text, 0);
    146   char *bufPtr = buf;
    147   UChar32 c = utext_next32From(text, 0);
    148   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    149     if (0x000020<=c && c<0x00007e) {
    150       *bufPtr = c;
    151     } else {
    152 #if 0
    153       sprintf(bufPtr,"U+%04X", c);
    154       bufPtr+= strlen(bufPtr)-1;
    155 #else
    156       *bufPtr = '%';
    157 #endif
    158     }
    159     bufPtr++;
    160     c = UTEXT_NEXT32(text);
    161   }
    162   *bufPtr = 0;
    163 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    164   char *ebuf = (char*)malloc(bufLen);
    165   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    166   uprv_strncpy(buf, ebuf, bufLen);
    167   free((void*)ebuf);
    168 #endif
    169   utext_setNativeIndex(text, oldIndex);
    170 }
    171 
    172 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
    173 
    174 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
    175                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
    176 
    177 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
    178 
    179 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    180 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    181     __LINE__, u_errorName(errcode), u_errorName(status));};}
    182 
    183 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    184     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
    185 
    186 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    187     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
    188 
    189 /**
    190  * @param expected expected text in UTF-8 (not platform) codepage
    191  */
    192 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    193     UErrorCode status = U_ZERO_ERROR;
    194     UText expectedText = UTEXT_INITIALIZER;
    195     utext_openUTF8(&expectedText, expected, -1, &status);
    196     if(U_FAILURE(status)) {
    197       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    198       return;
    199     }
    200     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    201       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    202       return;
    203     }
    204     utext_setNativeIndex(actual, 0);
    205     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
    206         char buf[201 /*21*/];
    207         char expectedBuf[201];
    208         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    209         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    210         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    211     }
    212     utext_close(&expectedText);
    213 }
    214 /**
    215  * @param expected invariant (platform local text) input
    216  */
    217 
    218 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    219     UErrorCode status = U_ZERO_ERROR;
    220     UText expectedText = UTEXT_INITIALIZER;
    221     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    222     if(U_FAILURE(status)) {
    223       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    224       return;
    225     }
    226     utext_setNativeIndex(actual, 0);
    227     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
    228         char buf[201 /*21*/];
    229         char expectedBuf[201];
    230         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    231         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    232         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    233     }
    234     utext_close(&expectedText);
    235 }
    236 
    237 /**
    238  * Assumes utf-8 input
    239  */
    240 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
    241 /**
    242  * Assumes Invariant input
    243  */
    244 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    245 
    246 
    247 //---------------------------------------------------------------------------
    248 //
    249 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    250 //                       for the LookingAt() and  Match() functions.
    251 //
    252 //       usage:
    253 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    254 //
    255 //          The expected results are UBool - TRUE or FALSE.
    256 //          The input text is unescaped.  The pattern is not.
    257 //
    258 //
    259 //---------------------------------------------------------------------------
    260 
    261 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    262 
    263 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    264     const UnicodeString pattern(pat, -1, US_INV);
    265     const UnicodeString inputText(text, -1, US_INV);
    266     UErrorCode          status  = U_ZERO_ERROR;
    267     UParseError         pe;
    268     RegexPattern        *REPattern = NULL;
    269     RegexMatcher        *REMatcher = NULL;
    270     UBool               retVal     = TRUE;
    271 
    272     UnicodeString patString(pat, -1, US_INV);
    273     REPattern = RegexPattern::compile(patString, 0, pe, status);
    274     if (U_FAILURE(status)) {
    275         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    276             line, u_errorName(status));
    277         return FALSE;
    278     }
    279     if (line==376) { RegexPatternDump(REPattern);}
    280 
    281     UnicodeString inputString(inputText);
    282     UnicodeString unEscapedInput = inputString.unescape();
    283     REMatcher = REPattern->matcher(unEscapedInput, status);
    284     if (U_FAILURE(status)) {
    285         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    286             line, u_errorName(status));
    287         return FALSE;
    288     }
    289 
    290     UBool actualmatch;
    291     actualmatch = REMatcher->lookingAt(status);
    292     if (U_FAILURE(status)) {
    293         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    294             line, u_errorName(status));
    295         retVal =  FALSE;
    296     }
    297     if (actualmatch != looking) {
    298         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    299         retVal = FALSE;
    300     }
    301 
    302     status = U_ZERO_ERROR;
    303     actualmatch = REMatcher->matches(status);
    304     if (U_FAILURE(status)) {
    305         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    306             line, u_errorName(status));
    307         retVal = FALSE;
    308     }
    309     if (actualmatch != match) {
    310         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    311         retVal = FALSE;
    312     }
    313 
    314     if (retVal == FALSE) {
    315         RegexPatternDump(REPattern);
    316     }
    317 
    318     delete REPattern;
    319     delete REMatcher;
    320     return retVal;
    321 }
    322 
    323 
    324 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    325     UText               pattern    = UTEXT_INITIALIZER;
    326     int32_t             inputUTF8Length;
    327     char                *textChars = NULL;
    328     UText               inputText  = UTEXT_INITIALIZER;
    329     UErrorCode          status     = U_ZERO_ERROR;
    330     UParseError         pe;
    331     RegexPattern        *REPattern = NULL;
    332     RegexMatcher        *REMatcher = NULL;
    333     UBool               retVal     = TRUE;
    334 
    335     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    336     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    337     if (U_FAILURE(status)) {
    338         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    339             line, u_errorName(status));
    340         return FALSE;
    341     }
    342 
    343     UnicodeString inputString(text, -1, US_INV);
    344     UnicodeString unEscapedInput = inputString.unescape();
    345     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    346     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    347 
    348     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    349     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    350         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    351         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    352         return TRUE; // not a failure of the Regex engine
    353     }
    354     status = U_ZERO_ERROR; // buffer overflow
    355     textChars = new char[inputUTF8Length+1];
    356     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    357     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    358 
    359     REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
    360     if (U_FAILURE(status)) {
    361         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    362             line, u_errorName(status));
    363         return FALSE;
    364     }
    365 
    366     UBool actualmatch;
    367     actualmatch = REMatcher->lookingAt(status);
    368     if (U_FAILURE(status)) {
    369         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    370             line, u_errorName(status));
    371         retVal =  FALSE;
    372     }
    373     if (actualmatch != looking) {
    374         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    375         retVal = FALSE;
    376     }
    377 
    378     status = U_ZERO_ERROR;
    379     actualmatch = REMatcher->matches(status);
    380     if (U_FAILURE(status)) {
    381         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    382             line, u_errorName(status));
    383         retVal = FALSE;
    384     }
    385     if (actualmatch != match) {
    386         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    387         retVal = FALSE;
    388     }
    389 
    390     if (retVal == FALSE) {
    391         RegexPatternDump(REPattern);
    392     }
    393 
    394     delete REPattern;
    395     delete REMatcher;
    396     utext_close(&inputText);
    397     utext_close(&pattern);
    398     delete[] textChars;
    399     return retVal;
    400 }
    401 
    402 
    403 
    404 //---------------------------------------------------------------------------
    405 //
    406 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    407 //                       regex tests for incorrect patterns
    408 //
    409 //       usage:
    410 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    411 //
    412 //---------------------------------------------------------------------------
    413 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    414 
    415 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    416                           UErrorCode expectedStatus, int32_t line) {
    417     UnicodeString       pattern(pat);
    418 
    419     UErrorCode          status         = U_ZERO_ERROR;
    420     UParseError         pe;
    421     RegexPattern        *callerPattern = NULL;
    422 
    423     //
    424     //  Compile the caller's pattern
    425     //
    426     UnicodeString patString(pat);
    427     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    428     if (status != expectedStatus) {
    429         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    430     } else {
    431         if (status != U_ZERO_ERROR) {
    432             if (pe.line != errLine || pe.offset != errCol) {
    433                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    434                     line, errLine, errCol, pe.line, pe.offset);
    435             }
    436         }
    437     }
    438 
    439     delete callerPattern;
    440 
    441     //
    442     //  Compile again, using a UTF-8-based UText
    443     //
    444     UText patternText = UTEXT_INITIALIZER;
    445     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    446     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    447     if (status != expectedStatus) {
    448         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    449     } else {
    450         if (status != U_ZERO_ERROR) {
    451             if (pe.line != errLine || pe.offset != errCol) {
    452                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    453                     line, errLine, errCol, pe.line, pe.offset);
    454             }
    455         }
    456     }
    457 
    458     delete callerPattern;
    459     utext_close(&patternText);
    460 }
    461 
    462 
    463 
//---------------------------------------------------------------------------
//
//      Basic      Check for basic functionality of regex pattern matching.
//                 Avoid the use of REGEX_FIND test macro, which has
//                 substantial dependencies on basic Regex functionality.
//
//                 Every check below has the form
//                     REGEX_TESTLM(pattern, input, lookingAt expected, matches expected);
//                 and reports a failure against the source line of the check.
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
// (Disabled scratch area: a test case under investigation can be pasted here
//  and enabled so that it runs, and the program exits, before anything else.)
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
    612 
    613 
    614 //---------------------------------------------------------------------------
    615 //
    616 //    UTextBasic   Check for quirks that are specific to the UText
    617 //                 implementation.
    618 //
    619 //---------------------------------------------------------------------------
    620 void RegexTest::UTextBasic() {
    621     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    622     UErrorCode status = U_ZERO_ERROR;
    623     UText pattern = UTEXT_INITIALIZER;
    624     utext_openUTF8(&pattern, str_abc, -1, &status);
    625     RegexMatcher matcher(&pattern, 0, status);
    626     REGEX_CHECK_STATUS;
    627 
    628     UText input = UTEXT_INITIALIZER;
    629     utext_openUTF8(&input, str_abc, -1, &status);
    630     REGEX_CHECK_STATUS;
    631     matcher.reset(&input);
    632     REGEX_CHECK_STATUS;
    633     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    634 
    635     matcher.reset(matcher.inputText());
    636     REGEX_CHECK_STATUS;
    637     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    638 
    639     utext_close(&pattern);
    640     utext_close(&input);
    641 }
    642 
    643 
    644 //---------------------------------------------------------------------------
    645 //
    646 //      API_Match   Test that the API for class RegexMatcher
    647 //                  is present and nominally working, but excluding functions
    648 //                  implementing replace operations.
    649 //
    650 //---------------------------------------------------------------------------
    651 void RegexTest::API_Match() {
    652     UParseError         pe;
    653     UErrorCode          status=U_ZERO_ERROR;
    654     int32_t             flags = 0;
    655 
    656     //
    657     // Debug - slide failing test cases early
    658     //
    659 #if 0
    660     {
    661     }
    662     return;
    663 #endif
    664 
    665     //
    666     // Simple pattern compilation
    667     //
    668     {
    669         UnicodeString       re("abc");
    670         RegexPattern        *pat2;
    671         pat2 = RegexPattern::compile(re, flags, pe, status);
    672         REGEX_CHECK_STATUS;
    673 
    674         UnicodeString inStr1 = "abcdef this is a test";
    675         UnicodeString instr2 = "not abc";
    676         UnicodeString empty  = "";
    677 
    678 
    679         //
    680         // Matcher creation and reset.
    681         //
    682         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    683         REGEX_CHECK_STATUS;
    684         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    685         REGEX_ASSERT(m1->input() == inStr1);
    686         m1->reset(instr2);
    687         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    688         REGEX_ASSERT(m1->input() == instr2);
    689         m1->reset(inStr1);
    690         REGEX_ASSERT(m1->input() == inStr1);
    691         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    692         m1->reset(empty);
    693         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    694         REGEX_ASSERT(m1->input() == empty);
    695         REGEX_ASSERT(&m1->pattern() == pat2);
    696 
    697         //
    698         //  reset(pos, status)
    699         //
    700         m1->reset(inStr1);
    701         m1->reset(4, status);
    702         REGEX_CHECK_STATUS;
    703         REGEX_ASSERT(m1->input() == inStr1);
    704         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    705 
    706         m1->reset(-1, status);
    707         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    708         status = U_ZERO_ERROR;
    709 
    710         m1->reset(0, status);
    711         REGEX_CHECK_STATUS;
    712         status = U_ZERO_ERROR;
    713 
    714         int32_t len = m1->input().length();
    715         m1->reset(len-1, status);
    716         REGEX_CHECK_STATUS;
    717         status = U_ZERO_ERROR;
    718 
    719         m1->reset(len, status);
    720         REGEX_CHECK_STATUS;
    721         status = U_ZERO_ERROR;
    722 
    723         m1->reset(len+1, status);
    724         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    725         status = U_ZERO_ERROR;
    726 
    727         //
    728         // match(pos, status)
    729         //
    730         m1->reset(instr2);
    731         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    732         m1->reset();
    733         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    734         m1->reset();
    735         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    736         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    737         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    738         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    739 
    740         // Match() at end of string should fail, but should not
    741         //  be an error.
    742         status = U_ZERO_ERROR;
    743         len = m1->input().length();
    744         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    745         REGEX_CHECK_STATUS;
    746 
    747         // Match beyond end of string should fail with an error.
    748         status = U_ZERO_ERROR;
    749         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    750         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    751 
    752         // Successful match at end of string.
    753         {
    754             status = U_ZERO_ERROR;
    755             RegexMatcher m("A?", 0, status);  // will match zero length string.
    756             REGEX_CHECK_STATUS;
    757             m.reset(inStr1);
    758             len = inStr1.length();
    759             REGEX_ASSERT(m.matches(len, status) == TRUE);
    760             REGEX_CHECK_STATUS;
    761             m.reset(empty);
    762             REGEX_ASSERT(m.matches(0, status) == TRUE);
    763             REGEX_CHECK_STATUS;
    764         }
    765 
    766 
    767         //
    768         // lookingAt(pos, status)
    769         //
    770         status = U_ZERO_ERROR;
    771         m1->reset(instr2);  // "not abc"
    772         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    773         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    774         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    775         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    776         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    777         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    778         status = U_ZERO_ERROR;
    779         len = m1->input().length();
    780         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    781         REGEX_CHECK_STATUS;
    782         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    783         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    784 
    785         delete m1;
    786         delete pat2;
    787     }
    788 
    789 
    790     //
    791     // Capture Group.
    792     //     RegexMatcher::start();
    793     //     RegexMatcher::end();
    794     //     RegexMatcher::groupCount();
    795     //
    796     {
    797         int32_t             flags=0;
    798         UParseError         pe;
    799         UErrorCode          status=U_ZERO_ERROR;
    800 
    801         UnicodeString       re("01(23(45)67)(.*)");
    802         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    803         REGEX_CHECK_STATUS;
    804         UnicodeString data = "0123456789";
    805 
    806         RegexMatcher *matcher = pat->matcher(data, status);
    807         REGEX_CHECK_STATUS;
    808         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
    809         static const int32_t matchStarts[] = {0,  2, 4, 8};
    810         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    811         int32_t i;
    812         for (i=0; i<4; i++) {
    813             int32_t actualStart = matcher->start(i, status);
    814             REGEX_CHECK_STATUS;
    815             if (actualStart != matchStarts[i]) {
    816                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    817                     __LINE__, i, matchStarts[i], actualStart);
    818             }
    819             int32_t actualEnd = matcher->end(i, status);
    820             REGEX_CHECK_STATUS;
    821             if (actualEnd != matchEnds[i]) {
    822                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    823                     __LINE__, i, matchEnds[i], actualEnd);
    824             }
    825         }
    826 
    827         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    828         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    829 
    830         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    831         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    832         matcher->reset();
    833         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    834 
    835         matcher->lookingAt(status);
    836         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    837         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    838         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    839         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    840         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    841         REGEX_CHECK_STATUS;
    842         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    843         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    844         matcher->reset();
    845         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    846 
    847         delete matcher;
    848         delete pat;
    849 
    850     }
    851 
    852     //
    853     //  find
    854     //
    855     {
    856         int32_t             flags=0;
    857         UParseError         pe;
    858         UErrorCode          status=U_ZERO_ERROR;
    859 
    860         UnicodeString       re("abc");
    861         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    862         REGEX_CHECK_STATUS;
    863         UnicodeString data = ".abc..abc...abc..";
    864         //                    012345678901234567
    865 
    866         RegexMatcher *matcher = pat->matcher(data, status);
    867         REGEX_CHECK_STATUS;
    868         REGEX_ASSERT(matcher->find());
    869         REGEX_ASSERT(matcher->start(status) == 1);
    870         REGEX_ASSERT(matcher->find());
    871         REGEX_ASSERT(matcher->start(status) == 6);
    872         REGEX_ASSERT(matcher->find());
    873         REGEX_ASSERT(matcher->start(status) == 12);
    874         REGEX_ASSERT(matcher->find() == FALSE);
    875         REGEX_ASSERT(matcher->find() == FALSE);
    876 
    877         matcher->reset();
    878         REGEX_ASSERT(matcher->find());
    879         REGEX_ASSERT(matcher->start(status) == 1);
    880 
    881         REGEX_ASSERT(matcher->find(0, status));
    882         REGEX_ASSERT(matcher->start(status) == 1);
    883         REGEX_ASSERT(matcher->find(1, status));
    884         REGEX_ASSERT(matcher->start(status) == 1);
    885         REGEX_ASSERT(matcher->find(2, status));
    886         REGEX_ASSERT(matcher->start(status) == 6);
    887         REGEX_ASSERT(matcher->find(12, status));
    888         REGEX_ASSERT(matcher->start(status) == 12);
    889         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    890         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    891         REGEX_ASSERT(matcher->find(17, status) == FALSE);
    892         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
    893 
    894         status = U_ZERO_ERROR;
    895         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    896         status = U_ZERO_ERROR;
    897         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
    898 
    899         REGEX_ASSERT(matcher->groupCount() == 0);
    900 
    901         delete matcher;
    902         delete pat;
    903     }
    904 
    905 
    906     //
    907     //  find, with \G in pattern (true if at the end of a previous match).
    908     //
    909     {
    910         int32_t             flags=0;
    911         UParseError         pe;
    912         UErrorCode          status=U_ZERO_ERROR;
    913 
    914         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
    915         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    916         REGEX_CHECK_STATUS;
    917         UnicodeString data = ".abcabc.abc..";
    918         //                    012345678901234567
    919 
    920         RegexMatcher *matcher = pat->matcher(data, status);
    921         REGEX_CHECK_STATUS;
    922         REGEX_ASSERT(matcher->find());
    923         REGEX_ASSERT(matcher->start(status) == 0);
    924         REGEX_ASSERT(matcher->start(1, status) == -1);
    925         REGEX_ASSERT(matcher->start(2, status) == 1);
    926 
    927         REGEX_ASSERT(matcher->find());
    928         REGEX_ASSERT(matcher->start(status) == 4);
    929         REGEX_ASSERT(matcher->start(1, status) == 4);
    930         REGEX_ASSERT(matcher->start(2, status) == -1);
    931         REGEX_CHECK_STATUS;
    932 
    933         delete matcher;
    934         delete pat;
    935     }
    936 
    937     //
    938     //   find with zero length matches, match position should bump ahead
    939     //     to prevent loops.
    940     //
    941     {
    942         int32_t                 i;
    943         UErrorCode          status=U_ZERO_ERROR;
    944         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
    945                                                       //   using an always-true look-ahead.
    946         REGEX_CHECK_STATUS;
    947         UnicodeString s("    ");
    948         m.reset(s);
    949         for (i=0; ; i++) {
    950             if (m.find() == FALSE) {
    951                 break;
    952             }
    953             REGEX_ASSERT(m.start(status) == i);
    954             REGEX_ASSERT(m.end(status) == i);
    955         }
    956         REGEX_ASSERT(i==5);
    957 
    958         // Check that the bump goes over surrogate pairs OK
    959         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
    960         s = s.unescape();
    961         m.reset(s);
    962         for (i=0; ; i+=2) {
    963             if (m.find() == FALSE) {
    964                 break;
    965             }
    966             REGEX_ASSERT(m.start(status) == i);
    967             REGEX_ASSERT(m.end(status) == i);
    968         }
    969         REGEX_ASSERT(i==10);
    970     }
    971     {
    972         // find() loop breaking test.
    973         //        with pattern of /.?/, should see a series of one char matches, then a single
    974         //        match of zero length at the end of the input string.
    975         int32_t                 i;
    976         UErrorCode          status=U_ZERO_ERROR;
    977         RegexMatcher        m(".?", 0, status);
    978         REGEX_CHECK_STATUS;
    979         UnicodeString s("    ");
    980         m.reset(s);
    981         for (i=0; ; i++) {
    982             if (m.find() == FALSE) {
    983                 break;
    984             }
    985             REGEX_ASSERT(m.start(status) == i);
    986             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
    987         }
    988         REGEX_ASSERT(i==5);
    989     }
    990 
    991 
    992     //
    993     // Matchers with no input string behave as if they had an empty input string.
    994     //
    995 
    996     {
    997         UErrorCode status = U_ZERO_ERROR;
    998         RegexMatcher  m(".?", 0, status);
    999         REGEX_CHECK_STATUS;
   1000         REGEX_ASSERT(m.find());
   1001         REGEX_ASSERT(m.start(status) == 0);
   1002         REGEX_ASSERT(m.input() == "");
   1003     }
   1004     {
   1005         UErrorCode status = U_ZERO_ERROR;
   1006         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1007         RegexMatcher  *m = p->matcher(status);
   1008         REGEX_CHECK_STATUS;
   1009 
   1010         REGEX_ASSERT(m->find() == FALSE);
   1011         REGEX_ASSERT(m->input() == "");
   1012         delete m;
   1013         delete p;
   1014     }
   1015 
   1016     //
   1017     // Regions
   1018     //
   1019     {
   1020         UErrorCode status = U_ZERO_ERROR;
   1021         UnicodeString testString("This is test data");
   1022         RegexMatcher m(".*", testString,  0, status);
   1023         REGEX_CHECK_STATUS;
   1024         REGEX_ASSERT(m.regionStart() == 0);
   1025         REGEX_ASSERT(m.regionEnd() == testString.length());
   1026         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1027         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1028 
   1029         m.region(2,4, status);
   1030         REGEX_CHECK_STATUS;
   1031         REGEX_ASSERT(m.matches(status));
   1032         REGEX_ASSERT(m.start(status)==2);
   1033         REGEX_ASSERT(m.end(status)==4);
   1034         REGEX_CHECK_STATUS;
   1035 
   1036         m.reset();
   1037         REGEX_ASSERT(m.regionStart() == 0);
   1038         REGEX_ASSERT(m.regionEnd() == testString.length());
   1039 
   1040         UnicodeString shorterString("short");
   1041         m.reset(shorterString);
   1042         REGEX_ASSERT(m.regionStart() == 0);
   1043         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1044 
   1045         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1046         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1047         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1048         REGEX_ASSERT(&m == &m.reset());
   1049         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1050 
   1051         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1052         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1053         REGEX_ASSERT(&m == &m.reset());
   1054         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1055 
   1056         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1057         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1058         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1059         REGEX_ASSERT(&m == &m.reset());
   1060         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1061 
   1062         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1063         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1064         REGEX_ASSERT(&m == &m.reset());
   1065         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1066 
   1067     }
   1068 
   1069     //
   1070     // hitEnd() and requireEnd()
   1071     //
   1072     {
   1073         UErrorCode status = U_ZERO_ERROR;
   1074         UnicodeString testString("aabb");
   1075         RegexMatcher m1(".*", testString,  0, status);
   1076         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1077         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1078         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1079         REGEX_CHECK_STATUS;
   1080 
   1081         status = U_ZERO_ERROR;
   1082         RegexMatcher m2("a*", testString, 0, status);
   1083         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1084         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1085         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1086         REGEX_CHECK_STATUS;
   1087 
   1088         status = U_ZERO_ERROR;
   1089         RegexMatcher m3(".*$", testString, 0, status);
   1090         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1091         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1092         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1093         REGEX_CHECK_STATUS;
   1094     }
   1095 
   1096 
   1097     //
   1098     // Compilation error on reset with UChar *
   1099     //   These were a hazard that people were stumbling over with runtime errors.
   1100     //   Changed them to compiler errors by adding private methods that more closely
   1101     //   matched the incorrect use of the functions.
   1102     //
   1103 #if 0
   1104     {
   1105         UErrorCode status = U_ZERO_ERROR;
   1106         UChar ucharString[20];
   1107         RegexMatcher m(".", 0, status);
   1108         m.reset(ucharString);  // should not compile.
   1109 
   1110         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1111         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1112 
   1113         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1114     }
   1115 #endif
   1116 
   1117     //
   1118     //  Time Outs.
   1119     //       Note:  These tests will need to be changed when the regexp engine is
   1120     //              able to detect and cut short the exponential time behavior on
   1121     //              this type of match.
   1122     //
   1123     {
   1124         UErrorCode status = U_ZERO_ERROR;
   1125         //    Enough 'a's in the string to cause the match to time out.
   1126         //       (Each additional 'a' doubles the time)
   1127         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1128         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1129         REGEX_CHECK_STATUS;
   1130         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1131         matcher.setTimeLimit(100, status);
   1132         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1133         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1134         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1135     }
   1136     {
   1137         UErrorCode status = U_ZERO_ERROR;
   1138         //   Few enough 'a's to slip in under the time limit.
   1139         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1140         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1141         REGEX_CHECK_STATUS;
   1142         matcher.setTimeLimit(100, status);
   1143         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1144         REGEX_CHECK_STATUS;
   1145     }
   1146 
   1147     //
   1148     //  Stack Limits
   1149     //
   1150     {
   1151         UErrorCode status = U_ZERO_ERROR;
   1152         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1153 
   1154         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1155         //   of the '+', and makes the stack frames larger.
   1156         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1157 
   1158         // With the default stack, this match should fail to run
   1159         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1160         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1161 
   1162         // With unlimited stack, it should run
   1163         status = U_ZERO_ERROR;
   1164         matcher.setStackLimit(0, status);
   1165         REGEX_CHECK_STATUS;
   1166         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1167         REGEX_CHECK_STATUS;
   1168         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1169 
   1170         // With a limited stack, the match should fail
   1171         status = U_ZERO_ERROR;
   1172         matcher.setStackLimit(10000, status);
   1173         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1174         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1175         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1176     }
   1177 
   1178         // A pattern that doesn't save state should work with
   1179         //   a minimal sized stack
   1180     {
   1181         UErrorCode status = U_ZERO_ERROR;
   1182         UnicodeString testString = "abc";
   1183         RegexMatcher matcher("abc", testString, 0, status);
   1184         REGEX_CHECK_STATUS;
   1185         matcher.setStackLimit(30, status);
   1186         REGEX_CHECK_STATUS;
   1187         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1188         REGEX_CHECK_STATUS;
   1189         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1190 
   1191         // Negative stack sizes should fail
   1192         status = U_ZERO_ERROR;
   1193         matcher.setStackLimit(1000, status);
   1194         REGEX_CHECK_STATUS;
   1195         matcher.setStackLimit(-1, status);
   1196         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1197         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1198     }
   1199 
   1200 
   1201 }
   1202 
   1203 
   1204 
   1205 
   1206 
   1207 
   1208 //---------------------------------------------------------------------------
   1209 //
   1210 //      API_Replace        API test for class RegexMatcher, testing the
   1211 //                         Replace family of functions.
   1212 //
   1213 //---------------------------------------------------------------------------
   1214 void RegexTest::API_Replace() {
   1215     //
   1216     //  Replace
   1217     //
   1218     int32_t             flags=0;
   1219     UParseError         pe;
   1220     UErrorCode          status=U_ZERO_ERROR;
   1221 
   1222     UnicodeString       re("abc");
   1223     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1224     REGEX_CHECK_STATUS;
   1225     UnicodeString data = ".abc..abc...abc..";
   1226     //                    012345678901234567
   1227     RegexMatcher *matcher = pat->matcher(data, status);
   1228 
   1229     //
   1230     //  Plain vanilla matches.
   1231     //
   1232     UnicodeString  dest;
   1233     dest = matcher->replaceFirst("yz", status);
   1234     REGEX_CHECK_STATUS;
   1235     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1236 
   1237     dest = matcher->replaceAll("yz", status);
   1238     REGEX_CHECK_STATUS;
   1239     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1240 
   1241     //
   1242     //  Plain vanilla non-matches.
   1243     //
   1244     UnicodeString d2 = ".abx..abx...abx..";
   1245     matcher->reset(d2);
   1246     dest = matcher->replaceFirst("yz", status);
   1247     REGEX_CHECK_STATUS;
   1248     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1249 
   1250     dest = matcher->replaceAll("yz", status);
   1251     REGEX_CHECK_STATUS;
   1252     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1253 
   1254     //
   1255     // Empty source string
   1256     //
   1257     UnicodeString d3 = "";
   1258     matcher->reset(d3);
   1259     dest = matcher->replaceFirst("yz", status);
   1260     REGEX_CHECK_STATUS;
   1261     REGEX_ASSERT(dest == "");
   1262 
   1263     dest = matcher->replaceAll("yz", status);
   1264     REGEX_CHECK_STATUS;
   1265     REGEX_ASSERT(dest == "");
   1266 
   1267     //
   1268     // Empty substitution string
   1269     //
   1270     matcher->reset(data);              // ".abc..abc...abc.."
   1271     dest = matcher->replaceFirst("", status);
   1272     REGEX_CHECK_STATUS;
   1273     REGEX_ASSERT(dest == "...abc...abc..");
   1274 
   1275     dest = matcher->replaceAll("", status);
   1276     REGEX_CHECK_STATUS;
   1277     REGEX_ASSERT(dest == "........");
   1278 
   1279     //
   1280     // match whole string
   1281     //
   1282     UnicodeString d4 = "abc";
   1283     matcher->reset(d4);
   1284     dest = matcher->replaceFirst("xyz", status);
   1285     REGEX_CHECK_STATUS;
   1286     REGEX_ASSERT(dest == "xyz");
   1287 
   1288     dest = matcher->replaceAll("xyz", status);
   1289     REGEX_CHECK_STATUS;
   1290     REGEX_ASSERT(dest == "xyz");
   1291 
   1292     //
   1293     // Capture Group, simple case
   1294     //
   1295     UnicodeString       re2("a(..)");
   1296     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1297     REGEX_CHECK_STATUS;
   1298     UnicodeString d5 = "abcdefg";
   1299     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1300     REGEX_CHECK_STATUS;
   1301     dest = matcher2->replaceFirst("$1$1", status);
   1302     REGEX_CHECK_STATUS;
   1303     REGEX_ASSERT(dest == "bcbcdefg");
   1304 
   1305     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1306     REGEX_CHECK_STATUS;
   1307     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1308 
   1309     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1310     REGEX_CHECK_STATUS;
   1311     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1312 
   1313     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1314     replacement = replacement.unescape();
   1315     dest = matcher2->replaceFirst(replacement, status);
   1316     REGEX_CHECK_STATUS;
   1317     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1318 
   1319     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1320 
   1321 
   1322     //
   1323     // Replacement String with \u hex escapes
   1324     //
   1325     {
   1326         UnicodeString  src = "abc 1 abc 2 abc 3";
   1327         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1328         matcher->reset(src);
   1329         UnicodeString  result = matcher->replaceAll(substitute, status);
   1330         REGEX_CHECK_STATUS;
   1331         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1332     }
   1333     {
   1334         UnicodeString  src = "abc !";
   1335         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1336         matcher->reset(src);
   1337         UnicodeString  result = matcher->replaceAll(substitute, status);
   1338         REGEX_CHECK_STATUS;
   1339         UnicodeString expected = UnicodeString("--");
   1340         expected.append((UChar32)0x10000);
   1341         expected.append("-- !");
   1342         REGEX_ASSERT(result == expected);
   1343     }
   1344     // TODO:  need more through testing of capture substitutions.
   1345 
   1346     // Bug 4057
   1347     //
   1348     {
   1349         status = U_ZERO_ERROR;
   1350         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1351         RegexMatcher m("ss(.*?)ee", 0, status);
   1352         REGEX_CHECK_STATUS;
   1353         UnicodeString result;
   1354 
   1355         // Multiple finds do NOT bump up the previous appendReplacement postion.
   1356         m.reset(s);
   1357         m.find();
   1358         m.find();
   1359         m.appendReplacement(result, "ooh", status);
   1360         REGEX_CHECK_STATUS;
   1361         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1362 
   1363         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
   1364         status = U_ZERO_ERROR;
   1365         result.truncate(0);
   1366         m.reset(10, status);
   1367         m.find();
   1368         m.find();
   1369         m.appendReplacement(result, "ooh", status);
   1370         REGEX_CHECK_STATUS;
   1371         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1372 
   1373         // find() at interior of string, appendReplacemnt still starts at beginning.
   1374         status = U_ZERO_ERROR;
   1375         result.truncate(0);
   1376         m.reset();
   1377         m.find(10, status);
   1378         m.find();
   1379         m.appendReplacement(result, "ooh", status);
   1380         REGEX_CHECK_STATUS;
   1381         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1382 
   1383         m.appendTail(result);
   1384         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1385 
   1386     }
   1387 
   1388     delete matcher2;
   1389     delete pat2;
   1390     delete matcher;
   1391     delete pat;
   1392 }
   1393 
   1394 
   1395 //---------------------------------------------------------------------------
   1396 //
   1397 //      API_Pattern       Test that the API for class RegexPattern is
   1398 //                        present and nominally working.
   1399 //
   1400 //---------------------------------------------------------------------------
   1401 void RegexTest::API_Pattern() {
   1402     RegexPattern        pata;    // Test default constructor to not crash.
   1403     RegexPattern        patb;
   1404 
   1405     REGEX_ASSERT(pata == patb);
   1406     REGEX_ASSERT(pata == pata);
   1407 
   1408     UnicodeString re1("abc[a-l][m-z]");
   1409     UnicodeString re2("def");
   1410     UErrorCode    status = U_ZERO_ERROR;
   1411     UParseError   pe;
   1412 
   1413     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1414     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1415     REGEX_CHECK_STATUS;
   1416     REGEX_ASSERT(*pat1 == *pat1);
   1417     REGEX_ASSERT(*pat1 != pata);
   1418 
   1419     // Assign
   1420     patb = *pat1;
   1421     REGEX_ASSERT(patb == *pat1);
   1422 
   1423     // Copy Construct
   1424     RegexPattern patc(*pat1);
   1425     REGEX_ASSERT(patc == *pat1);
   1426     REGEX_ASSERT(patb == patc);
   1427     REGEX_ASSERT(pat1 != pat2);
   1428     patb = *pat2;
   1429     REGEX_ASSERT(patb != patc);
   1430     REGEX_ASSERT(patb == *pat2);
   1431 
   1432     // Compile with no flags.
   1433     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1434     REGEX_ASSERT(*pat1a == *pat1);
   1435 
   1436     REGEX_ASSERT(pat1a->flags() == 0);
   1437 
   1438     // Compile with different flags should be not equal
   1439     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1440     REGEX_CHECK_STATUS;
   1441 
   1442     REGEX_ASSERT(*pat1b != *pat1a);
   1443     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1444     REGEX_ASSERT(pat1a->flags() == 0);
   1445     delete pat1b;
   1446 
   1447     // clone
   1448     RegexPattern *pat1c = pat1->clone();
   1449     REGEX_ASSERT(*pat1c == *pat1);
   1450     REGEX_ASSERT(*pat1c != *pat2);
   1451 
   1452     delete pat1c;
   1453     delete pat1a;
   1454     delete pat1;
   1455     delete pat2;
   1456 
   1457 
   1458     //
   1459     //   Verify that a matcher created from a cloned pattern works.
   1460     //     (Jitterbug 3423)
   1461     //
   1462     {
   1463         UErrorCode     status     = U_ZERO_ERROR;
   1464         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1465         RegexPattern  *pClone     = pSource->clone();
   1466         delete         pSource;
   1467         RegexMatcher  *mFromClone = pClone->matcher(status);
   1468         REGEX_CHECK_STATUS;
   1469         UnicodeString s = "Hello World";
   1470         mFromClone->reset(s);
   1471         REGEX_ASSERT(mFromClone->find() == TRUE);
   1472         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1473         REGEX_ASSERT(mFromClone->find() == TRUE);
   1474         REGEX_ASSERT(mFromClone->group(status) == "World");
   1475         REGEX_ASSERT(mFromClone->find() == FALSE);
   1476         delete mFromClone;
   1477         delete pClone;
   1478     }
   1479 
   1480     //
   1481     //   matches convenience API
   1482     //
   1483     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1484     REGEX_CHECK_STATUS;
   1485     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1486     REGEX_CHECK_STATUS;
   1487     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1488     REGEX_CHECK_STATUS;
   1489     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1490     REGEX_CHECK_STATUS;
   1491     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1492     REGEX_CHECK_STATUS;
   1493     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1494     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1495     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1496 
   1497 
   1498     //
   1499     // Split()
   1500     //
   1501     status = U_ZERO_ERROR;
   1502     pat1 = RegexPattern::compile(" +",  pe, status);
   1503     REGEX_CHECK_STATUS;
   1504     UnicodeString  fields[10];
   1505 
   1506     int32_t n;
   1507     n = pat1->split("Now is the time", fields, 10, status);
   1508     REGEX_CHECK_STATUS;
   1509     REGEX_ASSERT(n==4);
   1510     REGEX_ASSERT(fields[0]=="Now");
   1511     REGEX_ASSERT(fields[1]=="is");
   1512     REGEX_ASSERT(fields[2]=="the");
   1513     REGEX_ASSERT(fields[3]=="time");
   1514     REGEX_ASSERT(fields[4]=="");
   1515 
   1516     n = pat1->split("Now is the time", fields, 2, status);
   1517     REGEX_CHECK_STATUS;
   1518     REGEX_ASSERT(n==2);
   1519     REGEX_ASSERT(fields[0]=="Now");
   1520     REGEX_ASSERT(fields[1]=="is the time");
   1521     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1522 
   1523     fields[1] = "*";
   1524     status = U_ZERO_ERROR;
   1525     n = pat1->split("Now is the time", fields, 1, status);
   1526     REGEX_CHECK_STATUS;
   1527     REGEX_ASSERT(n==1);
   1528     REGEX_ASSERT(fields[0]=="Now is the time");
   1529     REGEX_ASSERT(fields[1]=="*");
   1530     status = U_ZERO_ERROR;
   1531 
   1532     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1533     REGEX_CHECK_STATUS;
   1534     REGEX_ASSERT(n==5);
   1535     REGEX_ASSERT(fields[0]=="");
   1536     REGEX_ASSERT(fields[1]=="Now");
   1537     REGEX_ASSERT(fields[2]=="is");
   1538     REGEX_ASSERT(fields[3]=="the");
   1539     REGEX_ASSERT(fields[4]=="time");
   1540     REGEX_ASSERT(fields[5]=="");
   1541 
   1542     n = pat1->split("     ", fields, 10, status);
   1543     REGEX_CHECK_STATUS;
   1544     REGEX_ASSERT(n==1);
   1545     REGEX_ASSERT(fields[0]=="");
   1546 
   1547     fields[0] = "foo";
   1548     n = pat1->split("", fields, 10, status);
   1549     REGEX_CHECK_STATUS;
   1550     REGEX_ASSERT(n==0);
   1551     REGEX_ASSERT(fields[0]=="foo");
   1552 
   1553     delete pat1;
   1554 
   1555     //  split, with a pattern with (capture)
   1556     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1557     REGEX_CHECK_STATUS;
   1558 
   1559     status = U_ZERO_ERROR;
   1560     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1561     REGEX_CHECK_STATUS;
   1562     REGEX_ASSERT(n==6);
   1563     REGEX_ASSERT(fields[0]=="");
   1564     REGEX_ASSERT(fields[1]=="a");
   1565     REGEX_ASSERT(fields[2]=="Now is ");
   1566     REGEX_ASSERT(fields[3]=="b");
   1567     REGEX_ASSERT(fields[4]=="the time");
   1568     REGEX_ASSERT(fields[5]=="c");
   1569     REGEX_ASSERT(fields[6]=="");
   1570     REGEX_ASSERT(status==U_ZERO_ERROR);
   1571 
   1572     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1573     REGEX_CHECK_STATUS;
   1574     REGEX_ASSERT(n==6);
   1575     REGEX_ASSERT(fields[0]=="  ");
   1576     REGEX_ASSERT(fields[1]=="a");
   1577     REGEX_ASSERT(fields[2]=="Now is ");
   1578     REGEX_ASSERT(fields[3]=="b");
   1579     REGEX_ASSERT(fields[4]=="the time");
   1580     REGEX_ASSERT(fields[5]=="c");
   1581     REGEX_ASSERT(fields[6]=="");
   1582 
   1583     status = U_ZERO_ERROR;
   1584     fields[6] = "foo";
   1585     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1586     REGEX_CHECK_STATUS;
   1587     REGEX_ASSERT(n==6);
   1588     REGEX_ASSERT(fields[0]=="  ");
   1589     REGEX_ASSERT(fields[1]=="a");
   1590     REGEX_ASSERT(fields[2]=="Now is ");
   1591     REGEX_ASSERT(fields[3]=="b");
   1592     REGEX_ASSERT(fields[4]=="the time");
   1593     REGEX_ASSERT(fields[5]=="c");
   1594     REGEX_ASSERT(fields[6]=="foo");
   1595 
   1596     status = U_ZERO_ERROR;
   1597     fields[5] = "foo";
   1598     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1599     REGEX_CHECK_STATUS;
   1600     REGEX_ASSERT(n==5);
   1601     REGEX_ASSERT(fields[0]=="  ");
   1602     REGEX_ASSERT(fields[1]=="a");
   1603     REGEX_ASSERT(fields[2]=="Now is ");
   1604     REGEX_ASSERT(fields[3]=="b");
   1605     REGEX_ASSERT(fields[4]=="the time<c>");
   1606     REGEX_ASSERT(fields[5]=="foo");
   1607 
   1608     status = U_ZERO_ERROR;
   1609     fields[5] = "foo";
   1610     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1611     REGEX_CHECK_STATUS;
   1612     REGEX_ASSERT(n==5);
   1613     REGEX_ASSERT(fields[0]=="  ");
   1614     REGEX_ASSERT(fields[1]=="a");
   1615     REGEX_ASSERT(fields[2]=="Now is ");
   1616     REGEX_ASSERT(fields[3]=="b");
   1617     REGEX_ASSERT(fields[4]=="the time");
   1618     REGEX_ASSERT(fields[5]=="foo");
   1619 
   1620     status = U_ZERO_ERROR;
   1621     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1622     REGEX_CHECK_STATUS;
   1623     REGEX_ASSERT(n==4);
   1624     REGEX_ASSERT(fields[0]=="  ");
   1625     REGEX_ASSERT(fields[1]=="a");
   1626     REGEX_ASSERT(fields[2]=="Now is ");
   1627     REGEX_ASSERT(fields[3]=="the time<c>");
   1628     status = U_ZERO_ERROR;
   1629     delete pat1;
   1630 
   1631     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1632     REGEX_CHECK_STATUS;
   1633     n = pat1->split("1-10,20", fields, 10, status);
   1634     REGEX_CHECK_STATUS;
   1635     REGEX_ASSERT(n==5);
   1636     REGEX_ASSERT(fields[0]=="1");
   1637     REGEX_ASSERT(fields[1]=="-");
   1638     REGEX_ASSERT(fields[2]=="10");
   1639     REGEX_ASSERT(fields[3]==",");
   1640     REGEX_ASSERT(fields[4]=="20");
   1641     delete pat1;
   1642 
   1643 
   1644     //
   1645     // RegexPattern::pattern()
   1646     //
   1647     pat1 = new RegexPattern();
   1648     REGEX_ASSERT(pat1->pattern() == "");
   1649     delete pat1;
   1650 
   1651     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1652     REGEX_CHECK_STATUS;
   1653     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1654     delete pat1;
   1655 
   1656 
   1657     //
   1658     // classID functions
   1659     //
   1660     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1661     REGEX_CHECK_STATUS;
   1662     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1663     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1664     UnicodeString Hello("Hello, world.");
   1665     RegexMatcher *m = pat1->matcher(Hello, status);
   1666     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1667     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1668     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1669     delete m;
   1670     delete pat1;
   1671 
   1672 }
   1673 
   1674 //---------------------------------------------------------------------------
   1675 //
   1676 //      API_Match_UTF8   Test that the alternate (UText-based) engine for class
   1677 //                       RegexMatcher is present and working, but excluding
   1678 //                       functions implementing replace operations.
   1679 //                       Input is supplied as UText, mostly opened over UTF-8.
   1680 //---------------------------------------------------------------------------
   1681 void RegexTest::API_Match_UTF8() {
   1682     UParseError         pe;
   1683     UErrorCode          status=U_ZERO_ERROR;
   1684     int32_t             flags = 0;   // passed as the flags argument to RegexPattern::compile()
   1685 
   1686     //
   1687     // Debug - slide failing test cases early: put a case in the block below and
   1688     //         change "#if 0" to "#if 1"; the return then skips everything after it.
   1689 #if 0
   1690     {
   1691     }
   1692     return;
   1693 #endif
   1694 
   1695     //
   1696     // Simple pattern compilation: the pattern "abc", compiled from a UText.
   1697     //
   1698     {
   1699         UText               re = UTEXT_INITIALIZER;
   1700         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1701         RegexPattern        *pat2;
   1702         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1703         REGEX_CHECK_STATUS;
   1704 
   1705         UText input1 = UTEXT_INITIALIZER;
   1706         UText input2 = UTEXT_INITIALIZER;
   1707         UText empty  = UTEXT_INITIALIZER;
   1708         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1709         REGEX_VERBOSE_TEXT(&input1);
   1710         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1711         REGEX_VERBOSE_TEXT(&input2);
   1712         utext_openUChars(&empty, NULL, 0, &status);   // zero-length text
   1713 
   1714         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1715         int32_t input2Len = strlen("not abc");
   1716 
   1717 
   1718         //
   1719         // Matcher creation and reset.
   1720         //   The expected input text is spelled out as explicit byte values.
   1721         RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
   1722         REGEX_CHECK_STATUS;
   1723         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1724         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1725         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1726         m1->reset(&input2);
   1727         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1728         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1729         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1730         m1->reset(&input1);
   1731         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1732         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1733         m1->reset(&empty);
   1734         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1735         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1736 
   1737         //
   1738         //  reset(pos, status): positions 0 .. input1Len must be accepted,
   1739         //                      -1 and input1Len+1 must set U_INDEX_OUTOFBOUNDS_ERROR.
   1740         m1->reset(&input1);
   1741         m1->reset(4, status);
   1742         REGEX_CHECK_STATUS;
   1743         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1744         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1745 
   1746         m1->reset(-1, status);             // negative position: expected to fail
   1747         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1748         status = U_ZERO_ERROR;
   1749 
   1750         m1->reset(0, status);
   1751         REGEX_CHECK_STATUS;
   1752         status = U_ZERO_ERROR;
   1753 
   1754         m1->reset(input1Len-1, status);
   1755         REGEX_CHECK_STATUS;
   1756         status = U_ZERO_ERROR;
   1757 
   1758         m1->reset(input1Len, status);      // position == length: expected to succeed
   1759         REGEX_CHECK_STATUS;
   1760         status = U_ZERO_ERROR;
   1761 
   1762         m1->reset(input1Len+1, status);    // one past the length: expected to fail
   1763         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1764         status = U_ZERO_ERROR;
   1765 
   1766         //
   1767         // matches(pos, status)
   1768         //
   1769         m1->reset(&input2);  // "not abc"
   1770         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1771         m1->reset();
   1772         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1773         m1->reset();
   1774         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1775         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1776         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1777         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1778 
   1779         // Match() at end of string should fail, but should not
   1780         //  be an error.
   1781         status = U_ZERO_ERROR;
   1782         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1783         REGEX_CHECK_STATUS;
   1784 
   1785         // Match beyond end of string should fail with an error.
   1786         status = U_ZERO_ERROR;
   1787         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1788         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1789 
   1790         // Successful match at end of string.
   1791         {
   1792             status = U_ZERO_ERROR;
   1793             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1794             REGEX_CHECK_STATUS;
   1795             m.reset(&input1);
   1796             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1797             REGEX_CHECK_STATUS;
   1798             m.reset(&empty);
   1799             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1800             REGEX_CHECK_STATUS;
   1801         }
   1802 
   1803 
   1804         //
   1805         // lookingAt(pos, status): same position checks as for matches(pos, status) above.
   1806         //
   1807         status = U_ZERO_ERROR;
   1808         m1->reset(&input2);  // "not abc"
   1809         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1810         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1811         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1812         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1813         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1815         status = U_ZERO_ERROR;
   1816         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1817         REGEX_CHECK_STATUS;
   1818         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1819         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1820 
   1821         delete m1;
   1822         delete pat2;
   1823 
   1824         utext_close(&re);
   1825         utext_close(&input1);
   1826         utext_close(&input2);
   1827         utext_close(&empty);
   1828     }
   1829 
   1830 
   1831     //
   1832     // Capture Group.
   1833     //     RegexMatcher::start();
   1834     //     RegexMatcher::end();
   1835     //     RegexMatcher::group();
   1836     //   (groupCount() is checked in the find section further down.)
   1837     {
   1838         int32_t             flags=0;               // these three shadow the function-level variables
   1839         UParseError         pe;
   1840         UErrorCode          status=U_ZERO_ERROR;
   1841         UText               re=UTEXT_INITIALIZER;
   1842         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   1843         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   1844 
   1845         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1846         REGEX_CHECK_STATUS;
   1847 
   1848         UText input = UTEXT_INITIALIZER;
   1849         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1850         utext_openUTF8(&input, str_0123456789, -1, &status);
   1851 
   1852         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   1853         REGEX_CHECK_STATUS;
   1854         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   1855         static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start(i) for groups 0..3
   1856         static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end(i) for groups 0..3
   1857         int32_t i;
   1858         for (i=0; i<4; i++) {
   1859             int32_t actualStart = matcher->start(i, status);
   1860             REGEX_CHECK_STATUS;
   1861             if (actualStart != matchStarts[i]) {
   1862                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   1863                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   1864             }
   1865             int32_t actualEnd = matcher->end(i, status);
   1866             REGEX_CHECK_STATUS;
   1867             if (actualEnd != matchEnds[i]) {
   1868                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   1869                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   1870             }
   1871         }
   1872 
   1873         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   1874         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   1875 
   1876         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1877         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1878         matcher->reset();   // after reset(), start() is expected to report an invalid state
   1879         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   1880 
   1881         matcher->lookingAt(status);   // match again so the group() calls below have a match to read
   1882 
   1883         UnicodeString dest;
   1884         UText destText = UTEXT_INITIALIZER;
   1885         utext_openUnicodeString(&destText, &dest, &status);
   1886         UText *result;
   1887         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1888         //  Test shallow-clone API
   1889         int64_t   group_len;
   1890         result = matcher->group((UText *)NULL, group_len, status);
   1891         REGEX_CHECK_STATUS;
   1892         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   1893         utext_close(result);
   1894         result = matcher->group(0, &destText, group_len, status);
   1895         REGEX_CHECK_STATUS;
   1896         REGEX_ASSERT(result == &destText);
   1897         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   1898         //  destText is now immutable, reopen it
   1899         utext_close(&destText);
   1900         utext_openUnicodeString(&destText, &dest, &status);
   1901 
   1902         result = matcher->group(0, NULL, status);          // NULL destination: the returned UText is closed below
   1903         REGEX_CHECK_STATUS;
   1904         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   1905         utext_close(result);
   1906         result = matcher->group(0, &destText, status);     // caller-supplied destination must be returned
   1907         REGEX_CHECK_STATUS;
   1908         REGEX_ASSERT(result == &destText);
   1909         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   1910 
   1911         result = matcher->group(1, NULL, status);
   1912         REGEX_CHECK_STATUS;
   1913         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
   1914         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   1915         utext_close(result);
   1916         result = matcher->group(1, &destText, status);
   1917         REGEX_CHECK_STATUS;
   1918         REGEX_ASSERT(result == &destText);
   1919         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   1920 
   1921         result = matcher->group(2, NULL, status);
   1922         REGEX_CHECK_STATUS;
   1923         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
   1924         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   1925         utext_close(result);
   1926         result = matcher->group(2, &destText, status);
   1927         REGEX_CHECK_STATUS;
   1928         REGEX_ASSERT(result == &destText);
   1929         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   1930 
   1931         result = matcher->group(3, NULL, status);
   1932         REGEX_CHECK_STATUS;
   1933         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
   1934         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   1935         utext_close(result);
   1936         result = matcher->group(3, &destText, status);
   1937         REGEX_CHECK_STATUS;
   1938         REGEX_ASSERT(result == &destText);
   1939         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   1940 
   1941         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1942         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1943         matcher->reset();   // after reset(), group() is expected to report an invalid state
   1944         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   1945 
   1946         delete matcher;
   1947         delete pat;
   1948 
   1949         utext_close(&destText);
   1950         utext_close(&input);
   1951         utext_close(&re);
   1952     }
   1953 
   1954     //
   1955     //  find: successive find() calls, then find(pos, status) from explicit start positions.
   1956     //
   1957     {
   1958         int32_t             flags=0;
   1959         UParseError         pe;
   1960         UErrorCode          status=U_ZERO_ERROR;
   1961         UText               re=UTEXT_INITIALIZER;
   1962         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   1963         utext_openUTF8(&re, str_abc, -1, &status);
   1964 
   1965         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1966         REGEX_CHECK_STATUS;
   1967         UText input = UTEXT_INITIALIZER;
   1968         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   1969         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   1970         //                      012345678901234567
   1971 
   1972         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   1973         REGEX_CHECK_STATUS;
   1974         REGEX_ASSERT(matcher->find());
   1975         REGEX_ASSERT(matcher->start(status) == 1);
   1976         REGEX_ASSERT(matcher->find());
   1977         REGEX_ASSERT(matcher->start(status) == 6);
   1978         REGEX_ASSERT(matcher->find());
   1979         REGEX_ASSERT(matcher->start(status) == 12);
   1980         REGEX_ASSERT(matcher->find() == FALSE);
   1981         REGEX_ASSERT(matcher->find() == FALSE);   // stays FALSE once the input is exhausted
   1982 
   1983         matcher->reset();
   1984         REGEX_ASSERT(matcher->find());
   1985         REGEX_ASSERT(matcher->start(status) == 1);
   1986 
   1987         REGEX_ASSERT(matcher->find(0, status));
   1988         REGEX_ASSERT(matcher->start(status) == 1);
   1989         REGEX_ASSERT(matcher->find(1, status));
   1990         REGEX_ASSERT(matcher->start(status) == 1);
   1991         REGEX_ASSERT(matcher->find(2, status));
   1992         REGEX_ASSERT(matcher->start(status) == 6);
   1993         REGEX_ASSERT(matcher->find(12, status));
   1994         REGEX_ASSERT(matcher->start(status) == 12);
   1995         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   1996         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   1997         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   1998         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1999 
   2000         status = U_ZERO_ERROR;
   2001         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2002         status = U_ZERO_ERROR;
   2003         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2004 
   2005         REGEX_ASSERT(matcher->groupCount() == 0);
   2006 
   2007         delete matcher;
   2008         delete pat;
   2009 
   2010         utext_close(&input);
   2011         utext_close(&re);
   2012     }
   2013 
   2014 
   2015     //
   2016     //  find, with \G in pattern (true if at the end of a previous match).
   2017     //
   2018     {
   2019         int32_t             flags=0;
   2020         UParseError         pe;
   2021         UErrorCode          status=U_ZERO_ERROR;
   2022         UText               re=UTEXT_INITIALIZER;
   2023         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) */
   2024         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2025 
   2026         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2027 
   2028         REGEX_CHECK_STATUS;
   2029         UText input = UTEXT_INITIALIZER;
   2030         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2031         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2032         //                      012345678901234567
   2033 
   2034         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   2035         REGEX_CHECK_STATUS;
   2036         REGEX_ASSERT(matcher->find());
   2037         REGEX_ASSERT(matcher->start(status) == 0);
   2038         REGEX_ASSERT(matcher->start(1, status) == -1);   // -1: group 1 is not part of this match
   2039         REGEX_ASSERT(matcher->start(2, status) == 1);
   2040 
   2041         REGEX_ASSERT(matcher->find());
   2042         REGEX_ASSERT(matcher->start(status) == 4);
   2043         REGEX_ASSERT(matcher->start(1, status) == 4);
   2044         REGEX_ASSERT(matcher->start(2, status) == -1);   // -1: group 2 is not part of this match
   2045         REGEX_CHECK_STATUS;
   2046 
   2047         delete matcher;
   2048         delete pat;
   2049 
   2050         utext_close(&input);
   2051         utext_close(&re);
   2052     }
   2053 
   2054     //
   2055     //   find with zero length matches, match position should bump ahead
   2056     //     to prevent loops.
   2057     //
   2058     {
   2059         int32_t                 i;
   2060         UErrorCode          status=U_ZERO_ERROR;
   2061         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   2062                                                       //   using an always-true look-ahead.
   2063         REGEX_CHECK_STATUS;
   2064         UText s = UTEXT_INITIALIZER;
   2065         utext_openUTF8(&s, "    ", -1, &status);
   2066         m.reset(&s);
   2067         for (i=0; ; i++) {
   2068             if (m.find() == FALSE) {
   2069                 break;
   2070             }
   2071             REGEX_ASSERT(m.start(status) == i);
   2072             REGEX_ASSERT(m.end(status) == i);
   2073         }
   2074         REGEX_ASSERT(i==5);    // one empty match at each of positions 0..4
   2075 
   2076         // Check that the bump goes over characters outside the BMP OK
   2077         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2078         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2079         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);   // re-opens s over the new bytes
   2080         m.reset(&s);
   2081         for (i=0; ; i+=4) {    // step of 4: each expected match position is 4 bytes further on
   2082             if (m.find() == FALSE) {
   2083                 break;
   2084             }
   2085             REGEX_ASSERT(m.start(status) == i);
   2086             REGEX_ASSERT(m.end(status) == i);
   2087         }
   2088         REGEX_ASSERT(i==20);   // five matches, at 0, 4, 8, 12 and 16
   2089 
   2090         utext_close(&s);
   2091     }
   2092     {
   2093         // find() loop breaking test.
   2094         //        with pattern of /.?/, should see a series of one char matches, then a single
   2095         //        match of zero length at the end of the input string.
   2096         int32_t                 i;
   2097         UErrorCode          status=U_ZERO_ERROR;
   2098         RegexMatcher        m(".?", 0, status);
   2099         REGEX_CHECK_STATUS;
   2100         UText s = UTEXT_INITIALIZER;
   2101         utext_openUTF8(&s, "    ", -1, &status);
   2102         m.reset(&s);
   2103         for (i=0; ; i++) {
   2104             if (m.find() == FALSE) {
   2105                 break;
   2106             }
   2107             REGEX_ASSERT(m.start(status) == i);
   2108             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // the final match (i==4) is empty
   2109         }
   2110         REGEX_ASSERT(i==5);
   2111 
   2112         utext_close(&s);
   2113     }
   2114 
   2115 
   2116     //
   2117     // Matchers with no input string behave as if they had an empty input string.
   2118     //
   2119 
   2120     {
   2121         UErrorCode status = U_ZERO_ERROR;
   2122         RegexMatcher  m(".?", 0, status);
   2123         REGEX_CHECK_STATUS;
   2124         REGEX_ASSERT(m.find());
   2125         REGEX_ASSERT(m.start(status) == 0);
   2126         REGEX_ASSERT(m.input() == "");
   2127     }
   2128     {
   2129         UErrorCode status = U_ZERO_ERROR;
   2130         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2131         RegexMatcher  *m = p->matcher(status);
   2132         REGEX_CHECK_STATUS;
   2133 
   2134         REGEX_ASSERT(m->find() == FALSE);
   2135         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2136         delete m;
   2137         delete p;
   2138     }
   2139 
   2140     //
   2141     // Regions, plus the anchoring-bounds and transparent-bounds settings.
   2142     //
   2143     {
   2144         UErrorCode status = U_ZERO_ERROR;
   2145         UText testPattern = UTEXT_INITIALIZER;
   2146         UText testText    = UTEXT_INITIALIZER;
   2147         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2148         REGEX_VERBOSE_TEXT(&testPattern);
   2149         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2150         REGEX_VERBOSE_TEXT(&testText);
   2151 
   2152         RegexMatcher m(&testPattern, &testText, 0, status);
   2153         REGEX_CHECK_STATUS;
   2154         REGEX_ASSERT(m.regionStart() == 0);
   2155         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2156         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2157         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2158 
   2159         m.region(2,4, status);   // restrict the matcher to the region 2..4
   2160         REGEX_CHECK_STATUS;
   2161         REGEX_ASSERT(m.matches(status));
   2162         REGEX_ASSERT(m.start(status)==2);
   2163         REGEX_ASSERT(m.end(status)==4);
   2164         REGEX_CHECK_STATUS;
   2165 
   2166         m.reset();               // reset() is expected to restore the region to the whole input
   2167         REGEX_ASSERT(m.regionStart() == 0);
   2168         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2169 
   2170         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2171         REGEX_VERBOSE_TEXT(&testText);
   2172         m.reset(&testText);
   2173         REGEX_ASSERT(m.regionStart() == 0);
   2174         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2175 
   2176         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2177         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));   // the setter must return the matcher itself
   2178         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2179         REGEX_ASSERT(&m == &m.reset());
   2180         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);      // the setting must survive reset()
   2181 
   2182         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2183         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2184         REGEX_ASSERT(&m == &m.reset());
   2185         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2186 
   2187         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2188         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2189         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2190         REGEX_ASSERT(&m == &m.reset());
   2191         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2192 
   2193         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2194         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2195         REGEX_ASSERT(&m == &m.reset());
   2196         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2197 
   2198         utext_close(&testText);
   2199         utext_close(&testPattern);
   2200     }
   2201 
   2202     //
   2203     // hitEnd() and requireEnd(), for three patterns matched against the text "aabb".
   2204     //
   2205     {
   2206         UErrorCode status = U_ZERO_ERROR;
   2207         UText testPattern = UTEXT_INITIALIZER;
   2208         UText testText    = UTEXT_INITIALIZER;
   2209         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2210         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2211         utext_openUTF8(&testPattern, str_, -1, &status);
   2212         utext_openUTF8(&testText, str_aabb, -1, &status);
   2213 
   2214         RegexMatcher m1(&testPattern, &testText,  0, status);
   2215         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2216         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2217         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2218         REGEX_CHECK_STATUS;
   2219 
   2220         status = U_ZERO_ERROR;
   2221         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2222         utext_openUTF8(&testPattern, str_a, -1, &status);   // re-open testPattern over the next pattern
   2223         RegexMatcher m2(&testPattern, &testText, 0, status);
   2224         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2225         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2226         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2227         REGEX_CHECK_STATUS;
   2228 
   2229         status = U_ZERO_ERROR;
   2230         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2231         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2232         RegexMatcher m3(&testPattern, &testText, 0, status);
   2233         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2234         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2235         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2236         REGEX_CHECK_STATUS;
   2237 
   2238         utext_close(&testText);
   2239         utext_close(&testPattern);
   2240     }
   2241 }
   2242 
   2243 
   2244 //---------------------------------------------------------------------------
   2245 //
   2246 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2247 //                         Replace family of functions.
   2248 //
        //      Drives replaceFirst(), replaceAll(), appendReplacement() and
        //      appendTail() through their UText overloads, using UTF-8 UTexts
        //      for the pattern, the input and the replacement text.
        //
        //      Most scenarios are run twice:
        //        - with a NULL destination; the UText that comes back is
        //          compared with the expected bytes and then utext_close()d;
        //        - with the caller-supplied destText (a UText over the
        //          UnicodeString 'dest'); the test asserts that the returned
        //          pointer is &destText before comparing its contents.
        //
        //      Test strings are written as explicit byte values, with the
        //      readable text in a trailing comment.
        //      NOTE(review): presumably this keeps the bytes independent of the
        //      compiler's execution character set - confirm.
        //
        //      Takes no parameters and returns nothing; results are reported
        //      through the REGEX_* macros, which are defined outside this part
        //      of the file.
        //      NOTE(review): if REGEX_CHECK_STATUS returns early on failure,
        //      the deletes and utext_close() calls at the end are skipped -
        //      confirm against the macro definition.
        //
   2249 //---------------------------------------------------------------------------
   2250 void RegexTest::API_Replace_UTF8() {
   2251     //
   2252     //  Replace
   2253     //
   2254     int32_t             flags=0;
   2255     UParseError         pe;
   2256     UErrorCode          status=U_ZERO_ERROR;
   2257 
   2258     UText               re=UTEXT_INITIALIZER;
   2259     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2260     REGEX_VERBOSE_TEXT(&re);
   2261     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2262     REGEX_CHECK_STATUS;
   2263 
   2264     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2265     //             012345678901234567
   2266     UText dataText = UTEXT_INITIALIZER;
   2267     utext_openUTF8(&dataText, data, -1, &status);
   2268     REGEX_CHECK_STATUS;
   2269     REGEX_VERBOSE_TEXT(&dataText);
   2270     RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
            // NOTE(review): status is not checked here before 'matcher' is first
            // dereferenced below, unlike the matcher2 creation later in this
            // function - confirm whether a REGEX_CHECK_STATUS is wanted.
   2271 
   2272     //
   2273     //  Plain vanilla matches.
   2274     //
   2275     UnicodeString  dest;
   2276     UText destText = UTEXT_INITIALIZER;
   2277     utext_openUnicodeString(&destText, &dest, &status);
   2278     UText *result;
   2279 
   2280     UText replText = UTEXT_INITIALIZER;
   2281 
   2282     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2283     utext_openUTF8(&replText, str_yz, -1, &status);
   2284     REGEX_VERBOSE_TEXT(&replText);
            // NULL destination: the returned UText is closed by the test once checked.
   2285     result = matcher->replaceFirst(&replText, NULL, status);
   2286     REGEX_CHECK_STATUS;
   2287     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2288     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2289     utext_close(result);
            // Caller-supplied destination: the same UText must be handed back.
   2290     result = matcher->replaceFirst(&replText, &destText, status);
   2291     REGEX_CHECK_STATUS;
   2292     REGEX_ASSERT(result == &destText);
   2293     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2294 
   2295     result = matcher->replaceAll(&replText, NULL, status);
   2296     REGEX_CHECK_STATUS;
   2297     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2298     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2299     utext_close(result);
   2300 
            // Delete the whole current content of destText before reusing it.
            // NOTE(review): presumably needed because replaceAll() adds to an
            // existing destination instead of overwriting it - confirm.
   2301     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2302     result = matcher->replaceAll(&replText, &destText, status);
   2303     REGEX_CHECK_STATUS;
   2304     REGEX_ASSERT(result == &destText);
   2305     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2306 
   2307     //
   2308     //  Plain vanilla non-matches.
   2309     //
            //  The input contains no "abc", so every result must equal the input.
   2310     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2311     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2312     matcher->reset(&dataText);
   2313 
   2314     result = matcher->replaceFirst(&replText, NULL, status);
   2315     REGEX_CHECK_STATUS;
   2316     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2317     utext_close(result);
   2318     result = matcher->replaceFirst(&replText, &destText, status);
   2319     REGEX_CHECK_STATUS;
   2320     REGEX_ASSERT(result == &destText);
   2321     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2322 
   2323     result = matcher->replaceAll(&replText, NULL, status);
   2324     REGEX_CHECK_STATUS;
   2325     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2326     utext_close(result);
   2327     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2328     result = matcher->replaceAll(&replText, &destText, status);
   2329     REGEX_CHECK_STATUS;
   2330     REGEX_ASSERT(result == &destText);
   2331     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2332 
   2333     //
   2334     // Empty source string
   2335     //
            // A NULL pointer with length 0 is used as the empty input.
   2336     utext_openUTF8(&dataText, NULL, 0, &status);
   2337     matcher->reset(&dataText);
   2338 
   2339     result = matcher->replaceFirst(&replText, NULL, status);
   2340     REGEX_CHECK_STATUS;
   2341     REGEX_ASSERT_UTEXT_UTF8("", result);
   2342     utext_close(result);
   2343     result = matcher->replaceFirst(&replText, &destText, status);
   2344     REGEX_CHECK_STATUS;
   2345     REGEX_ASSERT(result == &destText);
   2346     REGEX_ASSERT_UTEXT_UTF8("", result);
   2347 
   2348     result = matcher->replaceAll(&replText, NULL, status);
   2349     REGEX_CHECK_STATUS;
   2350     REGEX_ASSERT_UTEXT_UTF8("", result);
   2351     utext_close(result);
   2352     result = matcher->replaceAll(&replText, &destText, status);
   2353     REGEX_CHECK_STATUS;
   2354     REGEX_ASSERT(result == &destText);
   2355     REGEX_ASSERT_UTEXT_UTF8("", result);
   2356 
   2357     //
   2358     // Empty substitution string
   2359     //
   2360     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2361     matcher->reset(&dataText);
   2362 
            // Empty replacement: each replaced "abc" simply disappears from the output.
   2363     utext_openUTF8(&replText, NULL, 0, &status);
   2364     result = matcher->replaceFirst(&replText, NULL, status);
   2365     REGEX_CHECK_STATUS;
   2366     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2367     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2368     utext_close(result);
   2369     result = matcher->replaceFirst(&replText, &destText, status);
   2370     REGEX_CHECK_STATUS;
   2371     REGEX_ASSERT(result == &destText);
   2372     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2373 
   2374     result = matcher->replaceAll(&replText, NULL, status);
   2375     REGEX_CHECK_STATUS;
   2376     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2377     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2378     utext_close(result);
   2379     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2380     result = matcher->replaceAll(&replText, &destText, status);
   2381     REGEX_CHECK_STATUS;
   2382     REGEX_ASSERT(result == &destText);
   2383     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2384 
   2385     //
   2386     // match whole string
   2387     //
   2388     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2389     utext_openUTF8(&dataText, str_abc, -1, &status);
   2390     matcher->reset(&dataText);
   2391 
   2392     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2393     utext_openUTF8(&replText, str_xyz, -1, &status);
   2394     result = matcher->replaceFirst(&replText, NULL, status);
   2395     REGEX_CHECK_STATUS;
   2396     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2397     utext_close(result);
   2398     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2399     result = matcher->replaceFirst(&replText, &destText, status);
   2400     REGEX_CHECK_STATUS;
   2401     REGEX_ASSERT(result == &destText);
   2402     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2403 
   2404     result = matcher->replaceAll(&replText, NULL, status);
   2405     REGEX_CHECK_STATUS;
   2406     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2407     utext_close(result);
   2408     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2409     result = matcher->replaceAll(&replText, &destText, status);
   2410     REGEX_CHECK_STATUS;
   2411     REGEX_ASSERT(result == &destText);
   2412     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2413 
   2414     //
   2415     // Capture Group, simple case
   2416     //
            // A second pattern/matcher pair is used from here on: "a(..)" applied
            // to "abcdefg", so the match is "abc" and group 1 is "bc".
   2417     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2418     utext_openUTF8(&re, str_add, -1, &status);
   2419     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2420     REGEX_CHECK_STATUS;
   2421 
   2422     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2423     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2424     RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
   2425     REGEX_CHECK_STATUS;
   2426 
   2427     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2428     utext_openUTF8(&replText, str_11, -1, &status);
   2429     result = matcher2->replaceFirst(&replText, NULL, status);
   2430     REGEX_CHECK_STATUS;
   2431     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2432     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2433     utext_close(result);
   2434     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2435     result = matcher2->replaceFirst(&replText, &destText, status);
   2436     REGEX_CHECK_STATUS;
   2437     REGEX_ASSERT(result == &destText);
   2438     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2439 
            // A backslash-escaped "\$1" must come out as the literal text "$1",
            // while the unescaped "$1" is replaced by group 1 ("bc").
   2440     regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &status);
   2441     result = matcher2->replaceFirst(&replText, NULL, status);
   2442     REGEX_CHECK_STATUS;
   2443     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2444     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2445     utext_close(result);
   2446     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2447     result = matcher2->replaceFirst(&replText, &destText, status);
   2448     REGEX_CHECK_STATUS;
   2449     REGEX_ASSERT(result == &destText);
   2450     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2451 
            // '$' characters that are not followed by a group number are expected
            // to be copied to the output unchanged.
   2452     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
   2453     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2454     result = matcher2->replaceFirst(&replText, NULL, status);
   2455     REGEX_CHECK_STATUS;
   2456     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2457     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2458     utext_close(result);
   2459     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2460     result = matcher2->replaceFirst(&replText, &destText, status);
   2461     REGEX_CHECK_STATUS;
   2462     REGEX_ASSERT(result == &destText);
   2463     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2464 
            // The "xxxx" placeholder (byte indexes 22..25) is overwritten below
            // with the four bytes F0 9D 9F 8F, so the replacement text holds '$'
            // followed by that supplementary digit; the expected output shows it
            // being treated as a reference to group 1.
   2465     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2466     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2467     //                                 012345678901234567890123456
   2468     supplDigitChars[22] = 0xF0;
   2469     supplDigitChars[23] = 0x9D;
   2470     supplDigitChars[24] = 0x9F;
   2471     supplDigitChars[25] = 0x8F;
   2472     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2473 
   2474     result = matcher2->replaceFirst(&replText, NULL, status);
   2475     REGEX_CHECK_STATUS;
   2476     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2477     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2478     utext_close(result);
   2479     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2480     result = matcher2->replaceFirst(&replText, &destText, status);
   2481     REGEX_CHECK_STATUS;
   2482     REGEX_ASSERT(result == &destText);
   2483     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
            // "$5" names a group that the one-group pattern "a(..)" does not have;
            // both calls are expected to report U_INDEX_OUTOFBOUNDS_ERROR.
            // NOTE(review): REGEX_ASSERT_FAIL is defined outside this chunk;
            // presumably it also resets 'status' so later checks start clean - confirm.
   2484     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2485     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2486     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2487 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2488     utext_close(result);
   2489     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2490     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2491     REGEX_ASSERT(result == &destText);
   2492 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2493 
   2494     //
   2495     // Replacement String with \u hex escapes
   2496     //
            // These two blocks go back to the first matcher (pattern "abc").
   2497     {
   2498       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2499       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2500         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2501         utext_openUTF8(&replText, str_u0043, -1, &status);
   2502         matcher->reset(&dataText);
   2503 
   2504         result = matcher->replaceAll(&replText, NULL, status);
   2505         REGEX_CHECK_STATUS;
   2506         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2507         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2508         utext_close(result);
   2509         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2510         result = matcher->replaceAll(&replText, &destText, status);
   2511         REGEX_CHECK_STATUS;
   2512         REGEX_ASSERT(result == &destText);
   2513         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2514     }
   2515     {
              // Note: this str_abc ("abc !") shadows the function-scope str_abc ("abc").
   2516       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2517         utext_openUTF8(&dataText, str_abc, -1, &status);
   2518         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2519         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2520         matcher->reset(&dataText);
   2521 
                // The "xxxx" placeholder (byte indexes 2..5) is overwritten below
                // with F0 90 80 80, the bytes expected for the \U00010000 escape.
   2522         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2523         //                          0123456789
   2524         expected[2] = 0xF0;
   2525         expected[3] = 0x90;
   2526         expected[4] = 0x80;
   2527         expected[5] = 0x80;
   2528 
   2529         result = matcher->replaceAll(&replText, NULL, status);
   2530         REGEX_CHECK_STATUS;
   2531         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2532         utext_close(result);
   2533         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2534         result = matcher->replaceAll(&replText, &destText, status);
   2535         REGEX_CHECK_STATUS;
   2536         REGEX_ASSERT(result == &destText);
   2537         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2538     }
   2539     // TODO:  need more thorough testing of capture substitutions.
   2540 
   2541     // Bug 4057
   2542     //
            // appendReplacement() is expected to copy input starting from the
            // beginning of the text (or from the previous append), no matter how
            // many find() calls came before it or where matching was started.
   2543     {
   2544         status = U_ZERO_ERROR;
   2545 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2546 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2547 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2548         utext_openUTF8(&re, str_ssee, -1, &status);
   2549         utext_openUTF8(&dataText, str_blah, -1, &status);
   2550         utext_openUTF8(&replText, str_ooh, -1, &status);
   2551 
   2552         RegexMatcher m(&re, 0, status);
   2553         REGEX_CHECK_STATUS;
   2554 
                // Note: this UnicodeString 'result' shadows the outer 'UText *result'.
   2555         UnicodeString result;
   2556         UText resultText = UTEXT_INITIALIZER;
   2557         utext_openUnicodeString(&resultText, &result, &status);
   2558 
   2559         // Multiple finds do NOT bump up the previous appendReplacement position.
   2560         m.reset(&dataText);
   2561         m.find();
   2562         m.find();
   2563         m.appendReplacement(&resultText, &replText, status);
   2564         REGEX_CHECK_STATUS;
   2565         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2566         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2567 
   2568         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2569         status = U_ZERO_ERROR;
                // Empty the backing string and re-open the UText over it before reuse.
   2570         result.truncate(0);
   2571         utext_openUnicodeString(&resultText, &result, &status);
   2572         m.reset(10, status);
   2573         m.find();
   2574         m.find();
   2575         m.appendReplacement(&resultText, &replText, status);
   2576         REGEX_CHECK_STATUS;
   2577         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2578         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2579 
   2580         // find() at interior of string, appendReplacement still starts at beginning.
   2581         status = U_ZERO_ERROR;
   2582         result.truncate(0);
   2583         utext_openUnicodeString(&resultText, &result, &status);
   2584         m.reset();
   2585         m.find(10, status);
   2586         m.find();
   2587         m.appendReplacement(&resultText, &replText, status);
   2588         REGEX_CHECK_STATUS;
   2589         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2590         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2591 
                // appendTail() adds the input that follows the last match (" fin").
                // NOTE(review): status is not checked after this call.
   2592         m.appendTail(&resultText, status);
   2593         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2594         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2595 
   2596         utext_close(&resultText);
   2597     }
   2598 
            // Release the matchers and patterns created above, then close the
            // function-scope UTexts.
   2599     delete matcher2;
   2600     delete pat2;
   2601     delete matcher;
   2602     delete pat;
   2603 
   2604     utext_close(&dataText);
   2605     utext_close(&replText);
   2606     utext_close(&destText);
   2607     utext_close(&re);
   2608 }
   2609 
   2610 
   2611 //---------------------------------------------------------------------------
   2612 //
   2613 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2614 //                        present and nominally working.
   2615 //
   2616 //---------------------------------------------------------------------------
   2617 void RegexTest::API_Pattern_UTF8() {
   2618     RegexPattern        pata;    // Test default constructor to not crash.
   2619     RegexPattern        patb;
   2620 
   2621     REGEX_ASSERT(pata == patb);
   2622     REGEX_ASSERT(pata == pata);
   2623 
   2624     UText         re1 = UTEXT_INITIALIZER;
   2625     UText         re2 = UTEXT_INITIALIZER;
   2626     UErrorCode    status = U_ZERO_ERROR;
   2627     UParseError   pe;
   2628 
   2629     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2630     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2631     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2632     utext_openUTF8(&re2, str_def, -1, &status);
   2633 
   2634     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2635     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2636     REGEX_CHECK_STATUS;
   2637     REGEX_ASSERT(*pat1 == *pat1);
   2638     REGEX_ASSERT(*pat1 != pata);
   2639 
   2640     // Assign
   2641     patb = *pat1;
   2642     REGEX_ASSERT(patb == *pat1);
   2643 
   2644     // Copy Construct
   2645     RegexPattern patc(*pat1);
   2646     REGEX_ASSERT(patc == *pat1);
   2647     REGEX_ASSERT(patb == patc);
   2648     REGEX_ASSERT(pat1 != pat2);
   2649     patb = *pat2;
   2650     REGEX_ASSERT(patb != patc);
   2651     REGEX_ASSERT(patb == *pat2);
   2652 
   2653     // Compile with no flags.
   2654     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2655     REGEX_ASSERT(*pat1a == *pat1);
   2656 
   2657     REGEX_ASSERT(pat1a->flags() == 0);
   2658 
   2659     // Compile with different flags should be not equal
   2660     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2661     REGEX_CHECK_STATUS;
   2662 
   2663     REGEX_ASSERT(*pat1b != *pat1a);
   2664     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2665     REGEX_ASSERT(pat1a->flags() == 0);
   2666     delete pat1b;
   2667 
   2668     // clone
   2669     RegexPattern *pat1c = pat1->clone();
   2670     REGEX_ASSERT(*pat1c == *pat1);
   2671     REGEX_ASSERT(*pat1c != *pat2);
   2672 
   2673     delete pat1c;
   2674     delete pat1a;
   2675     delete pat1;
   2676     delete pat2;
   2677 
   2678     utext_close(&re1);
   2679     utext_close(&re2);
   2680 
   2681 
   2682     //
   2683     //   Verify that a matcher created from a cloned pattern works.
   2684     //     (Jitterbug 3423)
   2685     //
   2686     {
   2687         UErrorCode     status     = U_ZERO_ERROR;
   2688         UText          pattern    = UTEXT_INITIALIZER;
   2689         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2690         utext_openUTF8(&pattern, str_pL, -1, &status);
   2691 
   2692         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2693         RegexPattern  *pClone     = pSource->clone();
   2694         delete         pSource;
   2695         RegexMatcher  *mFromClone = pClone->matcher(status);
   2696         REGEX_CHECK_STATUS;
   2697 
   2698         UText          input      = UTEXT_INITIALIZER;
   2699         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2700         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2701         mFromClone->reset(&input);
   2702         REGEX_ASSERT(mFromClone->find() == TRUE);
   2703         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2704         REGEX_ASSERT(mFromClone->find() == TRUE);
   2705         REGEX_ASSERT(mFromClone->group(status) == "World");
   2706         REGEX_ASSERT(mFromClone->find() == FALSE);
   2707         delete mFromClone;
   2708         delete pClone;
   2709 
   2710         utext_close(&input);
   2711         utext_close(&pattern);
   2712     }
   2713 
   2714     //
   2715     //   matches convenience API
   2716     //
   2717     {
   2718         UErrorCode status  = U_ZERO_ERROR;
   2719         UText      pattern = UTEXT_INITIALIZER;
   2720         UText      input   = UTEXT_INITIALIZER;
   2721 
   2722         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2723         utext_openUTF8(&input, str_randominput, -1, &status);
   2724 
   2725         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2726         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2727         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2728         REGEX_CHECK_STATUS;
   2729 
   2730         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2731         utext_openUTF8(&pattern, str_abc, -1, &status);
   2732         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2733         REGEX_CHECK_STATUS;
   2734 
   2735         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2736         utext_openUTF8(&pattern, str_nput, -1, &status);
   2737         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2738         REGEX_CHECK_STATUS;
   2739 
   2740         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2741         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2742         REGEX_CHECK_STATUS;
   2743 
   2744         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2745         utext_openUTF8(&pattern, str_u, -1, &status);
   2746         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2747         REGEX_CHECK_STATUS;
   2748 
   2749         utext_openUTF8(&input, str_abc, -1, &status);
   2750         utext_openUTF8(&pattern, str_abc, -1, &status);
   2751         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2752         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2753         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2754 
   2755         utext_close(&input);
   2756         utext_close(&pattern);
   2757     }
   2758 
   2759 
   2760     //
   2761     // Split()
   2762     //
   2763     status = U_ZERO_ERROR;
   2764     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2765     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2766     pat1 = RegexPattern::compile(&re1, pe, status);
   2767     REGEX_CHECK_STATUS;
   2768     UnicodeString  fields[10];
   2769 
   2770     int32_t n;
   2771     n = pat1->split("Now is the time", fields, 10, status);
   2772     REGEX_CHECK_STATUS;
   2773     REGEX_ASSERT(n==4);
   2774     REGEX_ASSERT(fields[0]=="Now");
   2775     REGEX_ASSERT(fields[1]=="is");
   2776     REGEX_ASSERT(fields[2]=="the");
   2777     REGEX_ASSERT(fields[3]=="time");
   2778     REGEX_ASSERT(fields[4]=="");
   2779 
   2780     n = pat1->split("Now is the time", fields, 2, status);
   2781     REGEX_CHECK_STATUS;
   2782     REGEX_ASSERT(n==2);
   2783     REGEX_ASSERT(fields[0]=="Now");
   2784     REGEX_ASSERT(fields[1]=="is the time");
   2785     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2786 
   2787     fields[1] = "*";
   2788     status = U_ZERO_ERROR;
   2789     n = pat1->split("Now is the time", fields, 1, status);
   2790     REGEX_CHECK_STATUS;
   2791     REGEX_ASSERT(n==1);
   2792     REGEX_ASSERT(fields[0]=="Now is the time");
   2793     REGEX_ASSERT(fields[1]=="*");
   2794     status = U_ZERO_ERROR;
   2795 
   2796     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2797     REGEX_CHECK_STATUS;
   2798     REGEX_ASSERT(n==5);
   2799     REGEX_ASSERT(fields[0]=="");
   2800     REGEX_ASSERT(fields[1]=="Now");
   2801     REGEX_ASSERT(fields[2]=="is");
   2802     REGEX_ASSERT(fields[3]=="the");
   2803     REGEX_ASSERT(fields[4]=="time");
   2804     REGEX_ASSERT(fields[5]=="");
   2805 
   2806     n = pat1->split("     ", fields, 10, status);
   2807     REGEX_CHECK_STATUS;
   2808     REGEX_ASSERT(n==1);
   2809     REGEX_ASSERT(fields[0]=="");
   2810 
   2811     fields[0] = "foo";
   2812     n = pat1->split("", fields, 10, status);
   2813     REGEX_CHECK_STATUS;
   2814     REGEX_ASSERT(n==0);
   2815     REGEX_ASSERT(fields[0]=="foo");
   2816 
   2817     delete pat1;
   2818 
   2819     //  split, with a pattern with (capture)
   2820     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   2821     pat1 = RegexPattern::compile(&re1,  pe, status);
   2822     REGEX_CHECK_STATUS;
   2823 
   2824     status = U_ZERO_ERROR;
   2825     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2826     REGEX_CHECK_STATUS;
   2827     REGEX_ASSERT(n==6);
   2828     REGEX_ASSERT(fields[0]=="");
   2829     REGEX_ASSERT(fields[1]=="a");
   2830     REGEX_ASSERT(fields[2]=="Now is ");
   2831     REGEX_ASSERT(fields[3]=="b");
   2832     REGEX_ASSERT(fields[4]=="the time");
   2833     REGEX_ASSERT(fields[5]=="c");
   2834     REGEX_ASSERT(fields[6]=="");
   2835     REGEX_ASSERT(status==U_ZERO_ERROR);
   2836 
   2837     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2838     REGEX_CHECK_STATUS;
   2839     REGEX_ASSERT(n==6);
   2840     REGEX_ASSERT(fields[0]=="  ");
   2841     REGEX_ASSERT(fields[1]=="a");
   2842     REGEX_ASSERT(fields[2]=="Now is ");
   2843     REGEX_ASSERT(fields[3]=="b");
   2844     REGEX_ASSERT(fields[4]=="the time");
   2845     REGEX_ASSERT(fields[5]=="c");
   2846     REGEX_ASSERT(fields[6]=="");
   2847 
   2848     status = U_ZERO_ERROR;
   2849     fields[6] = "foo";
   2850     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   2851     REGEX_CHECK_STATUS;
   2852     REGEX_ASSERT(n==6);
   2853     REGEX_ASSERT(fields[0]=="  ");
   2854     REGEX_ASSERT(fields[1]=="a");
   2855     REGEX_ASSERT(fields[2]=="Now is ");
   2856     REGEX_ASSERT(fields[3]=="b");
   2857     REGEX_ASSERT(fields[4]=="the time");
   2858     REGEX_ASSERT(fields[5]=="c");
   2859     REGEX_ASSERT(fields[6]=="foo");
   2860 
   2861     status = U_ZERO_ERROR;
   2862     fields[5] = "foo";
   2863     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   2864     REGEX_CHECK_STATUS;
   2865     REGEX_ASSERT(n==5);
   2866     REGEX_ASSERT(fields[0]=="  ");
   2867     REGEX_ASSERT(fields[1]=="a");
   2868     REGEX_ASSERT(fields[2]=="Now is ");
   2869     REGEX_ASSERT(fields[3]=="b");
   2870     REGEX_ASSERT(fields[4]=="the time<c>");
   2871     REGEX_ASSERT(fields[5]=="foo");
   2872 
   2873     status = U_ZERO_ERROR;
   2874     fields[5] = "foo";
   2875     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   2876     REGEX_CHECK_STATUS;
   2877     REGEX_ASSERT(n==5);
   2878     REGEX_ASSERT(fields[0]=="  ");
   2879     REGEX_ASSERT(fields[1]=="a");
   2880     REGEX_ASSERT(fields[2]=="Now is ");
   2881     REGEX_ASSERT(fields[3]=="b");
   2882     REGEX_ASSERT(fields[4]=="the time");
   2883     REGEX_ASSERT(fields[5]=="foo");
   2884 
   2885     status = U_ZERO_ERROR;
   2886     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   2887     REGEX_CHECK_STATUS;
   2888     REGEX_ASSERT(n==4);
   2889     REGEX_ASSERT(fields[0]=="  ");
   2890     REGEX_ASSERT(fields[1]=="a");
   2891     REGEX_ASSERT(fields[2]=="Now is ");
   2892     REGEX_ASSERT(fields[3]=="the time<c>");
   2893     status = U_ZERO_ERROR;
   2894     delete pat1;
   2895 
   2896     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   2897     pat1 = RegexPattern::compile(&re1, pe, status);
   2898     REGEX_CHECK_STATUS;
   2899     n = pat1->split("1-10,20", fields, 10, status);
   2900     REGEX_CHECK_STATUS;
   2901     REGEX_ASSERT(n==5);
   2902     REGEX_ASSERT(fields[0]=="1");
   2903     REGEX_ASSERT(fields[1]=="-");
   2904     REGEX_ASSERT(fields[2]=="10");
   2905     REGEX_ASSERT(fields[3]==",");
   2906     REGEX_ASSERT(fields[4]=="20");
   2907     delete pat1;
   2908 
   2909 
   2910     //
   2911     // RegexPattern::pattern() and patternText()
   2912     //
   2913     pat1 = new RegexPattern();
   2914     REGEX_ASSERT(pat1->pattern() == "");
   2915     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   2916     delete pat1;
   2917 
   2918     regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);
   2919     pat1 = RegexPattern::compile(&re1, pe, status);
   2920     REGEX_CHECK_STATUS;
   2921     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   2922     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   2923     delete pat1;
   2924 
   2925     utext_close(&re1);
   2926 }
   2927 
   2928 
   2929 //---------------------------------------------------------------------------
   2930 //
   2931 //      Extended       A more thorough check for features of regex patterns
   2932 //                     The test cases are in a separate data file,
   2933 //                       source/test/testdata/regextst.txt
   2934 //                     A description of the test data format is included in that file.
   2935 //
   2936 //---------------------------------------------------------------------------
   2937 
   2938 const char *
   2939 RegexTest::getPath(char buffer[2048], const char *filename) {
   2940     UErrorCode status=U_ZERO_ERROR;
   2941     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2942     if (U_FAILURE(status)) {
   2943         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   2944         return NULL;
   2945     }
   2946 
   2947     strcpy(buffer, testDataDirectory);
   2948     strcat(buffer, filename);
   2949     return buffer;
   2950 }
   2951 
   2952 void RegexTest::Extended() {
   2953     char tdd[2048];
   2954     const char *srcPath;
   2955     UErrorCode  status  = U_ZERO_ERROR;
   2956     int32_t     lineNum = 0;
   2957 
   2958     //
   2959     //  Open and read the test data file.
   2960     //
   2961     srcPath=getPath(tdd, "regextst.txt");
   2962     if(srcPath==NULL) {
   2963         return; /* something went wrong, error already output */
   2964     }
   2965 
   2966     int32_t    len;
   2967     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   2968     if (U_FAILURE(status)) {
   2969         return; /* something went wrong, error already output */
   2970     }
   2971 
   2972     //
   2973     //  Put the test data into a UnicodeString
   2974     //
   2975     UnicodeString testString(FALSE, testData, len);
   2976 
   2977     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   2978     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   2979     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   2980 
   2981     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   2982     UnicodeString   testPattern;   // The pattern for test from the test file.
   2983     UnicodeString   testFlags;     // the flags   for a test.
   2984     UnicodeString   matchString;   // The marked up string to be used as input
   2985 
   2986     if (U_FAILURE(status)){
   2987         dataerrln("Construct RegexMatcher() error.");
   2988         delete [] testData;
   2989         return;
   2990     }
   2991 
   2992     //
   2993     //  Loop over the test data file, once per line.
   2994     //
   2995     while (lineMat.find()) {
   2996         lineNum++;
   2997         if (U_FAILURE(status)) {
   2998           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   2999         }
   3000 
   3001         status = U_ZERO_ERROR;
   3002         UnicodeString testLine = lineMat.group(1, status);
   3003         if (testLine.length() == 0) {
   3004             continue;
   3005         }
   3006 
   3007         //
   3008         // Parse the test line.  Skip blank and comment only lines.
   3009         // Separate out the three main fields - pattern, flags, target.
   3010         //
   3011 
   3012         commentMat.reset(testLine);
   3013         if (commentMat.lookingAt(status)) {
   3014             // This line is a comment, or blank.
   3015             continue;
   3016         }
   3017 
   3018         //
   3019         //  Pull out the pattern field, remove it from the test file line.
   3020         //
   3021         quotedStuffMat.reset(testLine);
   3022         if (quotedStuffMat.lookingAt(status)) {
   3023             testPattern = quotedStuffMat.group(2, status);
   3024             testLine.remove(0, quotedStuffMat.end(0, status));
   3025         } else {
   3026             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3027             continue;
   3028         }
   3029 
   3030 
   3031         //
   3032         //  Pull out the flags from the test file line.
   3033         //
   3034         flagsMat.reset(testLine);
   3035         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3036         testFlags = flagsMat.group(1, status);
   3037         if (flagsMat.group(2, status).length() > 0) {
   3038             errln("Bad Match flag at line %d. Scanning %c\n",
   3039                 lineNum, flagsMat.group(2, status).charAt(0));
   3040             continue;
   3041         }
   3042         testLine.remove(0, flagsMat.end(0, status));
   3043 
   3044         //
   3045         //  Pull out the match string, as a whole.
   3046         //    We'll process the <tags> later.
   3047         //
   3048         quotedStuffMat.reset(testLine);
   3049         if (quotedStuffMat.lookingAt(status)) {
   3050             matchString = quotedStuffMat.group(2, status);
   3051             testLine.remove(0, quotedStuffMat.end(0, status));
   3052         } else {
   3053             errln("Bad match string at test file line %d", lineNum);
   3054             continue;
   3055         }
   3056 
   3057         //
   3058         //  The only thing left from the input line should be an optional trailing comment.
   3059         //
   3060         commentMat.reset(testLine);
   3061         if (commentMat.lookingAt(status) == FALSE) {
   3062             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3063             continue;
   3064         }
   3065 
   3066         //
   3067         //  Run the test
   3068         //
   3069         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3070     }
   3071 
   3072     delete [] testData;
   3073 
   3074 }
   3075 
   3076 
   3077 
   3078 //---------------------------------------------------------------------------
   3079 //
   3080 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
   3081 //
   3082 //         Function to run a single test from the Extended (data driven) tests.
   3083 //         See file test/testdata/regextst.txt for a description of the
   3084 //         pattern and inputString fields, and the allowed flags.
   3085 //         srcPath and lineNumber are the file and source line in regextst.txt of the test.
   3086 //
   3087 //---------------------------------------------------------------------------
   3088 
   3089 
   3090 //  Set a value into a UVector at position specified by a decimal number in
   3091 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3092 //   which follows.
   3093 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3094     UErrorCode  status=U_ZERO_ERROR;
   3095     int32_t  idx = 0;
   3096     for (int32_t i=0; i<index.length(); i++) {
   3097         int32_t d=u_charDigitValue(index.charAt(i));
   3098         if (d<0) {return;}
   3099         idx = idx*10 + d;
   3100     }
   3101     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3102     vec.setElementAt(val, idx);
   3103 }
   3104 
   3105 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3106     UErrorCode  status=U_ZERO_ERROR;
   3107     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3108     vec.setElementAt(val, idx);
   3109 }
   3110 
   3111 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3112 {
   3113     UBool couldFind = TRUE;
   3114     UTEXT_SETNATIVEINDEX(utext, 0);
   3115     int32_t i = 0;
   3116     while (i < unistrOffset) {
   3117         UChar32 c = UTEXT_NEXT32(utext);
   3118         if (c != U_SENTINEL) {
   3119             i += U16_LENGTH(c);
   3120         } else {
   3121             couldFind = FALSE;
   3122             break;
   3123         }
   3124     }
   3125     nativeIndex = UTEXT_GETNATIVEINDEX(utext);
   3126     return couldFind;
   3127 }
   3128 
   3129 
   3130 void RegexTest::regex_find(const UnicodeString &pattern,
   3131                            const UnicodeString &flags,
   3132                            const UnicodeString &inputString,
   3133                            const char *srcPath,
   3134                            int32_t line) {
            //  Runs one data driven test.
            //    - Compiles pattern, with the compile options selected by flags, once
            //      from the UnicodeString and once from a UTF-8 UText copy of it.
            //    - Unescapes inputString and strips the <n>, </n>, <r> and </r> tags
            //      from it, recording the tag positions as the expected group and
            //      region boundaries.
            //    - Runs matches(), lookingAt() or find() on both matchers and reports,
            //      with errln(), results that differ from the expected boundaries or
            //      from the requireEnd() / hitEnd() values requested by flags.
            //  srcPath and line identify the test in the data file; they are used
            //  when reporting.
   3135     UnicodeString       unEscapedInput;
   3136     UnicodeString       deTaggedInput;
   3137 
   3138     int32_t             patternUTF8Length,      inputUTF8Length;
   3139     char                *patternChars  = NULL, *inputChars = NULL;
   3140     UText               patternText    = UTEXT_INITIALIZER;
   3141     UText               inputText      = UTEXT_INITIALIZER;
   3142     UConverter          *UTF8Converter = NULL;
   3143 
   3144     UErrorCode          status         = U_ZERO_ERROR;
   3145     UParseError         pe;
   3146     RegexPattern        *parsePat      = NULL;
   3147     RegexMatcher        *parseMatcher  = NULL;
   3148     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3149     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3150     UVector             groupStarts(status);
   3151     UVector             groupEnds(status);
   3152     UVector             groupStartsUTF8(status);
   3153     UVector             groupEndsUTF8(status);
   3154     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3155     UBool               failed         = FALSE;
   3156     int32_t             numFinds;
   3157     int32_t             i;
   3158     UBool               useMatchesFunc   = FALSE;
   3159     UBool               useLookingAtFunc = FALSE;
   3160     int32_t             regionStart      = -1;
   3161     int32_t             regionEnd        = -1;
   3162     int32_t             regionStartUTF8  = -1;
   3163     int32_t             regionEndUTF8    = -1;
   3164 
   3165 
   3166     //
   3167     //  Compile the caller's pattern
   3168     //
   3169     uint32_t bflags = 0;
   3170     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3171         bflags |= UREGEX_CASE_INSENSITIVE;
   3172     }
   3173     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3174         bflags |= UREGEX_COMMENTS;
   3175     }
   3176     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3177         bflags |= UREGEX_DOTALL;
   3178     }
   3179     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3180         bflags |= UREGEX_MULTILINE;
   3181     }
   3182 
   3183     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3184         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3185     }
   3186     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3187         bflags |= UREGEX_UNIX_LINES;
   3188     }
   3189 
   3190 
   3191     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3192     if (status != U_ZERO_ERROR) {
   3193         #if UCONFIG_NO_BREAK_ITERATION==1
   3194         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3195         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3196         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3197             goto cleanupAndReturn;
   3198         }
   3199         #endif
   3200         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3201             // Expected pattern compilation error.
   3202             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3203                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3204             }
   3205             goto cleanupAndReturn;
   3206         } else {
   3207             // Unexpected pattern compilation error.
   3208             errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3209             goto cleanupAndReturn;
   3210         }
   3211     }
   3212 
                //
                //  Compile the same pattern a second time, from a UTF-8 copy wrapped in a UText.
                //    The converter is set to stop on text it can not convert.
                //
   3213     UTF8Converter = ucnv_open("UTF8", &status);
   3214     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3215 
   3216     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3217     status = U_ZERO_ERROR; // buffer overflow is expected from the sizing call above (NULL buffer); clear it
   3218     patternChars = new char[patternUTF8Length+1];
   3219     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3220     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3221 
   3222     if (status == U_ZERO_ERROR) {
   3223         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3224 
   3225         if (status != U_ZERO_ERROR) {
   3226 #if UCONFIG_NO_BREAK_ITERATION==1
   3227             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3228             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3229             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3230                 goto cleanupAndReturn;
   3231             }
   3232 #endif
   3233             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3234                 // Expected pattern compilation error.
   3235                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3236                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3237                 }
   3238                 goto cleanupAndReturn;
   3239             } else {
   3240                 // Unexpected pattern compilation error.
   3241                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3242                 goto cleanupAndReturn;
   3243             }
   3244         }
   3245     }
   3246 
   3247     if (UTF8Pattern == NULL) {
   3248         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3249         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3250         status = U_ZERO_ERROR;
   3251     }
   3252 
   3253     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3254         RegexPatternDump(callerPattern);
   3255     }
   3256 
                // Getting this far means the pattern compiled; with the 'E' flag that is itself the failure.
   3257     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3258         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3259         goto cleanupAndReturn;
   3260     }
   3261 
   3262 
   3263     //
   3264     // Number of times find() should be called on the test string, default to 1
   3265     //
   3266     numFinds = 1;
   3267     for (i=2; i<=9; i++) {
   3268         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3269             if (numFinds != 1) {
   3270                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3271                 goto cleanupAndReturn;
   3272             }
   3273             numFinds = i;
   3274         }
   3275     }
   3276 
   3277     // 'M' flag.  Use matches() instead of find()
   3278     if (flags.indexOf((UChar)0x4d) >= 0) {
   3279         useMatchesFunc = TRUE;
   3280     }
                // 'L' flag.  Use lookingAt() instead of find()
   3281     if (flags.indexOf((UChar)0x4c) >= 0) {
   3282         useLookingAtFunc = TRUE;
   3283     }
   3284 
   3285     //
   3286     //  Find the tags in the input data, remove them, and record the group boundary
   3287     //    positions.
   3288     //
   3289     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3290     REGEX_CHECK_STATUS_L(line);
   3291 
   3292     unEscapedInput = inputString.unescape();
   3293     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3294     REGEX_CHECK_STATUS_L(line);
   3295     while(parseMatcher->find()) {
                    // Replacing each tag with "" accumulates the tag-free text in deTaggedInput,
                    //   so its current length is the position of the tag in the de-tagged input.
   3296         parseMatcher->appendReplacement(deTaggedInput, "", status);
                    // NOTE(review): plain REGEX_CHECK_STATUS here, REGEX_CHECK_STATUS_L(line) everywhere
                    //   else in this function - confirm that this is intended.
   3297         REGEX_CHECK_STATUS;
   3298         UnicodeString groupNum = parseMatcher->group(2, status);
   3299         if (groupNum == "r") {
   3300             // <r> or </r>, a region specification within the string
   3301             if (parseMatcher->group(1, status) == "/") {
   3302                 regionEnd = deTaggedInput.length();
   3303             } else {
   3304                 regionStart = deTaggedInput.length();
   3305             }
   3306         } else {
   3307             // <digits> or </digits>, a group match boundary tag.
   3308             if (parseMatcher->group(1, status) == "/") {
   3309                 set(groupEnds, deTaggedInput.length(), groupNum);
   3310             } else {
   3311                 set(groupStarts, deTaggedInput.length(), groupNum);
   3312             }
   3313         }
   3314     }
   3315     parseMatcher->appendTail(deTaggedInput);
   3316     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
                // A region needs its <r> tag, and the start may not lie beyond the end.
   3317     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3318       errln("mismatched <r> tags");
   3319       failed = TRUE;
   3320       goto cleanupAndReturn;
   3321     }
   3322 
   3323     //
   3324     //  Configure the matcher according to the flags specified with this test.
   3325     //
   3326     matcher = callerPattern->matcher(deTaggedInput, status);
   3327     REGEX_CHECK_STATUS_L(line);
   3328     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3329         matcher->setTrace(TRUE);
   3330     }
   3331 
                // Build the UTF-8 UText copy of the de-tagged input, and a matcher over it.
   3332     if (UTF8Pattern != NULL) {
   3333         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3334         status = U_ZERO_ERROR; // buffer overflow is expected from the sizing call above (NULL buffer); clear it
   3335         inputChars = new char[inputUTF8Length+1];
   3336         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3337         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3338 
   3339         if (status == U_ZERO_ERROR) {
   3340             UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
   3341             REGEX_CHECK_STATUS_L(line);
   3342         }
   3343 
   3344         if (UTF8Matcher == NULL) {
   3345             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3346           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3347             status = U_ZERO_ERROR;
   3348         }
   3349     }
   3350 
   3351     //
   3352     //  Generate native indices for UTF8 versions of region and capture group info
   3353     //
   3354     if (UTF8Matcher != NULL) {
   3355         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3356         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3357 
   3358         //  Fill out the native index UVector info.
   3359         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3360         for (i=0; i<groupStarts.size(); i++) {
   3361             int32_t  start = groupStarts.elementAti(i);
   3362             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3363             if (start >= 0) {
   3364                 int32_t  startUTF8;
   3365                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3366                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3367                     failed = TRUE;
   3368                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3369                 }
   3370                 setInt(groupStartsUTF8, startUTF8, i);
   3371             }
   3372 
   3373             int32_t  end = groupEnds.elementAti(i);
   3374             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3375             if (end >= 0) {
   3376                 int32_t  endUTF8;
   3377                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3378                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3379                     failed = TRUE;
   3380                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3381                 }
   3382                 setInt(groupEndsUTF8, endUTF8, i);
   3383             }
   3384         }
   3385     }
   3386 
                // Apply the region given by the <r> ... </r> tags, if there was one.
   3387     if (regionStart>=0) {
   3388        matcher->region(regionStart, regionEnd, status);
   3389        REGEX_CHECK_STATUS_L(line);
   3390        if (UTF8Matcher != NULL) {
   3391            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3392            REGEX_CHECK_STATUS_L(line);
   3393        }
   3394     }
   3395     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3396         matcher->useAnchoringBounds(FALSE);
   3397         if (UTF8Matcher != NULL) {
   3398             UTF8Matcher->useAnchoringBounds(FALSE);
   3399         }
   3400     }
   3401     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3402         matcher->useTransparentBounds(TRUE);
   3403         if (UTF8Matcher != NULL) {
   3404             UTF8Matcher->useTransparentBounds(TRUE);
   3405         }
   3406     }
   3407 
   3408 
   3409 
   3410     //
   3411     // Do a find on the de-tagged input using the caller's pattern
   3412     //     TODO: error on count>1 and not find().
   3413     //           error on both matches() and lookingAt().
   3414     //
                // isMatch / isUTF8Match are overwritten on every pass, so only the result
                //   of the last of the numFinds calls is examined afterwards.
   3415     for (i=0; i<numFinds; i++) {
   3416         if (useMatchesFunc) {
   3417             isMatch = matcher->matches(status);
   3418             if (UTF8Matcher != NULL) {
   3419                isUTF8Match = UTF8Matcher->matches(status);
   3420             }
   3421         } else  if (useLookingAtFunc) {
   3422             isMatch = matcher->lookingAt(status);
   3423             if (UTF8Matcher != NULL) {
   3424                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3425             }
   3426         } else {
   3427             isMatch = matcher->find();
   3428             if (UTF8Matcher != NULL) {
   3429                 isUTF8Match = UTF8Matcher->find();
   3430             }
   3431         }
   3432     }
   3433     matcher->setTrace(FALSE);
   3434 
   3435     //
   3436     // Match up the groups from the find() with the groups from the tags
   3437     //
   3438 
   3439     // number of tags should match number of groups from find operation.
   3440     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3441     //   G option in test means that capture group data is not available in the
   3442     //     expected results, so the check needs to be suppressed.
                // Any group tags in the test input (groupStarts not empty) mean that a match was expected.
   3443     if (isMatch == FALSE && groupStarts.size() != 0) {
   3444         errln("Error at line %d:  Match expected, but none found.", line);
   3445         failed = TRUE;
   3446         goto cleanupAndReturn;
   3447     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3448         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3449         failed = TRUE;
   3450         goto cleanupAndReturn;
   3451     }
   3452 
   3453     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3454         // Only check for match / no match.  Don't check capture groups.
   3455         if (isMatch && groupStarts.size() == 0) {
   3456             errln("Error at line %d:  No match expected, but one found.", line);
   3457             failed = TRUE;
   3458         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3459             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3460             failed = TRUE;
   3461         }
   3462         goto cleanupAndReturn;
   3463     }
   3464 
   3465     REGEX_CHECK_STATUS_L(line);
                // Compare every group of the pattern, group 0 included, against the tag positions.
                //   A group with no tag in the test input has an expected position of -1.
   3466     for (i=0; i<=matcher->groupCount(); i++) {
   3467         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3468         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3469         if (matcher->start(i, status) != expectedStart) {
   3470             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3471                 line, i, expectedStart, matcher->start(i, status));
   3472             failed = TRUE;
   3473             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3474         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3475             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3476                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3477             failed = TRUE;
   3478             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3479         }
   3480 
   3481         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3482         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3483         if (matcher->end(i, status) != expectedEnd) {
   3484             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3485                 line, i, expectedEnd, matcher->end(i, status));
   3486             failed = TRUE;
   3487             // Error on end position;  keep going; real error is probably yet to come as group
   3488             //   end positions work from end of the input data towards the front.
   3489         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3490             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3491                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3492             failed = TRUE;
   3493             // Error on end position;  keep going; real error is probably yet to come as group
   3494             //   end positions work from end of the input data towards the front.
   3495         }
   3496     }
                // The test input may not carry tags for more groups than the pattern has.
   3497     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3498         errln("Error at line %d: Expected %d capture groups, found %d.",
   3499             line, groupStarts.size()-1, matcher->groupCount());
   3500         failed = TRUE;
   3501         }
   3502     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3503         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3504               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3505         failed = TRUE;
   3506     }
   3507 
   3508     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3509         matcher->requireEnd() == TRUE) {
   3510         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3511         failed = TRUE;
   3512     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3513         UTF8Matcher->requireEnd() == TRUE) {
   3514         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3515         failed = TRUE;
   3516     }
   3517 
   3518     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3519         matcher->requireEnd() == FALSE) {
   3520         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3521         failed = TRUE;
   3522     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3523         UTF8Matcher->requireEnd() == FALSE) {
   3524         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3525         failed = TRUE;
   3526     }
   3527 
   3528     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3529         matcher->hitEnd() == TRUE) {
   3530         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3531         failed = TRUE;
   3532     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3533                UTF8Matcher->hitEnd() == TRUE) {
   3534         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3535         failed = TRUE;
   3536     }
   3537 
   3538     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3539         matcher->hitEnd() == FALSE) {
   3540         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3541         failed = TRUE;
   3542     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3543                UTF8Matcher->hitEnd() == FALSE) {
   3544         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3545         failed = TRUE;
   3546     }
   3547 
   3548 
            // Target of the goto statements above, also reached by falling through.
            //   Echoes the test (pattern, flags, input) if a failure was flagged, then
            //   releases everything that was created by this function.
   3549 cleanupAndReturn:
   3550     if (failed) {
   3551         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3552             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3553         // callerPattern->dump();
   3554     }
   3555     delete parseMatcher;
   3556     delete parsePat;
   3557     delete UTF8Matcher;
   3558     delete UTF8Pattern;
   3559     delete matcher;
   3560     delete callerPattern;
   3561 
   3562     utext_close(&inputText);
   3563     delete[] inputChars;
   3564     utext_close(&patternText);
   3565     delete[] patternChars;
   3566     ucnv_close(UTF8Converter);
   3567 }
   3568 
   3569 
   3570 
   3571 
   3572 //---------------------------------------------------------------------------
   3573 //
   3574 //      Errors     Check for error handling in patterns.
   3575 //
   3576 //---------------------------------------------------------------------------
   3577 void RegexTest::Errors() {
   3578     // \escape sequences that aren't implemented yet.
   3579     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3580 
   3581     // Missing close parentheses
   3582     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3583     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3584     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3585 
   3586     // Extra close paren
   3587     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3588     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3589     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3590 
   3591     // Look-ahead, Look-behind
   3592     //  TODO:  add tests for unbounded length look-behinds.
   3593     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3594 
   3595     // Attempt to use non-default flags
   3596     {
   3597         UParseError   pe;
   3598         UErrorCode    status = U_ZERO_ERROR;
   3599         int32_t       flags  = UREGEX_CANON_EQ |
   3600                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3601                                UREGEX_MULTILINE;
   3602         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3603         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3604         delete pat1;
   3605     }
   3606 
   3607 
   3608     // Quantifiers are allowed only after something that can be quantified.
   3609     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3610     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3611     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3612 
   3613     // Mal-formed {min,max} quantifiers
   3614     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3615     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3616     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3617     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3618     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3619     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3620     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3621     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3622     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3623 
   3624     // Ticket 5389
   3625     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3626 
   3627     // Invalid Back Reference \0
   3628     //    For ICU 3.8 and earlier
   3629     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3630     //
   3631     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3632 
   3633 }
   3634 
   3635 
   3636 //-------------------------------------------------------------------------------
   3637 //
   3638 //  Read a text data file, convert it to UChars, and return the data
   3639 //    in one big UChar * buffer, which the caller must delete.
   3640 //
   3641 //--------------------------------------------------------------------------------
   3642 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3643                                      const char *defEncoding, UErrorCode &status) {
   3644     UChar       *retPtr  = NULL;
   3645     char        *fileBuf = NULL;
   3646     UConverter* conv     = NULL;
   3647     FILE        *f       = NULL;
   3648 
   3649     ulen = 0;
   3650     if (U_FAILURE(status)) {
   3651         return retPtr;
   3652     }
   3653 
   3654     //
   3655     //  Open the file.
   3656     //
   3657     f = fopen(fileName, "rb");
   3658     if (f == 0) {
   3659         dataerrln("Error opening test data file %s\n", fileName);
   3660         status = U_FILE_ACCESS_ERROR;
   3661         return NULL;
   3662     }
   3663     //
   3664     //  Read it in
   3665     //
   3666     int32_t            fileSize;
   3667     int32_t            amt_read;
   3668 
   3669     fseek( f, 0, SEEK_END);
   3670     fileSize = ftell(f);
   3671     fileBuf = new char[fileSize];
   3672     fseek(f, 0, SEEK_SET);
   3673     amt_read = fread(fileBuf, 1, fileSize, f);
   3674     if (amt_read != fileSize || fileSize <= 0) {
   3675         errln("Error reading test data file.");
   3676         goto cleanUpAndReturn;
   3677     }
   3678 
   3679     //
   3680     // Look for a Unicode Signature (BOM) on the data just read
   3681     //
   3682     int32_t        signatureLength;
   3683     const char *   fileBufC;
   3684     const char*    encoding;
   3685 
   3686     fileBufC = fileBuf;
   3687     encoding = ucnv_detectUnicodeSignature(
   3688         fileBuf, fileSize, &signatureLength, &status);
   3689     if(encoding!=NULL ){
   3690         fileBufC  += signatureLength;
   3691         fileSize  -= signatureLength;
   3692     } else {
   3693         encoding = defEncoding;
   3694         if (strcmp(encoding, "utf-8") == 0) {
   3695             errln("file %s is missing its BOM", fileName);
   3696         }
   3697     }
   3698 
   3699     //
   3700     // Open a converter to take the rule file to UTF-16
   3701     //
   3702     conv = ucnv_open(encoding, &status);
   3703     if (U_FAILURE(status)) {
   3704         goto cleanUpAndReturn;
   3705     }
   3706 
   3707     //
   3708     // Convert the rules to UChar.
   3709     //  Preflight first to determine required buffer size.
   3710     //
   3711     ulen = ucnv_toUChars(conv,
   3712         NULL,           //  dest,
   3713         0,              //  destCapacity,
   3714         fileBufC,
   3715         fileSize,
   3716         &status);
   3717     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3718         // Buffer Overflow is expected from the preflight operation.
   3719         status = U_ZERO_ERROR;
   3720 
   3721         retPtr = new UChar[ulen+1];
   3722         ucnv_toUChars(conv,
   3723             retPtr,       //  dest,
   3724             ulen+1,
   3725             fileBufC,
   3726             fileSize,
   3727             &status);
   3728     }
   3729 
   3730 cleanUpAndReturn:
   3731     fclose(f);
   3732     delete[] fileBuf;
   3733     ucnv_close(conv);
   3734     if (U_FAILURE(status)) {
   3735         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3736         delete retPtr;
   3737         retPtr = 0;
   3738         ulen   = 0;
   3739     };
   3740     return retPtr;
   3741 }
   3742 
   3743 
   3744 //-------------------------------------------------------------------------------
   3745 //
   3746 //   PerlTests  - Run Perl's regular expression tests
   3747 //                The input file for this test is re_tests, the standard regular
   3748 //                expression test data distributed with the Perl source code.
   3749 //
   3750 //                Here is Perl's description of the test data file:
   3751 //
   3752 //        # The tests are in a separate file 't/op/re_tests'.
   3753 //        # Each line in that file is a separate test.
   3754 //        # There are five columns, separated by tabs.
   3755 //        #
   3756 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3757 //        # Modifiers can be put after the closing C<'>.
   3758 //        #
   3759 //        # Column 2 contains the string to be matched.
   3760 //        #
   3761 //        # Column 3 contains the expected result:
   3762 //        #     y   expect a match
   3763 //        #     n   expect no match
   3764 //        #     c   expect an error
   3765 //        # B   test exposes a known bug in Perl, should be skipped
   3766 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3767 //        #
   3768 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3769 //        #
   3770 //        # Column 4 contains a string, usually C<$&>.
   3771 //        #
   3772 //        # Column 5 contains the expected result of double-quote
   3773 //        # interpolating that string after the match, or start of error message.
   3774 //        #
   3775 //        # Column 6, if present, contains a reason why the test is skipped.
   3776 //        # This is printed with "skipped", for harness to pick up.
   3777 //        #
   3778 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3779 //        #
   3780 //        # If you want to add a regular expression test that can't be expressed
   3781 //        # in this format, don't add it here: put it in op/pat.t instead.
   3782 //
   3783 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   3784 //        The test exposes is some known incompatibility between ICU and Perl regexps.
   3785 //        (The i is in addition to whatever was there before.)
   3786 //
   3787 //-------------------------------------------------------------------------------
   3788 void RegexTest::PerlTests() {
   3789     char tdd[2048];
   3790     const char *srcPath;
   3791     UErrorCode  status = U_ZERO_ERROR;
   3792     UParseError pe;
   3793 
   3794     //
   3795     //  Open and read the test data file.
   3796     //
   3797     srcPath=getPath(tdd, "re_tests.txt");
   3798     if(srcPath==NULL) {
   3799         return; /* something went wrong, error already output */
   3800     }
   3801 
   3802     int32_t    len;
   3803     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3804     if (U_FAILURE(status)) {
   3805         return; /* something went wrong, error already output */
   3806     }
   3807 
   3808     //
   3809     //  Put the test data into a UnicodeString
   3810     //
   3811     UnicodeString testDataString(FALSE, testData, len);
   3812 
   3813     //
   3814     //  Regex to break the input file into lines, and strip the new lines.
   3815     //     One line per match, capture group one is the desired data.
   3816     //
   3817     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3818     if (U_FAILURE(status)) {
   3819         dataerrln("RegexPattern::compile() error");
   3820         return;
   3821     }
   3822     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3823 
   3824     //
   3825     //  Regex to split a test file line into fields.
   3826     //    There are six fields, separated by tabs.
   3827     //
   3828     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3829 
   3830     //
   3831     //  Regex to identify test patterns with flag settings, and to separate them.
   3832     //    Test patterns with flags look like 'pattern'i
   3833     //    Test patterns without flags are not quoted:   pattern
   3834     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3835     //
   3836     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3837     RegexMatcher* flagMat = flagPat->matcher(status);
   3838 
   3839     //
   3840     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3841     //   in the test data.  Not being perl, this must be done explicitly.  Here
   3842     //   are string constants and REs for these constructs.
   3843     //
   3844     UnicodeString nulnulSrc("${nulnul}");
   3845     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   3846     nulnul = nulnul.unescape();
   3847 
   3848     UnicodeString ffffSrc("${ffff}");
   3849     UnicodeString ffff("\\uffff", -1, US_INV);
   3850     ffff = ffff.unescape();
   3851 
   3852     //  regexp for $-[0], $+[2], etc.
   3853     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   3854     RegexMatcher *groupsMat = groupsPat->matcher(status);
   3855 
   3856     //  regexp for $0, $1, $2, etc.
   3857     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   3858     RegexMatcher *cgMat = cgPat->matcher(status);
   3859 
   3860 
   3861     //
   3862     // Main Loop for the Perl Tests, runs once per line from the
   3863     //   test data file.
   3864     //
   3865     int32_t  lineNum = 0;
   3866     int32_t  skippedUnimplementedCount = 0;
   3867     while (lineMat->find()) {
   3868         lineNum++;
   3869 
   3870         //
   3871         //  Get a line, break it into its fields, do the Perl
   3872         //    variable substitutions.
   3873         //
   3874         UnicodeString line = lineMat->group(1, status);
   3875         UnicodeString fields[7];
   3876         fieldPat->split(line, fields, 7, status);
   3877 
   3878         flagMat->reset(fields[0]);
   3879         flagMat->matches(status);
   3880         UnicodeString pattern  = flagMat->group(2, status);
   3881         pattern.findAndReplace("${bang}", "!");
   3882         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   3883         pattern.findAndReplace(ffffSrc, ffff);
   3884 
   3885         //
   3886         //  Identify patterns that include match flag settings,
   3887         //    split off the flags, remove the extra quotes.
   3888         //
   3889         UnicodeString flagStr = flagMat->group(3, status);
   3890         if (U_FAILURE(status)) {
   3891             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3892             return;
   3893         }
   3894         int32_t flags = 0;
   3895         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   3896         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   3897         const UChar UChar_m = 0x6d;
   3898         const UChar UChar_x = 0x78;
   3899         const UChar UChar_y = 0x79;
   3900         if (flagStr.indexOf(UChar_i) != -1) {
   3901             flags |= UREGEX_CASE_INSENSITIVE;
   3902         }
   3903         if (flagStr.indexOf(UChar_m) != -1) {
   3904             flags |= UREGEX_MULTILINE;
   3905         }
   3906         if (flagStr.indexOf(UChar_x) != -1) {
   3907             flags |= UREGEX_COMMENTS;
   3908         }
   3909 
   3910         //
   3911         // Compile the test pattern.
   3912         //
   3913         status = U_ZERO_ERROR;
   3914         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   3915         if (status == U_REGEX_UNIMPLEMENTED) {
   3916             //
   3917             // Test of a feature that is planned for ICU, but not yet implemented.
   3918             //   skip the test.
   3919             skippedUnimplementedCount++;
   3920             delete testPat;
   3921             status = U_ZERO_ERROR;
   3922             continue;
   3923         }
   3924 
   3925         if (U_FAILURE(status)) {
   3926             // Some tests are supposed to generate errors.
   3927             //   Only report an error for tests that are supposed to succeed.
   3928             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   3929                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   3930             {
   3931                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   3932             }
   3933             status = U_ZERO_ERROR;
   3934             delete testPat;
   3935             continue;
   3936         }
   3937 
   3938         if (fields[2].indexOf(UChar_i) >= 0) {
   3939             // ICU should skip this test.
   3940             delete testPat;
   3941             continue;
   3942         }
   3943 
   3944         if (fields[2].indexOf(UChar_c) >= 0) {
   3945             // This pattern should have caused a compilation error, but didn't/
   3946             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   3947             delete testPat;
   3948             continue;
   3949         }
   3950 
   3951         //
   3952         // replace the Perl variables that appear in some of the
   3953         //   match data strings.
   3954         //
   3955         UnicodeString matchString = fields[1];
   3956         matchString.findAndReplace(nulnulSrc, nulnul);
   3957         matchString.findAndReplace(ffffSrc,   ffff);
   3958 
   3959         // Replace any \n in the match string with an actual new-line char.
   3960         //  Don't do full unescape, as this unescapes more than Perl does, which
   3961         //  causes other spurious failures in the tests.
   3962         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   3963 
   3964 
   3965 
   3966         //
   3967         // Run the test, check for expected match/don't match result.
   3968         //
   3969         RegexMatcher *testMat = testPat->matcher(matchString, status);
   3970         UBool found = testMat->find();
   3971         UBool expected = FALSE;
   3972         if (fields[2].indexOf(UChar_y) >=0) {
   3973             expected = TRUE;
   3974         }
   3975         if (expected != found) {
   3976             errln("line %d: Expected %smatch, got %smatch",
   3977                 lineNum, expected?"":"no ", found?"":"no " );
   3978             continue;
   3979         }
   3980 
   3981         // Don't try to check expected results if there is no match.
   3982         //   (Some have stuff in the expected fields)
   3983         if (!found) {
   3984             delete testMat;
   3985             delete testPat;
   3986             continue;
   3987         }
   3988 
   3989         //
   3990         // Interpret the Perl expression from the fourth field of the data file,
   3991         // building up an ICU string from the results of the ICU match.
   3992         //   The Perl expression will contain references to the results of
   3993         //     a regex match, including the matched string, capture group strings,
   3994         //     group starting and ending indicies, etc.
   3995         //
   3996         UnicodeString resultString;
   3997         UnicodeString perlExpr = fields[3];
   3998 #if SUPPORT_MUTATING_INPUT_STRING
   3999         groupsMat->reset(perlExpr);
   4000         cgMat->reset(perlExpr);
   4001 #endif
   4002 
   4003         while (perlExpr.length() > 0) {
   4004 #if !SUPPORT_MUTATING_INPUT_STRING
   4005             //  Perferred usage.  Reset after any modification to input string.
   4006             groupsMat->reset(perlExpr);
   4007             cgMat->reset(perlExpr);
   4008 #endif
   4009 
   4010             if (perlExpr.startsWith("$&")) {
   4011                 resultString.append(testMat->group(status));
   4012                 perlExpr.remove(0, 2);
   4013             }
   4014 
   4015             else if (groupsMat->lookingAt(status)) {
   4016                 // $-[0]   $+[2]  etc.
   4017                 UnicodeString digitString = groupsMat->group(2, status);
   4018                 int32_t t = 0;
   4019                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4020                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4021                 int32_t matchPosition;
   4022                 if (plusOrMinus.compare("+") == 0) {
   4023                     matchPosition = testMat->end(groupNum, status);
   4024                 } else {
   4025                     matchPosition = testMat->start(groupNum, status);
   4026                 }
   4027                 if (matchPosition != -1) {
   4028                     ICU_Utility::appendNumber(resultString, matchPosition);
   4029                 }
   4030                 perlExpr.remove(0, groupsMat->end(status));
   4031             }
   4032 
   4033             else if (cgMat->lookingAt(status)) {
   4034                 // $1, $2, $3, etc.
   4035                 UnicodeString digitString = cgMat->group(1, status);
   4036                 int32_t t = 0;
   4037                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4038                 if (U_SUCCESS(status)) {
   4039                     resultString.append(testMat->group(groupNum, status));
   4040                     status = U_ZERO_ERROR;
   4041                 }
   4042                 perlExpr.remove(0, cgMat->end(status));
   4043             }
   4044 
   4045             else if (perlExpr.startsWith("@-")) {
   4046                 int32_t i;
   4047                 for (i=0; i<=testMat->groupCount(); i++) {
   4048                     if (i>0) {
   4049                         resultString.append(" ");
   4050                     }
   4051                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4052                 }
   4053                 perlExpr.remove(0, 2);
   4054             }
   4055 
   4056             else if (perlExpr.startsWith("@+")) {
   4057                 int32_t i;
   4058                 for (i=0; i<=testMat->groupCount(); i++) {
   4059                     if (i>0) {
   4060                         resultString.append(" ");
   4061                     }
   4062                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4063                 }
   4064                 perlExpr.remove(0, 2);
   4065             }
   4066 
   4067             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4068                                                      //           or as an escaped sequence (e.g. \n)
   4069                 if (perlExpr.length() > 1) {
   4070                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4071                 }
   4072                 UChar c = perlExpr.charAt(0);
   4073                 switch (c) {
   4074                 case 'n':   c = '\n'; break;
   4075                 // add any other escape sequences that show up in the test expected results.
   4076                 }
   4077                 resultString.append(c);
   4078                 perlExpr.remove(0, 1);
   4079             }
   4080 
   4081             else  {
   4082                 // Any characters from the perl expression that we don't explicitly
   4083                 //  recognize before here are assumed to be literals and copied
   4084                 //  as-is to the expected results.
   4085                 resultString.append(perlExpr.charAt(0));
   4086                 perlExpr.remove(0, 1);
   4087             }
   4088 
   4089             if (U_FAILURE(status)) {
   4090                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4091                 break;
   4092             }
   4093         }
   4094 
   4095         //
   4096         // Expected Results Compare
   4097         //
   4098         UnicodeString expectedS(fields[4]);
   4099         expectedS.findAndReplace(nulnulSrc, nulnul);
   4100         expectedS.findAndReplace(ffffSrc,   ffff);
   4101         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4102 
   4103 
   4104         if (expectedS.compare(resultString) != 0) {
   4105             err("Line %d: Incorrect perl expression results.", lineNum);
   4106             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4107         }
   4108 
   4109         delete testMat;
   4110         delete testPat;
   4111     }
   4112 
   4113     //
   4114     // All done.  Clean up allocated stuff.
   4115     //
   4116     delete cgMat;
   4117     delete cgPat;
   4118 
   4119     delete groupsMat;
   4120     delete groupsPat;
   4121 
   4122     delete flagMat;
   4123     delete flagPat;
   4124 
   4125     delete lineMat;
   4126     delete linePat;
   4127 
   4128     delete fieldPat;
   4129     delete [] testData;
   4130 
   4131 
   4132     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4133 
   4134 }
   4135 
   4136 
   4137 //-------------------------------------------------------------------------------
   4138 //
   4139 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4140 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4141 //                  The input file for this test is re_tests, the standard regular
   4142 //                  expression test data distributed with the Perl source code.
   4143 //                  See PerlTests() for more information.
   4144 //
   4145 //-------------------------------------------------------------------------------
   4146 void RegexTest::PerlTestsUTF8() {
   4147     char tdd[2048];
   4148     const char *srcPath;
   4149     UErrorCode  status = U_ZERO_ERROR;
   4150     UParseError pe;
   4151     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4152     UText       patternText = UTEXT_INITIALIZER;
   4153     char       *patternChars = NULL;
   4154     int32_t     patternLength;
   4155     int32_t     patternCapacity = 0;
   4156     UText       inputText = UTEXT_INITIALIZER;
   4157     char       *inputChars = NULL;
   4158     int32_t     inputLength;
   4159     int32_t     inputCapacity = 0;
   4160 
   4161     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4162 
   4163     //
   4164     //  Open and read the test data file.
   4165     //
   4166     srcPath=getPath(tdd, "re_tests.txt");
   4167     if(srcPath==NULL) {
   4168         return; /* something went wrong, error already output */
   4169     }
   4170 
   4171     int32_t    len;
   4172     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4173     if (U_FAILURE(status)) {
   4174         return; /* something went wrong, error already output */
   4175     }
   4176 
   4177     //
   4178     //  Put the test data into a UnicodeString
   4179     //
   4180     UnicodeString testDataString(FALSE, testData, len);
   4181 
   4182     //
   4183     //  Regex to break the input file into lines, and strip the new lines.
   4184     //     One line per match, capture group one is the desired data.
   4185     //
   4186     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4187     if (U_FAILURE(status)) {
   4188         dataerrln("RegexPattern::compile() error");
   4189         return;
   4190     }
   4191     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4192 
   4193     //
   4194     //  Regex to split a test file line into fields.
   4195     //    There are six fields, separated by tabs.
   4196     //
   4197     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4198 
   4199     //
   4200     //  Regex to identify test patterns with flag settings, and to separate them.
   4201     //    Test patterns with flags look like 'pattern'i
   4202     //    Test patterns without flags are not quoted:   pattern
   4203     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4204     //
   4205     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4206     RegexMatcher* flagMat = flagPat->matcher(status);
   4207 
   4208     //
   4209     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4210     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4211     //   are string constants and REs for these constructs.
   4212     //
   4213     UnicodeString nulnulSrc("${nulnul}");
   4214     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4215     nulnul = nulnul.unescape();
   4216 
   4217     UnicodeString ffffSrc("${ffff}");
   4218     UnicodeString ffff("\\uffff", -1, US_INV);
   4219     ffff = ffff.unescape();
   4220 
   4221     //  regexp for $-[0], $+[2], etc.
   4222     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4223     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4224 
   4225     //  regexp for $0, $1, $2, etc.
   4226     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4227     RegexMatcher *cgMat = cgPat->matcher(status);
   4228 
   4229 
   4230     //
   4231     // Main Loop for the Perl Tests, runs once per line from the
   4232     //   test data file.
   4233     //
   4234     int32_t  lineNum = 0;
   4235     int32_t  skippedUnimplementedCount = 0;
   4236     while (lineMat->find()) {
   4237         lineNum++;
   4238 
   4239         //
   4240         //  Get a line, break it into its fields, do the Perl
   4241         //    variable substitutions.
   4242         //
   4243         UnicodeString line = lineMat->group(1, status);
   4244         UnicodeString fields[7];
   4245         fieldPat->split(line, fields, 7, status);
   4246 
   4247         flagMat->reset(fields[0]);
   4248         flagMat->matches(status);
   4249         UnicodeString pattern  = flagMat->group(2, status);
   4250         pattern.findAndReplace("${bang}", "!");
   4251         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4252         pattern.findAndReplace(ffffSrc, ffff);
   4253 
   4254         //
   4255         //  Identify patterns that include match flag settings,
   4256         //    split off the flags, remove the extra quotes.
   4257         //
   4258         UnicodeString flagStr = flagMat->group(3, status);
   4259         if (U_FAILURE(status)) {
   4260             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4261             return;
   4262         }
   4263         int32_t flags = 0;
   4264         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4265         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4266         const UChar UChar_m = 0x6d;
   4267         const UChar UChar_x = 0x78;
   4268         const UChar UChar_y = 0x79;
   4269         if (flagStr.indexOf(UChar_i) != -1) {
   4270             flags |= UREGEX_CASE_INSENSITIVE;
   4271         }
   4272         if (flagStr.indexOf(UChar_m) != -1) {
   4273             flags |= UREGEX_MULTILINE;
   4274         }
   4275         if (flagStr.indexOf(UChar_x) != -1) {
   4276             flags |= UREGEX_COMMENTS;
   4277         }
   4278 
   4279         //
   4280         // Put the pattern in a UTF-8 UText
   4281         //
   4282         status = U_ZERO_ERROR;
   4283         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4284         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4285             status = U_ZERO_ERROR;
   4286             delete[] patternChars;
   4287             patternCapacity = patternLength + 1;
   4288             patternChars = new char[patternCapacity];
   4289             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4290         }
   4291         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4292 
   4293         //
   4294         // Compile the test pattern.
   4295         //
   4296         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4297         if (status == U_REGEX_UNIMPLEMENTED) {
   4298             //
   4299             // Test of a feature that is planned for ICU, but not yet implemented.
   4300             //   skip the test.
   4301             skippedUnimplementedCount++;
   4302             delete testPat;
   4303             status = U_ZERO_ERROR;
   4304             continue;
   4305         }
   4306 
   4307         if (U_FAILURE(status)) {
   4308             // Some tests are supposed to generate errors.
   4309             //   Only report an error for tests that are supposed to succeed.
   4310             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4311                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4312             {
   4313                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4314             }
   4315             status = U_ZERO_ERROR;
   4316             delete testPat;
   4317             continue;
   4318         }
   4319 
   4320         if (fields[2].indexOf(UChar_i) >= 0) {
   4321             // ICU should skip this test.
   4322             delete testPat;
   4323             continue;
   4324         }
   4325 
   4326         if (fields[2].indexOf(UChar_c) >= 0) {
   4327             // This pattern should have caused a compilation error, but didn't/
   4328             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4329             delete testPat;
   4330             continue;
   4331         }
   4332 
   4333 
   4334         //
   4335         // replace the Perl variables that appear in some of the
   4336         //   match data strings.
   4337         //
   4338         UnicodeString matchString = fields[1];
   4339         matchString.findAndReplace(nulnulSrc, nulnul);
   4340         matchString.findAndReplace(ffffSrc,   ffff);
   4341 
   4342         // Replace any \n in the match string with an actual new-line char.
   4343         //  Don't do full unescape, as this unescapes more than Perl does, which
   4344         //  causes other spurious failures in the tests.
   4345         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4346 
   4347         //
   4348         // Put the input in a UTF-8 UText
   4349         //
   4350         status = U_ZERO_ERROR;
   4351         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4352         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4353             status = U_ZERO_ERROR;
   4354             delete[] inputChars;
   4355             inputCapacity = inputLength + 1;
   4356             inputChars = new char[inputCapacity];
   4357             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4358         }
   4359         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4360 
   4361         //
   4362         // Run the test, check for expected match/don't match result.
   4363         //
   4364         RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
   4365         UBool found = testMat->find();
   4366         UBool expected = FALSE;
   4367         if (fields[2].indexOf(UChar_y) >=0) {
   4368             expected = TRUE;
   4369         }
   4370         if (expected != found) {
   4371             errln("line %d: Expected %smatch, got %smatch",
   4372                 lineNum, expected?"":"no ", found?"":"no " );
   4373             continue;
   4374         }
   4375 
   4376         // Don't try to check expected results if there is no match.
   4377         //   (Some have stuff in the expected fields)
   4378         if (!found) {
   4379             delete testMat;
   4380             delete testPat;
   4381             continue;
   4382         }
   4383 
   4384         //
   4385         // Interpret the Perl expression from the fourth field of the data file,
   4386         // building up an ICU string from the results of the ICU match.
   4387         //   The Perl expression will contain references to the results of
   4388         //     a regex match, including the matched string, capture group strings,
   4389         //     group starting and ending indicies, etc.
   4390         //
   4391         UnicodeString resultString;
   4392         UnicodeString perlExpr = fields[3];
   4393 
   4394         while (perlExpr.length() > 0) {
   4395             groupsMat->reset(perlExpr);
   4396             cgMat->reset(perlExpr);
   4397 
   4398             if (perlExpr.startsWith("$&")) {
   4399                 resultString.append(testMat->group(status));
   4400                 perlExpr.remove(0, 2);
   4401             }
   4402 
   4403             else if (groupsMat->lookingAt(status)) {
   4404                 // $-[0]   $+[2]  etc.
   4405                 UnicodeString digitString = groupsMat->group(2, status);
   4406                 int32_t t = 0;
   4407                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4408                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4409                 int32_t matchPosition;
   4410                 if (plusOrMinus.compare("+") == 0) {
   4411                     matchPosition = testMat->end(groupNum, status);
   4412                 } else {
   4413                     matchPosition = testMat->start(groupNum, status);
   4414                 }
   4415                 if (matchPosition != -1) {
   4416                     ICU_Utility::appendNumber(resultString, matchPosition);
   4417                 }
   4418                 perlExpr.remove(0, groupsMat->end(status));
   4419             }
   4420 
   4421             else if (cgMat->lookingAt(status)) {
   4422                 // $1, $2, $3, etc.
   4423                 UnicodeString digitString = cgMat->group(1, status);
   4424                 int32_t t = 0;
   4425                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4426                 if (U_SUCCESS(status)) {
   4427                     resultString.append(testMat->group(groupNum, status));
   4428                     status = U_ZERO_ERROR;
   4429                 }
   4430                 perlExpr.remove(0, cgMat->end(status));
   4431             }
   4432 
   4433             else if (perlExpr.startsWith("@-")) {
   4434                 int32_t i;
   4435                 for (i=0; i<=testMat->groupCount(); i++) {
   4436                     if (i>0) {
   4437                         resultString.append(" ");
   4438                     }
   4439                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4440                 }
   4441                 perlExpr.remove(0, 2);
   4442             }
   4443 
   4444             else if (perlExpr.startsWith("@+")) {
   4445                 int32_t i;
   4446                 for (i=0; i<=testMat->groupCount(); i++) {
   4447                     if (i>0) {
   4448                         resultString.append(" ");
   4449                     }
   4450                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4451                 }
   4452                 perlExpr.remove(0, 2);
   4453             }
   4454 
   4455             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4456                                                      //           or as an escaped sequence (e.g. \n)
   4457                 if (perlExpr.length() > 1) {
   4458                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4459                 }
   4460                 UChar c = perlExpr.charAt(0);
   4461                 switch (c) {
   4462                 case 'n':   c = '\n'; break;
   4463                 // add any other escape sequences that show up in the test expected results.
   4464                 }
   4465                 resultString.append(c);
   4466                 perlExpr.remove(0, 1);
   4467             }
   4468 
   4469             else  {
   4470                 // Any characters from the perl expression that we don't explicitly
   4471                 //  recognize before here are assumed to be literals and copied
   4472                 //  as-is to the expected results.
   4473                 resultString.append(perlExpr.charAt(0));
   4474                 perlExpr.remove(0, 1);
   4475             }
   4476 
   4477             if (U_FAILURE(status)) {
   4478                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4479                 break;
   4480             }
   4481         }
   4482 
   4483         //
   4484         // Expected Results Compare
   4485         //
   4486         UnicodeString expectedS(fields[4]);
   4487         expectedS.findAndReplace(nulnulSrc, nulnul);
   4488         expectedS.findAndReplace(ffffSrc,   ffff);
   4489         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4490 
   4491 
   4492         if (expectedS.compare(resultString) != 0) {
   4493             err("Line %d: Incorrect perl expression results.", lineNum);
   4494             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4495         }
   4496 
   4497         delete testMat;
   4498         delete testPat;
   4499     }
   4500 
   4501     //
   4502     // All done.  Clean up allocated stuff.
   4503     //
   4504     delete cgMat;
   4505     delete cgPat;
   4506 
   4507     delete groupsMat;
   4508     delete groupsPat;
   4509 
   4510     delete flagMat;
   4511     delete flagPat;
   4512 
   4513     delete lineMat;
   4514     delete linePat;
   4515 
   4516     delete fieldPat;
   4517     delete [] testData;
   4518 
   4519     utext_close(&patternText);
   4520     utext_close(&inputText);
   4521 
   4522     delete [] patternChars;
   4523     delete [] inputChars;
   4524 
   4525 
   4526     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4527 
   4528 }
   4529 
   4530 
   4531 //--------------------------------------------------------------
   4532 //
   4533 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4534 //             Use this pattern,
   4535 //                 "(a?){1,}"
   4536 //             The zero-length match will repeat forever.
   4537 //                (That this goes into a loop is another bug)
   4538 //
   4539 //---------------------------------------------------------------
   4540 void RegexTest::Bug6149() {
   4541     UnicodeString pattern("(a?){1,}");
   4542     UnicodeString s("xyz");
   4543     uint32_t flags = 0;
   4544     UErrorCode status = U_ZERO_ERROR;
   4545 
   4546     RegexMatcher  matcher(pattern, s, flags, status);
   4547     UBool result = false;
   4548     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4549     REGEX_ASSERT(result == FALSE);
   4550  }
   4551 
   4552 
   4553 //
   4554 //   Callbacks()    Test the callback function.
   4555 //                  When set, callbacks occur periodically during matching operations,
   4556 //                  giving the application code the ability to abort the operation
   4557 //                  before its normal completion.
   4558 //
   4559 
   4560 struct callBackContext {
   4561     RegexTest        *test;
   4562     int32_t          maxCalls;
   4563     int32_t          numCalls;
   4564     int32_t          lastSteps;
   4565     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4566 };
   4567 
   4568 U_CDECL_BEGIN
   4569 static UBool U_CALLCONV
   4570 testCallBackFn(const void *context, int32_t steps) {
   4571     callBackContext  *info = (callBackContext *)context;
   4572     if (info->lastSteps+1 != steps) {
   4573         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4574     }
   4575     info->lastSteps = steps;
   4576     info->numCalls++;
   4577     return (info->numCalls < info->maxCalls);
   4578 }
   4579 U_CDECL_END
   4580 
   4581 void RegexTest::Callbacks() {
   4582    {
   4583         // Getter returns NULLs if no callback has been set
   4584 
   4585         //   The variables that the getter will fill in.
   4586         //   Init to non-null values so that the action of the getter can be seen.
   4587         const void          *returnedContext = &returnedContext;
   4588         URegexMatchCallback *returnedFn = &testCallBackFn;
   4589 
   4590         UErrorCode status = U_ZERO_ERROR;
   4591         RegexMatcher matcher("x", 0, status);
   4592         REGEX_CHECK_STATUS;
   4593         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4594         REGEX_CHECK_STATUS;
   4595         REGEX_ASSERT(returnedFn == NULL);
   4596         REGEX_ASSERT(returnedContext == NULL);
   4597     }
   4598 
   4599    {
   4600         // Set and Get work
   4601         callBackContext cbInfo = {this, 0, 0, 0};
   4602         const void          *returnedContext;
   4603         URegexMatchCallback *returnedFn;
   4604         UErrorCode status = U_ZERO_ERROR;
   4605         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4606         REGEX_CHECK_STATUS;
   4607         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4608         REGEX_CHECK_STATUS;
   4609         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4610         REGEX_CHECK_STATUS;
   4611         REGEX_ASSERT(returnedFn == testCallBackFn);
   4612         REGEX_ASSERT(returnedContext == &cbInfo);
   4613 
   4614         // A short-running match shouldn't invoke the callback
   4615         status = U_ZERO_ERROR;
   4616         cbInfo.reset(1);
   4617         UnicodeString s = "xxx";
   4618         matcher.reset(s);
   4619         REGEX_ASSERT(matcher.matches(status));
   4620         REGEX_CHECK_STATUS;
   4621         REGEX_ASSERT(cbInfo.numCalls == 0);
   4622 
   4623         // A medium-length match that runs long enough to invoke the
   4624         //   callback, but not so long that the callback aborts it.
   4625         status = U_ZERO_ERROR;
   4626         cbInfo.reset(4);
   4627         s = "aaaaaaaaaaaaaaaaaaab";
   4628         matcher.reset(s);
   4629         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4630         REGEX_CHECK_STATUS;
   4631         REGEX_ASSERT(cbInfo.numCalls > 0);
   4632 
   4633         // A longer running match that the callback function will abort.
   4634         status = U_ZERO_ERROR;
   4635         cbInfo.reset(4);
   4636         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4637         matcher.reset(s);
   4638         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4639         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4640         REGEX_ASSERT(cbInfo.numCalls == 4);
   4641     }
   4642 
   4643 
   4644 }
   4645 
   4646 
   4647 //
   4648 //   FindProgressCallbacks()    Test the find "progress" callback function.
   4649 //                  When set, the find progress callback will be invoked during find operations
   4650 //                  after each return from a match attempt, giving the application the opportunity
   4651 //                  to terminate a long-running find operation before its normal completion.
   4652 //
   4653 
   4654 struct progressCallBackContext {
   4655     RegexTest        *test;
   4656     int64_t          lastIndex;
   4657     int32_t          maxCalls;
   4658     int32_t          numCalls;
   4659     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4660 };
   4661 
   4662 U_CDECL_BEGIN
   4663 static UBool U_CALLCONV
   4664 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4665     progressCallBackContext  *info = (progressCallBackContext *)context;
   4666     info->numCalls++;
   4667     info->lastIndex = matchIndex;
   4668 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4669     return (info->numCalls < info->maxCalls);
   4670 }
   4671 U_CDECL_END
   4672 
   4673 void RegexTest::FindProgressCallbacks() {
   4674    {
   4675         // Getter returns NULLs if no callback has been set
   4676 
   4677         //   The variables that the getter will fill in.
   4678         //   Init to non-null values so that the action of the getter can be seen.
   4679         const void                  *returnedContext = &returnedContext;
   4680         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4681 
   4682         UErrorCode status = U_ZERO_ERROR;
   4683         RegexMatcher matcher("x", 0, status);
   4684         REGEX_CHECK_STATUS;
   4685         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4686         REGEX_CHECK_STATUS;
   4687         REGEX_ASSERT(returnedFn == NULL);
   4688         REGEX_ASSERT(returnedContext == NULL);
   4689     }
   4690 
   4691    {
   4692         // Set and Get work
   4693         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4694         const void                  *returnedContext;
   4695         URegexFindProgressCallback  *returnedFn;
   4696         UErrorCode status = U_ZERO_ERROR;
   4697         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4698         REGEX_CHECK_STATUS;
   4699         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4700         REGEX_CHECK_STATUS;
   4701         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4702         REGEX_CHECK_STATUS;
   4703         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4704         REGEX_ASSERT(returnedContext == &cbInfo);
   4705 
   4706         // A short-running match should NOT invoke the callback.
   4707         status = U_ZERO_ERROR;
   4708         cbInfo.reset(100);
   4709         UnicodeString s = "abxxx";
   4710         matcher.reset(s);
   4711 #if 0
   4712         matcher.setTrace(TRUE);
   4713 #endif
   4714         REGEX_ASSERT(matcher.find(0, status));
   4715         REGEX_CHECK_STATUS;
   4716         REGEX_ASSERT(cbInfo.numCalls == 0);
   4717 
   4718         // A medium running match that causes matcher.find() to invoke our callback for each index.
   4719         status = U_ZERO_ERROR;
   4720         s = "aaaaaaaaaaaaaaaaaaab";
   4721         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4722         matcher.reset(s);
   4723         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4724         REGEX_CHECK_STATUS;
   4725         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4726 
   4727         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4728         status = U_ZERO_ERROR;
   4729         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4730         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4731         matcher.reset(s1);
   4732         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4733         REGEX_CHECK_STATUS;
   4734         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4735 
   4736 #if 0
   4737         // Now a match that will succeed, but after an interruption
   4738         status = U_ZERO_ERROR;
   4739         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4740         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4741         matcher.reset(s2);
   4742         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4743         REGEX_CHECK_STATUS;
   4744         // Now retry the match from where left off
   4745         cbInfo.maxCalls = 100; //  No callback limit
   4746         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   4747         REGEX_CHECK_STATUS;
   4748 #endif
   4749     }
   4750 
   4751 
   4752 }
   4753 
   4754 
   4755 //---------------------------------------------------------------------------
   4756 //
   4757 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4758 //                             UTexts. The pure-C implementation of UText
   4759 //                             has no mutable backing stores, but we can
   4760 //                             use UnicodeString here to test the functionality.
   4761 //
   4762 //---------------------------------------------------------------------------
   4763 void RegexTest::PreAllocatedUTextCAPI () {
   4764     UErrorCode           status = U_ZERO_ERROR;
   4765     URegularExpression  *re;
   4766     UText                patternText = UTEXT_INITIALIZER;
   4767     UnicodeString        buffer;
   4768     UText                bufferText = UTEXT_INITIALIZER;
   4769 
   4770     utext_openUnicodeString(&bufferText, &buffer, &status);
   4771 
   4772     /*
   4773      *  getText() and getUText()
   4774      */
   4775     {
   4776         UText  text1 = UTEXT_INITIALIZER;
   4777         UText  text2 = UTEXT_INITIALIZER;
   4778         UChar  text2Chars[20];
   4779         UText  *resultText;
   4780 
   4781         status = U_ZERO_ERROR;
   4782         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   4783         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   4784         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4785         utext_openUChars(&text2, text2Chars, -1, &status);
   4786 
   4787         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   4788         re = uregex_openUText(&patternText, 0, NULL, &status);
   4789 
   4790         /* First set a UText */
   4791         uregex_setUText(re, &text1, &status);
   4792         resultText = uregex_getUText(re, &bufferText, &status);
   4793         REGEX_CHECK_STATUS;
   4794         REGEX_ASSERT(resultText == &bufferText);
   4795         utext_setNativeIndex(resultText, 0);
   4796         utext_setNativeIndex(&text1, 0);
   4797         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
   4798 
   4799         resultText = uregex_getUText(re, &bufferText, &status);
   4800         REGEX_CHECK_STATUS;
   4801         REGEX_ASSERT(resultText == &bufferText);
   4802         utext_setNativeIndex(resultText, 0);
   4803         utext_setNativeIndex(&text1, 0);
   4804         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
   4805 
   4806         /* Then set a UChar * */
   4807         uregex_setText(re, text2Chars, 7, &status);
   4808         resultText = uregex_getUText(re, &bufferText, &status);
   4809         REGEX_CHECK_STATUS;
   4810         REGEX_ASSERT(resultText == &bufferText);
   4811         utext_setNativeIndex(resultText, 0);
   4812         utext_setNativeIndex(&text2, 0);
   4813         REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
   4814 
   4815         uregex_close(re);
   4816         utext_close(&text1);
   4817         utext_close(&text2);
   4818     }
   4819 
   4820     /*
   4821      *  group()
   4822      */
   4823     {
   4824         UChar    text1[80];
   4825         UText   *actual;
   4826         UBool    result;
   4827         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
   4828 
   4829         status = U_ZERO_ERROR;
   4830         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   4831         REGEX_CHECK_STATUS;
   4832 
   4833         uregex_setText(re, text1, -1, &status);
   4834         result = uregex_find(re, 0, &status);
   4835         REGEX_ASSERT(result==TRUE);
   4836 
   4837         /*  Capture Group 0, the full match.  Should succeed.  */
   4838         status = U_ZERO_ERROR;
   4839         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
   4840         REGEX_CHECK_STATUS;
   4841         REGEX_ASSERT(actual == &bufferText);
   4842         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
   4843 
   4844         /*  Capture group #1.  Should succeed. */
   4845         status = U_ZERO_ERROR;
   4846         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
   4847         REGEX_CHECK_STATUS;
   4848         REGEX_ASSERT(actual == &bufferText);
   4849         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
   4850 
   4851         /*  Capture group out of range.  Error. */
   4852         status = U_ZERO_ERROR;
   4853         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
   4854         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   4855         REGEX_ASSERT(actual == &bufferText);
   4856 
   4857         uregex_close(re);
   4858 
   4859     }
   4860 
   4861     /*
   4862      *  replaceFirst()
   4863      */
   4864     {
   4865         UChar    text1[80];
   4866         UChar    text2[80];
   4867         UText    replText = UTEXT_INITIALIZER;
   4868         UText   *result;
   4869 
   4870         status = U_ZERO_ERROR;
   4871         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   4872         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   4873         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   4874 
   4875         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   4876         REGEX_CHECK_STATUS;
   4877 
   4878         /*  Normal case, with match */
   4879         uregex_setText(re, text1, -1, &status);
   4880         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4881         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4882         REGEX_CHECK_STATUS;
   4883         REGEX_ASSERT(result == &bufferText);
   4884         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   4885 
   4886         /* No match.  Text should copy to output with no changes.  */
   4887         uregex_setText(re, text2, -1, &status);
   4888         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4889         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4890         REGEX_CHECK_STATUS;
   4891         REGEX_ASSERT(result == &bufferText);
   4892         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   4893 
   4894         /* Unicode escapes */
   4895         uregex_setText(re, text1, -1, &status);
   4896         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
   4897         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4898         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4899         REGEX_CHECK_STATUS;
   4900         REGEX_ASSERT(result == &bufferText);
   4901         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   4902 
   4903         uregex_close(re);
   4904         utext_close(&replText);
   4905     }
   4906 
   4907 
   4908     /*
   4909      *  replaceAll()
   4910      */
   4911     {
   4912         UChar    text1[80];
   4913         UChar    text2[80];
   4914         UText    replText = UTEXT_INITIALIZER;
   4915         UText   *result;
   4916 
   4917         status = U_ZERO_ERROR;
   4918         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   4919         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   4920         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   4921 
   4922         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   4923         REGEX_CHECK_STATUS;
   4924 
   4925         /*  Normal case, with match */
   4926         uregex_setText(re, text1, -1, &status);
   4927         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4928         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   4929         REGEX_CHECK_STATUS;
   4930         REGEX_ASSERT(result == &bufferText);
   4931         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   4932 
   4933         /* No match.  Text should copy to output with no changes.  */
   4934         uregex_setText(re, text2, -1, &status);
   4935         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4936         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   4937         REGEX_CHECK_STATUS;
   4938         REGEX_ASSERT(result == &bufferText);
   4939         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   4940 
   4941         uregex_close(re);
   4942         utext_close(&replText);
   4943     }
   4944 
   4945 
   4946     /*
   4947      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   4948      *   so we don't need to test it here.
   4949      */
   4950 
   4951     utext_close(&bufferText);
   4952     utext_close(&patternText);
   4953 }
   4954 
   4955 //--------------------------------------------------------------
   4956 //
   4957 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   4958 //
   4959 //---------------------------------------------------------------
   4960 void RegexTest::Bug7651() {
   4961     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   4962     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   4963     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   4964     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   4965     UnicodeString s("#ff @abcd This is test");
   4966     RegexPattern  *REPattern = NULL;
   4967     RegexMatcher  *REMatcher = NULL;
   4968     UErrorCode status = U_ZERO_ERROR;
   4969     UParseError pe;
   4970 
   4971     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   4972     REGEX_CHECK_STATUS;
   4973     REMatcher = REPattern->matcher(s, status);
   4974     REGEX_CHECK_STATUS;
   4975     REGEX_ASSERT(REMatcher->find());
   4976     REGEX_ASSERT(REMatcher->start(status) == 0);
   4977     delete REPattern;
   4978     delete REMatcher;
   4979     status = U_ZERO_ERROR;
   4980 
   4981     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   4982     REGEX_CHECK_STATUS;
   4983     REMatcher = REPattern->matcher(s, status);
   4984     REGEX_CHECK_STATUS;
   4985     REGEX_ASSERT(REMatcher->find());
   4986     REGEX_ASSERT(REMatcher->start(status) == 0);
   4987     delete REPattern;
   4988     delete REMatcher;
   4989     status = U_ZERO_ERROR;
   4990  }
   4991 
   4992 void RegexTest::Bug7740() {
   4993     UErrorCode status = U_ZERO_ERROR;
   4994     UnicodeString pattern = "(a)";
   4995     UnicodeString text = "abcdef";
   4996     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   4997     REGEX_CHECK_STATUS;
   4998     REGEX_ASSERT(m->lookingAt(status));
   4999     REGEX_CHECK_STATUS;
   5000     status = U_ILLEGAL_ARGUMENT_ERROR;
   5001     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5002     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5003     REGEX_ASSERT(s == "");
   5004     delete m;
   5005 }
   5006 
   5007 
   5008 
   5009 
   5010 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5011 
   5012