Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2012, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 /*
     14      NOTE!!
     15 
     16      PLEASE be careful about ASCII assumptions in this test.
     17      This test is one of the worst repeat offenders.
     18      If you have questions, contact someone on the ICU PMC
     19      who has access to an EBCDIC system.
     20 
     21  */
     22 
     23 #include "intltest.h"
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     25 
     26 #include "unicode/regex.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/ucnv.h"
     29 #include "unicode/uniset.h"
     30 #include "unicode/ustring.h"
     31 #include "regextst.h"
     32 #include "uvector.h"
     33 #include "util.h"
     34 #include <stdlib.h>
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include "cstring.h"
     38 #include "uinvchar.h"
     39 
     40 #define SUPPORT_MUTATING_INPUT_STRING   0
     41 
     42 //---------------------------------------------------------------------------
     43 //
     44 //  Test class boilerplate
     45 //
     46 //---------------------------------------------------------------------------
// Constructor.  The body is intentionally empty: nothing is initialized here.
RegexTest::RegexTest()
{
}
     50 
     51 
// Destructor.  The body is intentionally empty: nothing is released here.
RegexTest::~RegexTest()
{
}
     55 
     56 
     57 
     58 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     59 {
     60     if (exec) logln("TestSuite RegexTest: ");
     61     switch (index) {
     62 
     63         case 0: name = "Basic";
     64             if (exec) Basic();
     65             break;
     66         case 1: name = "API_Match";
     67             if (exec) API_Match();
     68             break;
     69         case 2: name = "API_Replace";
     70             if (exec) API_Replace();
     71             break;
     72         case 3: name = "API_Pattern";
     73             if (exec) API_Pattern();
     74             break;
     75         case 4:
     76 #if !UCONFIG_NO_FILE_IO
     77             name = "Extended";
     78             if (exec) Extended();
     79 #else
     80             name = "skip";
     81 #endif
     82             break;
     83         case 5: name = "Errors";
     84             if (exec) Errors();
     85             break;
     86         case 6: name = "PerlTests";
     87             if (exec) PerlTests();
     88             break;
     89         case 7: name = "Callbacks";
     90             if (exec) Callbacks();
     91             break;
     92         case 8: name = "FindProgressCallbacks";
     93             if (exec) FindProgressCallbacks();
     94             break;
     95         case 9: name = "Bug 6149";
     96              if (exec) Bug6149();
     97              break;
     98         case 10: name = "UTextBasic";
     99           if (exec) UTextBasic();
    100           break;
    101         case 11: name = "API_Match_UTF8";
    102           if (exec) API_Match_UTF8();
    103           break;
    104         case 12: name = "API_Replace_UTF8";
    105           if (exec) API_Replace_UTF8();
    106           break;
    107         case 13: name = "API_Pattern_UTF8";
    108           if (exec) API_Pattern_UTF8();
    109           break;
    110         case 14: name = "PerlTestsUTF8";
    111           if (exec) PerlTestsUTF8();
    112           break;
    113         case 15: name = "PreAllocatedUTextCAPI";
    114           if (exec) PreAllocatedUTextCAPI();
    115           break;
    116         case 16: name = "Bug 7651";
    117              if (exec) Bug7651();
    118              break;
    119         case 17: name = "Bug 7740";
    120             if (exec) Bug7740();
    121             break;
    122         case 18: name = "Bug 8479";
    123             if (exec) Bug8479();
    124             break;
    125         case 19: name = "Bug 7029";
    126             if (exec) Bug7029();
    127             break;
    128         case 20: name = "CheckInvBufSize";
    129             if (exec) CheckInvBufSize();
    130             break;
    131         case 21: name = "Bug 9283";
    132             if (exec) Bug9283();
    133             break;
    134 
    135         default: name = "";
    136             break; //needed to end loop
    137     }
    138 }
    139 
    140 
    141 
    142 /**
    143  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    144  * into ASCII.
    145  * @see utext_openUTF8
    146  */
    147 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    148 
    149 //---------------------------------------------------------------------------
    150 //
    151 //   Error Checking / Reporting macros used in all of the tests.
    152 //
    153 //---------------------------------------------------------------------------
    154 
    155 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    156   int64_t oldIndex = utext_getNativeIndex(text);
    157   utext_setNativeIndex(text, 0);
    158   char *bufPtr = buf;
    159   UChar32 c = utext_next32From(text, 0);
    160   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    161     if (0x000020<=c && c<0x00007e) {
    162       *bufPtr = c;
    163     } else {
    164 #if 0
    165       sprintf(bufPtr,"U+%04X", c);
    166       bufPtr+= strlen(bufPtr)-1;
    167 #else
    168       *bufPtr = '%';
    169 #endif
    170     }
    171     bufPtr++;
    172     c = UTEXT_NEXT32(text);
    173   }
    174   *bufPtr = 0;
    175 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    176   char *ebuf = (char*)malloc(bufLen);
    177   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    178   uprv_strncpy(buf, ebuf, bufLen);
    179   free((void*)ebuf);
    180 #endif
    181   utext_setNativeIndex(text, oldIndex);
    182 }
    183 
    184 
    185 static char ASSERT_BUF[1024];
    186 
    187 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    188   if(message.length()==0) {
    189     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    190   } else {
    191     UnicodeString buf;
    192     IntlTest::prettify(message,buf);
    193     if(buf.length()==0) {
    194       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    195     } else {
    196       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    197       if(ASSERT_BUF[0]==0) {
    198         ASSERT_BUF[0]=0;
    199         for(int32_t i=0;i<buf.length();i++) {
    200           UChar ch = buf[i];
    201           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    202         }
    203       }
    204     }
    205   }
    206   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    207   return ASSERT_BUF;
    208 }
    209 
    210 
// Logs a printable rendering of a UText (via utextToPrintable, truncated to a
// 200-char local buffer) together with the call site and the argument's spelling.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the variable named `status` in the calling scope holds a failure code,
// reports it with dataerrln() and returns from the enclosing function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure with errln() when expr compares equal to FALSE.
// Execution continues afterwards; this macro does not return.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Declares a fresh local `status` (hiding any outer one), evaluates expr, and
// reports with dataerrln() unless that local status ended up equal to errcode.
// expr is therefore expected to refer to `status`.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS but also reports the caller-supplied line number,
// uses errln(), prints the numeric status value, and does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT but also reports the caller-supplied line number and
// returns from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports a failure with errln() unless ustr==inv; the message shows ustr as
// rendered by extractToAssertBuf() followed by inv.  Does not return.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
    229 
    230 
    231 static UBool testUTextEqual(UText *uta, UText *utb) {
    232     UChar32 ca = 0;
    233     UChar32 cb = 0;
    234     utext_setNativeIndex(uta, 0);
    235     utext_setNativeIndex(utb, 0);
    236     do {
    237         ca = utext_next32(uta);
    238         cb = utext_next32(utb);
    239         if (ca != cb) {
    240             break;
    241         }
    242     } while (ca != U_SENTINEL);
    243     return ca == cb;
    244 }
    245 
    246 
    247 /**
    248  * @param expected expected text in UTF-8 (not platform) codepage
    249  */
    250 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    251     UErrorCode status = U_ZERO_ERROR;
    252     UText expectedText = UTEXT_INITIALIZER;
    253     utext_openUTF8(&expectedText, expected, -1, &status);
    254     if(U_FAILURE(status)) {
    255       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    256       return;
    257     }
    258     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    259       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    260       return;
    261     }
    262     utext_setNativeIndex(actual, 0);
    263     if (!testUTextEqual(&expectedText, actual)) {
    264         char buf[201 /*21*/];
    265         char expectedBuf[201];
    266         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    267         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    268         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    269     }
    270     utext_close(&expectedText);
    271 }
    272 /**
    273  * @param expected invariant (platform local text) input
    274  */
    275 
    276 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    277     UErrorCode status = U_ZERO_ERROR;
    278     UText expectedText = UTEXT_INITIALIZER;
    279     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    280     if(U_FAILURE(status)) {
    281       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    282       return;
    283     }
    284     utext_setNativeIndex(actual, 0);
    285     if (!testUTextEqual(&expectedText, actual)) {
    286         char buf[201 /*21*/];
    287         char expectedBuf[201];
    288         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    289         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    290         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    291     }
    292     utext_close(&expectedText);
    293 }
    294 
/**
 * Assumes utf-8 input.
 * Forwards to assertUText(), adding the file and line of the call site.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Assumes Invariant input.
 * Forwards to assertUTextInvariant(), adding the file and line of the call site.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    303 
    304 /**
    305  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    306  * passed into utext_openUTF8. An error will be given if
    307  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    308  */
    309 
    310 #define INV_BUFSIZ 2048 /* increase this if too small */
    311 
    312 static int64_t inv_next=0;
    313 
    314 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    315 static char inv_buf[INV_BUFSIZ];
    316 #endif
    317 
    318 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    319   if(length==-1) length=strlen(inv);
    320 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    321   inv_next+=length;
    322   return utext_openUTF8(ut, inv, length, status);
    323 #else
    324   if(inv_next+length+1>INV_BUFSIZ) {
    325     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    326             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    327     *status = U_MEMORY_ALLOCATION_ERROR;
    328     return NULL;
    329   }
    330 
    331   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    332   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    333   inv_next+=length;
    334 
    335 #if 0
    336   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    337 #endif
    338 
    339   return utext_openUTF8(ut, (const char*)buf, length, status);
    340 #endif
    341 }
    342 
    343 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the LookingAt() and  Match() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8(), both receiving __LINE__ so that
//          failures can be traced back to the call site.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    359 
    360 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    361     const UnicodeString pattern(pat, -1, US_INV);
    362     const UnicodeString inputText(text, -1, US_INV);
    363     UErrorCode          status  = U_ZERO_ERROR;
    364     UParseError         pe;
    365     RegexPattern        *REPattern = NULL;
    366     RegexMatcher        *REMatcher = NULL;
    367     UBool               retVal     = TRUE;
    368 
    369     UnicodeString patString(pat, -1, US_INV);
    370     REPattern = RegexPattern::compile(patString, 0, pe, status);
    371     if (U_FAILURE(status)) {
    372         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    373             line, u_errorName(status));
    374         return FALSE;
    375     }
    376     if (line==376) { RegexPatternDump(REPattern);}
    377 
    378     UnicodeString inputString(inputText);
    379     UnicodeString unEscapedInput = inputString.unescape();
    380     REMatcher = REPattern->matcher(unEscapedInput, status);
    381     if (U_FAILURE(status)) {
    382         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    383             line, u_errorName(status));
    384         return FALSE;
    385     }
    386 
    387     UBool actualmatch;
    388     actualmatch = REMatcher->lookingAt(status);
    389     if (U_FAILURE(status)) {
    390         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    391             line, u_errorName(status));
    392         retVal =  FALSE;
    393     }
    394     if (actualmatch != looking) {
    395         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    396         retVal = FALSE;
    397     }
    398 
    399     status = U_ZERO_ERROR;
    400     actualmatch = REMatcher->matches(status);
    401     if (U_FAILURE(status)) {
    402         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    403             line, u_errorName(status));
    404         retVal = FALSE;
    405     }
    406     if (actualmatch != match) {
    407         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    408         retVal = FALSE;
    409     }
    410 
    411     if (retVal == FALSE) {
    412         RegexPatternDump(REPattern);
    413     }
    414 
    415     delete REPattern;
    416     delete REMatcher;
    417     return retVal;
    418 }
    419 
    420 
    421 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    422     UText               pattern    = UTEXT_INITIALIZER;
    423     int32_t             inputUTF8Length;
    424     char                *textChars = NULL;
    425     UText               inputText  = UTEXT_INITIALIZER;
    426     UErrorCode          status     = U_ZERO_ERROR;
    427     UParseError         pe;
    428     RegexPattern        *REPattern = NULL;
    429     RegexMatcher        *REMatcher = NULL;
    430     UBool               retVal     = TRUE;
    431 
    432     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    433     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    434     if (U_FAILURE(status)) {
    435         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    436             line, u_errorName(status));
    437         return FALSE;
    438     }
    439 
    440     UnicodeString inputString(text, -1, US_INV);
    441     UnicodeString unEscapedInput = inputString.unescape();
    442     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    443     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    444 
    445     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    446     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    447         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    448         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    449         return TRUE; // not a failure of the Regex engine
    450     }
    451     status = U_ZERO_ERROR; // buffer overflow
    452     textChars = new char[inputUTF8Length+1];
    453     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    454     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    455 
    456     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    457     if (U_FAILURE(status)) {
    458         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    459             line, u_errorName(status));
    460         return FALSE;
    461     }
    462 
    463     UBool actualmatch;
    464     actualmatch = REMatcher->lookingAt(status);
    465     if (U_FAILURE(status)) {
    466         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    467             line, u_errorName(status));
    468         retVal =  FALSE;
    469     }
    470     if (actualmatch != looking) {
    471         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    472         retVal = FALSE;
    473     }
    474 
    475     status = U_ZERO_ERROR;
    476     actualmatch = REMatcher->matches(status);
    477     if (U_FAILURE(status)) {
    478         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    479             line, u_errorName(status));
    480         retVal = FALSE;
    481     }
    482     if (actualmatch != match) {
    483         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    484         retVal = FALSE;
    485     }
    486 
    487     if (retVal == FALSE) {
    488         RegexPatternDump(REPattern);
    489     }
    490 
    491     delete REPattern;
    492     delete REMatcher;
    493     utext_close(&inputText);
    494     utext_close(&pattern);
    495     delete[] textChars;
    496     return retVal;
    497 }
    498 
    499 
    500 
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       The macro forwards its arguments to regex_err() and appends __LINE__.
//       Its expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    511 
    512 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    513                           UErrorCode expectedStatus, int32_t line) {
    514     UnicodeString       pattern(pat);
    515 
    516     UErrorCode          status         = U_ZERO_ERROR;
    517     UParseError         pe;
    518     RegexPattern        *callerPattern = NULL;
    519 
    520     //
    521     //  Compile the caller's pattern
    522     //
    523     UnicodeString patString(pat);
    524     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    525     if (status != expectedStatus) {
    526         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    527     } else {
    528         if (status != U_ZERO_ERROR) {
    529             if (pe.line != errLine || pe.offset != errCol) {
    530                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    531                     line, errLine, errCol, pe.line, pe.offset);
    532             }
    533         }
    534     }
    535 
    536     delete callerPattern;
    537 
    538     //
    539     //  Compile again, using a UTF-8-based UText
    540     //
    541     UText patternText = UTEXT_INITIALIZER;
    542     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    543     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    544     if (status != expectedStatus) {
    545         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    546     } else {
    547         if (status != U_ZERO_ERROR) {
    548             if (pe.line != errLine || pe.offset != errCol) {
    549                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    550                     line, errLine, errCol, pe.line, pe.offset);
    551             }
    552         }
    553     }
    554 
    555     delete callerPattern;
    556     utext_close(&patternText);
    557 }
    558 
    559 
    560 
//---------------------------------------------------------------------------
//
//      Basic      Check for basic functionality of regex pattern matching.
//                 Avoid the use of REGEX_FIND test macro, which has
//                 substantial dependencies on basic Regex functionality.
//
//                 Every check below has the form
//                     REGEX_TESTLM(pattern, input, lookingAt expected, matches expected)
//                 and is therefore run both on UnicodeString and on UTF-8
//                 UText input.
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
// (disabled scratch area; when enabled it exits the process after running)
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        RegexPatternDump(pattern);
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
    714 
    715 
    716 //---------------------------------------------------------------------------
    717 //
    718 //    UTextBasic   Check for quirks that are specific to the UText
    719 //                 implementation.
    720 //
    721 //---------------------------------------------------------------------------
    722 void RegexTest::UTextBasic() {
    723     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    724     UErrorCode status = U_ZERO_ERROR;
    725     UText pattern = UTEXT_INITIALIZER;
    726     utext_openUTF8(&pattern, str_abc, -1, &status);
    727     RegexMatcher matcher(&pattern, 0, status);
    728     REGEX_CHECK_STATUS;
    729 
    730     UText input = UTEXT_INITIALIZER;
    731     utext_openUTF8(&input, str_abc, -1, &status);
    732     REGEX_CHECK_STATUS;
    733     matcher.reset(&input);
    734     REGEX_CHECK_STATUS;
    735     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    736 
    737     matcher.reset(matcher.inputText());
    738     REGEX_CHECK_STATUS;
    739     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    740 
    741     utext_close(&pattern);
    742     utext_close(&input);
    743 }
    744 
    745 
    746 //---------------------------------------------------------------------------
    747 //
    748 //      API_Match   Test that the API for class RegexMatcher
    749 //                  is present and nominally working, but excluding functions
    750 //                  implementing replace operations.
    751 //
        //                  Sections, in order:  pattern compilation and matcher
        //                  creation/reset;  reset(pos), matches(pos), lookingAt(pos);
        //                  capture groups (start/end/group);  find();  find() with \G;
        //                  find() with zero length matches;  matchers with no input;
        //                  regions and bounds flags;  hitEnd()/requireEnd();
        //                  time limits;  stack limits.
        //
        //                  Each section after the first runs in its own brace scope
        //                  and declares its own status (and, where needed, flags / pe),
        //                  shadowing the function-level variables declared at the top.
        //
    752 //---------------------------------------------------------------------------
    753 void RegexTest::API_Match() {
    754     UParseError         pe;
    755     UErrorCode          status=U_ZERO_ERROR;
    756     int32_t             flags = 0;
    757 
    758     //
    759     // Debug - slide failing test cases early
    760     //         (this block is compiled out by the #if 0)
    761 #if 0
    762     {
    763     }
    764     return;
    765 #endif
    766 
    767     //
    768     // Simple pattern compilation
    769     //
    770     {
    771         UnicodeString       re("abc");
    772         RegexPattern        *pat2;
    773         pat2 = RegexPattern::compile(re, flags, pe, status);
    774         REGEX_CHECK_STATUS;
    775 
    776         UnicodeString inStr1 = "abcdef this is a test";
    777         UnicodeString instr2 = "not abc";
    778         UnicodeString empty  = "";
    779 
    780 
    781         //
    782         // Matcher creation and reset.
    783         //
    784         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    785         REGEX_CHECK_STATUS;
    786         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    787         REGEX_ASSERT(m1->input() == inStr1);
    788         m1->reset(instr2);
    789         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    790         REGEX_ASSERT(m1->input() == instr2);
    791         m1->reset(inStr1);
    792         REGEX_ASSERT(m1->input() == inStr1);
    793         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    794         m1->reset(empty);
    795         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    796         REGEX_ASSERT(m1->input() == empty);
    797         REGEX_ASSERT(&m1->pattern() == pat2);
    798 
    799         //
    800         //  reset(pos, status)
    801         //     Positions 0 .. length are expected to be accepted;
    802         //     -1 and length+1 are expected to set U_INDEX_OUTOFBOUNDS_ERROR.
    803         m1->reset(inStr1);
    804         m1->reset(4, status);
    805         REGEX_CHECK_STATUS;
    806         REGEX_ASSERT(m1->input() == inStr1);
    807         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    808 
    809         m1->reset(-1, status);
    810         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    811         status = U_ZERO_ERROR;
    812 
    813         m1->reset(0, status);
    814         REGEX_CHECK_STATUS;
    815         status = U_ZERO_ERROR;
    816 
    817         int32_t len = m1->input().length();
    818         m1->reset(len-1, status);
    819         REGEX_CHECK_STATUS;
    820         status = U_ZERO_ERROR;
    821 
    822         m1->reset(len, status);
    823         REGEX_CHECK_STATUS;
    824         status = U_ZERO_ERROR;
    825 
    826         m1->reset(len+1, status);
    827         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    828         status = U_ZERO_ERROR;
    829 
    830         //
    831         // matches(pos, status)
    832         //
    833         m1->reset(instr2);  // "not abc"
    834         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    835         m1->reset();
    836         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    837         m1->reset();
    838         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    839         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    840         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    841         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    842 
    843         // Match() at end of string should fail, but should not
    844         //  be an error.
    845         status = U_ZERO_ERROR;
    846         len = m1->input().length();
    847         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    848         REGEX_CHECK_STATUS;
    849 
    850         // Match beyond end of string should fail with an error.
    851         status = U_ZERO_ERROR;
    852         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    853         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    854 
    855         // Successful match at end of string.
    856         {
    857             status = U_ZERO_ERROR;
    858             RegexMatcher m("A?", 0, status);  // will match zero length string.
    859             REGEX_CHECK_STATUS;
    860             m.reset(inStr1);
    861             len = inStr1.length();
    862             REGEX_ASSERT(m.matches(len, status) == TRUE);
    863             REGEX_CHECK_STATUS;
    864             m.reset(empty);
    865             REGEX_ASSERT(m.matches(0, status) == TRUE);
    866             REGEX_CHECK_STATUS;
    867         }
    868 
    869 
    870         //
    871         // lookingAt(pos, status)
    872         //
    873         status = U_ZERO_ERROR;
    874         m1->reset(instr2);  // "not abc"
    875         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    876         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    877         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    878         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    879         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    880         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    881         status = U_ZERO_ERROR;
    882         len = m1->input().length();
    883         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    884         REGEX_CHECK_STATUS;
    885         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    887 
    888         delete m1;
    889         delete pat2;
    890     }
    891 
    892 
    893     //
    894     // Capture Group.
    895     //     RegexMatcher::start();
    896     //     RegexMatcher::end();
    897     //     RegexMatcher::groupCount();
    898     //
    899     {
    900         int32_t             flags=0;                 // these three shadow the function-level variables
    901         UParseError         pe;
    902         UErrorCode          status=U_ZERO_ERROR;
    903 
    904         UnicodeString       re("01(23(45)67)(.*)");
    905         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    906         REGEX_CHECK_STATUS;
    907         UnicodeString data = "0123456789";
    908 
    909         RegexMatcher *matcher = pat->matcher(data, status);
    910         REGEX_CHECK_STATUS;
    911         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected start / end offsets for groups 0..3 of the pattern above
                //   when applied to "0123456789".
    912         static const int32_t matchStarts[] = {0,  2, 4, 8};
    913         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    914         int32_t i;
    915         for (i=0; i<4; i++) {
    916             int32_t actualStart = matcher->start(i, status);
    917             REGEX_CHECK_STATUS;
    918             if (actualStart != matchStarts[i]) {
    919                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    920                     __LINE__, i, matchStarts[i], actualStart);
    921             }
    922             int32_t actualEnd = matcher->end(i, status);
    923             REGEX_CHECK_STATUS;
    924             if (actualEnd != matchEnds[i]) {
    925                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    926                     __LINE__, i, matchEnds[i], actualEnd);
    927             }
    928         }
    929 
    930         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    931         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    932 
                // Group numbers outside 0..3 are expected to fail with an index error;
                //   after reset() there is no current match, so start() is expected
                //   to fail with U_REGEX_INVALID_STATE.
    933         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    934         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    935         matcher->reset();
    936         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    937 
    938         matcher->lookingAt(status);
    939         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    940         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    941         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    942         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    943         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    944         REGEX_CHECK_STATUS;
    945         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    946         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    947         matcher->reset();
    948         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    949 
    950         delete matcher;
    951         delete pat;
    952 
    953     }
    954 
    955     //
    956     //  find
    957     //
    958     {
    959         int32_t             flags=0;
    960         UParseError         pe;
    961         UErrorCode          status=U_ZERO_ERROR;
    962 
    963         UnicodeString       re("abc");
    964         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    965         REGEX_CHECK_STATUS;
    966         UnicodeString data = ".abc..abc...abc..";
    967         //                    012345678901234567
    968 
    969         RegexMatcher *matcher = pat->matcher(data, status);
    970         REGEX_CHECK_STATUS;
                // Successive find() calls walk through the three occurrences of "abc",
                //   then keep returning FALSE.
    971         REGEX_ASSERT(matcher->find());
    972         REGEX_ASSERT(matcher->start(status) == 1);
    973         REGEX_ASSERT(matcher->find());
    974         REGEX_ASSERT(matcher->start(status) == 6);
    975         REGEX_ASSERT(matcher->find());
    976         REGEX_ASSERT(matcher->start(status) == 12);
    977         REGEX_ASSERT(matcher->find() == FALSE);
    978         REGEX_ASSERT(matcher->find() == FALSE);
    979 
    980         matcher->reset();
    981         REGEX_ASSERT(matcher->find());
    982         REGEX_ASSERT(matcher->start(status) == 1);
    983 
                // find(pos, status): search begins at the given position.
    984         REGEX_ASSERT(matcher->find(0, status));
    985         REGEX_ASSERT(matcher->start(status) == 1);
    986         REGEX_ASSERT(matcher->find(1, status));
    987         REGEX_ASSERT(matcher->start(status) == 1);
    988         REGEX_ASSERT(matcher->find(2, status));
    989         REGEX_ASSERT(matcher->start(status) == 6);
    990         REGEX_ASSERT(matcher->find(12, status));
    991         REGEX_ASSERT(matcher->start(status) == 12);
    992         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    993         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    994         REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 == data.length(); fails, but is not out of bounds
    995         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
    996 
    997         status = U_ZERO_ERROR;
    998         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    999         status = U_ZERO_ERROR;
   1000         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1001 
   1002         REGEX_ASSERT(matcher->groupCount() == 0);
   1003 
   1004         delete matcher;
   1005         delete pat;
   1006     }
   1007 
   1008 
   1009     //
   1010     //  find, with \G in pattern (true if at the end of a previous match).
   1011     //
   1012     {
   1013         int32_t             flags=0;
   1014         UParseError         pe;
   1015         UErrorCode          status=U_ZERO_ERROR;
   1016 
   1017         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1018         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1019         REGEX_CHECK_STATUS;
   1020         UnicodeString data = ".abcabc.abc..";
   1021         //                    012345678901234567
   1022 
   1023         RegexMatcher *matcher = pat->matcher(data, status);
   1024         REGEX_CHECK_STATUS;
                // First find(): group 1 (the \G alternative) is expected to be unset
                //   (start == -1) and group 2 to have matched at offset 1.
   1025         REGEX_ASSERT(matcher->find());
   1026         REGEX_ASSERT(matcher->start(status) == 0);
   1027         REGEX_ASSERT(matcher->start(1, status) == -1);
   1028         REGEX_ASSERT(matcher->start(2, status) == 1);
   1029 
                // Second find(): it starts at offset 4, where the previous match
                //   ended, so the \G alternative (group 1) is the one expected to match.
   1030         REGEX_ASSERT(matcher->find());
   1031         REGEX_ASSERT(matcher->start(status) == 4);
   1032         REGEX_ASSERT(matcher->start(1, status) == 4);
   1033         REGEX_ASSERT(matcher->start(2, status) == -1);
   1034         REGEX_CHECK_STATUS;
   1035 
   1036         delete matcher;
   1037         delete pat;
   1038     }
   1039 
   1040     //
   1041     //   find with zero length matches, match position should bump ahead
   1042     //     to prevent loops.
   1043     //
   1044     {
   1045         int32_t                 i;
   1046         UErrorCode          status=U_ZERO_ERROR;
   1047         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1048                                                       //   using an always-true look-ahead.
   1049         REGEX_CHECK_STATUS;
   1050         UnicodeString s("    ");
   1051         m.reset(s);
   1052         for (i=0; ; i++) {
   1053             if (m.find() == FALSE) {
   1054                 break;
   1055             }
   1056             REGEX_ASSERT(m.start(status) == i);
   1057             REGEX_ASSERT(m.end(status) == i);
   1058         }
   1059         REGEX_ASSERT(i==5);      // one empty match at each of offsets 0..4 of the four-space string
   1060 
   1061         // Check that the bump goes over surrogate pairs OK
   1062         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1063         s = s.unescape();
   1064         m.reset(s);
   1065         for (i=0; ; i+=2) {
   1066             if (m.find() == FALSE) {
   1067                 break;
   1068             }
   1069             REGEX_ASSERT(m.start(status) == i);
   1070             REGEX_ASSERT(m.end(status) == i);
   1071         }
   1072         REGEX_ASSERT(i==10);     // empty matches expected at offsets 0, 2, 4, 6 and 8 only
   1073     }
   1074     {
   1075         // find() loop breaking test.
   1076         //        with pattern of /.?/, should see a series of one char matches, then a single
   1077         //        match of zero length at the end of the input string.
   1078         int32_t                 i;
   1079         UErrorCode          status=U_ZERO_ERROR;
   1080         RegexMatcher        m(".?", 0, status);
   1081         REGEX_CHECK_STATUS;
   1082         UnicodeString s("    ");
   1083         m.reset(s);
   1084         for (i=0; ; i++) {
   1085             if (m.find() == FALSE) {
   1086                 break;
   1087             }
   1088             REGEX_ASSERT(m.start(status) == i);
   1089             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // the final match (i==4) is the empty one at end of input
   1090         }
   1091         REGEX_ASSERT(i==5);
   1092     }
   1093 
   1094 
   1095     //
   1096     // Matchers with no input string behave as if they had an empty input string.
   1097     //
   1098 
   1099     {
   1100         UErrorCode status = U_ZERO_ERROR;
   1101         RegexMatcher  m(".?", 0, status);
   1102         REGEX_CHECK_STATUS;
   1103         REGEX_ASSERT(m.find());
   1104         REGEX_ASSERT(m.start(status) == 0);
   1105         REGEX_ASSERT(m.input() == "");
   1106     }
   1107     {
   1108         UErrorCode status = U_ZERO_ERROR;
                // NOTE(review): p is dereferenced on the next line before status is
                //   checked; presumably compile() cannot fail for "." - confirm, or
                //   check status between the two calls.
   1109         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1110         RegexMatcher  *m = p->matcher(status);
   1111         REGEX_CHECK_STATUS;
   1112 
   1113         REGEX_ASSERT(m->find() == FALSE);
   1114         REGEX_ASSERT(m->input() == "");
   1115         delete m;
   1116         delete p;
   1117     }
   1118 
   1119     //
   1120     // Regions
   1121     //
   1122     {
   1123         UErrorCode status = U_ZERO_ERROR;
   1124         UnicodeString testString("This is test data");
   1125         RegexMatcher m(".*", testString,  0, status);
   1126         REGEX_CHECK_STATUS;
                // Defaults checked here: the region spans the whole input, bounds are
                //   non-transparent and anchoring.
   1127         REGEX_ASSERT(m.regionStart() == 0);
   1128         REGEX_ASSERT(m.regionEnd() == testString.length());
   1129         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1130         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1131 
   1132         m.region(2,4, status);
   1133         REGEX_CHECK_STATUS;
   1134         REGEX_ASSERT(m.matches(status));
   1135         REGEX_ASSERT(m.start(status)==2);
   1136         REGEX_ASSERT(m.end(status)==4);
   1137         REGEX_CHECK_STATUS;
   1138 
   1139         m.reset();                                   // reset() is expected to restore the region to the whole input
   1140         REGEX_ASSERT(m.regionStart() == 0);
   1141         REGEX_ASSERT(m.regionEnd() == testString.length());
   1142 
   1143         UnicodeString shorterString("short");
   1144         m.reset(shorterString);
   1145         REGEX_ASSERT(m.regionStart() == 0);
   1146         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1147 
                // The use...Bounds() setters and reset() return the matcher itself,
                //   and the bounds flags are expected to survive a reset().
   1148         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1149         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1150         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1151         REGEX_ASSERT(&m == &m.reset());
   1152         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1153 
   1154         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1155         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1156         REGEX_ASSERT(&m == &m.reset());
   1157         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1158 
   1159         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1160         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1161         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1162         REGEX_ASSERT(&m == &m.reset());
   1163         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1164 
   1165         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1166         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1167         REGEX_ASSERT(&m == &m.reset());
   1168         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1169 
   1170     }
   1171 
   1172     //
   1173     // hitEnd() and requireEnd()
   1174     //
   1175     {
   1176         UErrorCode status = U_ZERO_ERROR;
   1177         UnicodeString testString("aabb");
                // NOTE(review): in this section status is only checked after the
                //   assertions on each matcher, not right after construction.
   1178         RegexMatcher m1(".*", testString,  0, status);
   1179         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1180         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1181         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1182         REGEX_CHECK_STATUS;
   1183 
   1184         status = U_ZERO_ERROR;
   1185         RegexMatcher m2("a*", testString, 0, status);
   1186         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1187         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1188         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1189         REGEX_CHECK_STATUS;
   1190 
   1191         status = U_ZERO_ERROR;
   1192         RegexMatcher m3(".*$", testString, 0, status);
   1193         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1194         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1195         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1196         REGEX_CHECK_STATUS;
   1197     }
   1198 
   1199 
   1200     //
   1201     // Compilation error on reset with UChar *
   1202     //   These were a hazard that people were stumbling over with runtime errors.
   1203     //   Changed them to compiler errors by adding private methods that more closely
   1204     //   matched the incorrect use of the functions.
   1205     //   (The block below is compiled out; enable it by hand to verify the errors.)
   1206 #if 0
   1207     {
   1208         UErrorCode status = U_ZERO_ERROR;
   1209         UChar ucharString[20];
   1210         RegexMatcher m(".", 0, status);
   1211         m.reset(ucharString);  // should not compile.
   1212 
   1213         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1214         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1215 
   1216         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1217     }
   1218 #endif
   1219 
   1220     //
   1221     //  Time Outs.
   1222     //       Note:  These tests will need to be changed when the regexp engine is
   1223     //              able to detect and cut short the exponential time behavior on
   1224     //              this type of match.
   1225     //
   1226     {
   1227         UErrorCode status = U_ZERO_ERROR;
   1228         //    Enough 'a's in the string to cause the match to time out.
   1229         //       (Each additional 'a' doubles the time)
   1230         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1231         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1232         REGEX_CHECK_STATUS;
   1233         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1234         matcher.setTimeLimit(100, status);
   1235         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1236         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1237         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1238     }
   1239     {
   1240         UErrorCode status = U_ZERO_ERROR;
   1241         //   Few enough 'a's to slip in under the time limit.
   1242         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1243         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1244         REGEX_CHECK_STATUS;
   1245         matcher.setTimeLimit(100, status);
   1246         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1247         REGEX_CHECK_STATUS;
   1248     }
   1249 
   1250     //
   1251     //  Stack Limits
   1252     //
   1253     {
   1254         UErrorCode status = U_ZERO_ERROR;
   1255         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1256 
   1257         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1258         //   of the '+', and makes the stack frames larger.
   1259         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1260 
   1261         // With the default stack, this match should fail to run
   1262         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1263         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1264 
   1265         // With unlimited stack, it should run
   1266         status = U_ZERO_ERROR;
   1267         matcher.setStackLimit(0, status);
   1268         REGEX_CHECK_STATUS;
   1269         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1270         REGEX_CHECK_STATUS;
   1271         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1272 
   1273         // With a limited stack, the match should fail
   1274         status = U_ZERO_ERROR;
   1275         matcher.setStackLimit(10000, status);
   1276         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1277         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1278         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1279     }
   1280 
   1281         // A pattern that doesn't save state should work with
   1282         //   a minimal sized stack
   1283     {
   1284         UErrorCode status = U_ZERO_ERROR;
   1285         UnicodeString testString = "abc";
   1286         RegexMatcher matcher("abc", testString, 0, status);
   1287         REGEX_CHECK_STATUS;
   1288         matcher.setStackLimit(30, status);
   1289         REGEX_CHECK_STATUS;
   1290         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1291         REGEX_CHECK_STATUS;
   1292         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1293 
   1294         // Negative stack sizes should fail, leaving the previously set limit in place
   1295         status = U_ZERO_ERROR;
   1296         matcher.setStackLimit(1000, status);
   1297         REGEX_CHECK_STATUS;
   1298         matcher.setStackLimit(-1, status);
   1299         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1300         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1301     }
   1302 
   1303 }
   1304 
   1305 
   1306 
   1307 
   1308 
   1309 
   1310 //---------------------------------------------------------------------------
   1311 //
   1312 //      API_Replace        API test for class RegexMatcher, testing the
   1313 //                         Replace family of functions.
   1314 //
        //                         Covers replaceFirst() / replaceAll() on matching,
        //                         non-matching and empty input, empty replacement text,
        //                         capture group references ($1), escapes in replacement
        //                         text, and appendReplacement() / appendTail() (Bug 4057).
        //
        //                         The pattern / matcher pairs created at function scope
        //                         (pat/matcher, pat2/matcher2) are reused by later
        //                         sections and deleted at the very end.
        //
   1315 //---------------------------------------------------------------------------
   1316 void RegexTest::API_Replace() {
   1317     //
   1318     //  Replace
   1319     //
   1320     int32_t             flags=0;
   1321     UParseError         pe;
   1322     UErrorCode          status=U_ZERO_ERROR;
   1323 
   1324     UnicodeString       re("abc");
   1325     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1326     REGEX_CHECK_STATUS;
   1327     UnicodeString data = ".abc..abc...abc..";
   1328     //                    012345678901234567
            // NOTE(review): status is not checked after creating the matcher; the
            //   first check comes after replaceFirst() below - confirm this is intended.
   1329     RegexMatcher *matcher = pat->matcher(data, status);
   1330 
   1331     //
   1332     //  Plain vanilla matches.
   1333     //
   1334     UnicodeString  dest;
   1335     dest = matcher->replaceFirst("yz", status);
   1336     REGEX_CHECK_STATUS;
   1337     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1338 
   1339     dest = matcher->replaceAll("yz", status);
   1340     REGEX_CHECK_STATUS;
   1341     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1342 
   1343     //
   1344     //  Plain vanilla non-matches.
   1345     //     The result is expected to equal the input, unchanged.
   1346     UnicodeString d2 = ".abx..abx...abx..";
   1347     matcher->reset(d2);
   1348     dest = matcher->replaceFirst("yz", status);
   1349     REGEX_CHECK_STATUS;
   1350     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1351 
   1352     dest = matcher->replaceAll("yz", status);
   1353     REGEX_CHECK_STATUS;
   1354     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1355 
   1356     //
   1357     // Empty source string
   1358     //
   1359     UnicodeString d3 = "";
   1360     matcher->reset(d3);
   1361     dest = matcher->replaceFirst("yz", status);
   1362     REGEX_CHECK_STATUS;
   1363     REGEX_ASSERT(dest == "");
   1364 
   1365     dest = matcher->replaceAll("yz", status);
   1366     REGEX_CHECK_STATUS;
   1367     REGEX_ASSERT(dest == "");
   1368 
   1369     //
   1370     // Empty substitution string
   1371     //
   1372     matcher->reset(data);              // ".abc..abc...abc.."
   1373     dest = matcher->replaceFirst("", status);
   1374     REGEX_CHECK_STATUS;
   1375     REGEX_ASSERT(dest == "...abc...abc..");
   1376 
   1377     dest = matcher->replaceAll("", status);
   1378     REGEX_CHECK_STATUS;
   1379     REGEX_ASSERT(dest == "........");
   1380 
   1381     //
   1382     // match whole string
   1383     //
   1384     UnicodeString d4 = "abc";
   1385     matcher->reset(d4);
   1386     dest = matcher->replaceFirst("xyz", status);
   1387     REGEX_CHECK_STATUS;
   1388     REGEX_ASSERT(dest == "xyz");
   1389 
   1390     dest = matcher->replaceAll("xyz", status);
   1391     REGEX_CHECK_STATUS;
   1392     REGEX_ASSERT(dest == "xyz");
   1393 
   1394     //
   1395     // Capture Group, simple case
   1396     //
   1397     UnicodeString       re2("a(..)");
   1398     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1399     REGEX_CHECK_STATUS;
   1400     UnicodeString d5 = "abcdefg";
   1401     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1402     REGEX_CHECK_STATUS;
   1403     dest = matcher2->replaceFirst("$1$1", status);
   1404     REGEX_CHECK_STATUS;
   1405     REGEX_ASSERT(dest == "bcbcdefg");
   1406 
            // A backslash-escaped "$" in the replacement text is expected to come
            //   through as a literal "$"; the unescaped "$1" is the group reference.
   1407     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1408     REGEX_CHECK_STATUS;
   1409     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1410 
   1411     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1412     REGEX_CHECK_STATUS;
   1413     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1414 
            // "$" followed by U+1D7CF (a supplementary character, after unescape());
            //   the expected result shows it being substituted like group 1 ("bc").
   1415     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1416     replacement = replacement.unescape();
   1417     dest = matcher2->replaceFirst(replacement, status);
   1418     REGEX_CHECK_STATUS;
   1419     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1420 
   1421     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1422 
   1423 
   1424     //
   1425     // Replacement String with \u hex escapes
   1426     //    (the substitute strings are passed still escaped; no unescape() call here)
   1427     {
   1428         UnicodeString  src = "abc 1 abc 2 abc 3";
   1429         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1430         matcher->reset(src);
   1431         UnicodeString  result = matcher->replaceAll(substitute, status);
   1432         REGEX_CHECK_STATUS;
   1433         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1434     }
   1435     {
   1436         UnicodeString  src = "abc !";
   1437         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1438         matcher->reset(src);
   1439         UnicodeString  result = matcher->replaceAll(substitute, status);
   1440         REGEX_CHECK_STATUS;
   1441         UnicodeString expected = UnicodeString("--");
   1442         expected.append((UChar32)0x10000);
   1443         expected.append("-- !");
   1444         REGEX_ASSERT(result == expected);
   1445     }
   1446     // TODO:  need more thorough testing of capture substitutions.
   1447 
   1448     // Bug 4057
   1449     //
   1450     {
   1451         status = U_ZERO_ERROR;
   1452         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1453         RegexMatcher m("ss(.*?)ee", 0, status);
   1454         REGEX_CHECK_STATUS;
   1455         UnicodeString result;
   1456 
   1457         // Multiple finds do NOT bump up the previous appendReplacement position.
   1458         m.reset(s);
   1459         m.find();
   1460         m.find();
   1461         m.appendReplacement(result, "ooh", status);
   1462         REGEX_CHECK_STATUS;
   1463         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1464 
   1465         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1466         status = U_ZERO_ERROR;
   1467         result.truncate(0);
   1468         m.reset(10, status);
   1469         m.find();
   1470         m.find();
   1471         m.appendReplacement(result, "ooh", status);
   1472         REGEX_CHECK_STATUS;
   1473         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1474 
   1475         // find() at interior of string, appendReplacement still starts at beginning.
   1476         status = U_ZERO_ERROR;
   1477         result.truncate(0);
   1478         m.reset();
   1479         m.find(10, status);
   1480         m.find();
   1481         m.appendReplacement(result, "ooh", status);
   1482         REGEX_CHECK_STATUS;
   1483         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1484 
   1485         m.appendTail(result);
   1486         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1487 
   1488     }
   1489 
   1490     delete matcher2;
   1491     delete pat2;
   1492     delete matcher;
   1493     delete pat;
   1494 }
   1495 
   1496 
   1497 //---------------------------------------------------------------------------
   1498 //
   1499 //      API_Pattern       Test that the API for class RegexPattern is
   1500 //                        present and nominally working.  Covers operator==
   1501 //                        and !=, assignment, copy construction, clone(), compile(), flags(), matches(), split(), pattern() and the class ID functions.
   1502 //---------------------------------------------------------------------------
   1503 void RegexTest::API_Pattern() {
   1504     RegexPattern        pata;    // Test default constructor to not crash.
   1505     RegexPattern        patb;
   1506     // Two default-constructed patterns must compare equal, and a pattern must equal itself.
   1507     REGEX_ASSERT(pata == patb);
   1508     REGEX_ASSERT(pata == pata);
   1509 
   1510     UnicodeString re1("abc[a-l][m-z]");
   1511     UnicodeString re2("def");
   1512     UErrorCode    status = U_ZERO_ERROR;
   1513     UParseError   pe;
   1514     // Compile two different expressions, both with flags == 0.
   1515     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1516     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1517     REGEX_CHECK_STATUS;
   1518     REGEX_ASSERT(*pat1 == *pat1);
   1519     REGEX_ASSERT(*pat1 != pata);
   1520 
   1521     // Assign: after operator=, the target must compare equal to its source.
   1522     patb = *pat1;
   1523     REGEX_ASSERT(patb == *pat1);
   1524 
   1525     // Copy Construct: the copy must equal both the source and the earlier assigned copy.
   1526     RegexPattern patc(*pat1);
   1527     REGEX_ASSERT(patc == *pat1);
   1528     REGEX_ASSERT(patb == patc);
   1529     REGEX_ASSERT(*pat1 != *pat2);   // Compare the pattern objects; comparing the raw pointers was trivially true.
   1530     patb = *pat2;
   1531     REGEX_ASSERT(patb != patc);
   1532     REGEX_ASSERT(patb == *pat2);
   1533 
   1534     // Compile with no flags: the flag-less overload must equal an explicit flags value of 0.
   1535     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1536     REGEX_ASSERT(*pat1a == *pat1);
   1537 
   1538     REGEX_ASSERT(pat1a->flags() == 0);
   1539 
   1540     // Compile with different flags should be not equal
   1541     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1542     REGEX_CHECK_STATUS;
   1543 
   1544     REGEX_ASSERT(*pat1b != *pat1a);
   1545     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1546     REGEX_ASSERT(pat1a->flags() == 0);
   1547     delete pat1b;
   1548 
   1549     // clone: the clone must equal its source and differ from an unrelated pattern.
   1550     RegexPattern *pat1c = pat1->clone();
   1551     REGEX_ASSERT(*pat1c == *pat1);
   1552     REGEX_ASSERT(*pat1c != *pat2);
   1553 
   1554     delete pat1c;
   1555     delete pat1a;
   1556     delete pat1;
   1557     delete pat2;
   1558 
   1559 
   1560     //
   1561     //   Verify that a matcher created from a cloned pattern works.
   1562     //     (Jitterbug 3423)
   1563     //
   1564     {
   1565         UErrorCode     status     = U_ZERO_ERROR;   // Local status; shadows the function-level one inside this block.
   1566         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1567         RegexPattern  *pClone     = pSource->clone();
   1568         delete         pSource;                     // The clone is used below after its source has been deleted.
   1569         RegexMatcher  *mFromClone = pClone->matcher(status);
   1570         REGEX_CHECK_STATUS;
   1571         UnicodeString s = "Hello World";
   1572         mFromClone->reset(s);
   1573         REGEX_ASSERT(mFromClone->find() == TRUE);
   1574         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1575         REGEX_ASSERT(mFromClone->find() == TRUE);
   1576         REGEX_ASSERT(mFromClone->group(status) == "World");
   1577         REGEX_ASSERT(mFromClone->find() == FALSE);
   1578         delete mFromClone;
   1579         delete pClone;
   1580     }
   1581 
   1582     //
   1583     //   matches convenience API
   1584     //     (the expectations below succeed only when the expression matches the whole input)
   1585     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1586     REGEX_CHECK_STATUS;
   1587     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1588     REGEX_CHECK_STATUS;
   1589     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1590     REGEX_CHECK_STATUS;
   1591     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1592     REGEX_CHECK_STATUS;
   1593     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1594     REGEX_CHECK_STATUS;
   1595     status = U_INDEX_OUTOFBOUNDS_ERROR;   // With a failure status on entry, matches() is expected to return FALSE and leave status unchanged.
   1596     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1597     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1598 
   1599 
   1600     //
   1601     // Split()
   1602     //
   1603     status = U_ZERO_ERROR;
   1604     pat1 = RegexPattern::compile(" +",  pe, status);
   1605     REGEX_CHECK_STATUS;
   1606     UnicodeString  fields[10];
   1607     // More capacity than fields: four fields are produced and the slot after them is expected to be empty.
   1608     int32_t n;
   1609     n = pat1->split("Now is the time", fields, 10, status);
   1610     REGEX_CHECK_STATUS;
   1611     REGEX_ASSERT(n==4);
   1612     REGEX_ASSERT(fields[0]=="Now");
   1613     REGEX_ASSERT(fields[1]=="is");
   1614     REGEX_ASSERT(fields[2]=="the");
   1615     REGEX_ASSERT(fields[3]=="time");
   1616     REGEX_ASSERT(fields[4]=="");
   1617     // Capacity of 2: the last usable slot receives all of the remaining input.
   1618     n = pat1->split("Now is the time", fields, 2, status);
   1619     REGEX_CHECK_STATUS;
   1620     REGEX_ASSERT(n==2);
   1621     REGEX_ASSERT(fields[0]=="Now");
   1622     REGEX_ASSERT(fields[1]=="is the time");
   1623     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1624     // Capacity of 1: the whole input lands in fields[0]; the "*" sentinel in fields[1] must survive.
   1625     fields[1] = "*";
   1626     status = U_ZERO_ERROR;
   1627     n = pat1->split("Now is the time", fields, 1, status);
   1628     REGEX_CHECK_STATUS;
   1629     REGEX_ASSERT(n==1);
   1630     REGEX_ASSERT(fields[0]=="Now is the time");
   1631     REGEX_ASSERT(fields[1]=="*");
   1632     status = U_ZERO_ERROR;
   1633     // Leading and trailing delimiters: empty first and last fields are expected.
   1634     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1635     REGEX_CHECK_STATUS;
   1636     REGEX_ASSERT(n==6);
   1637     REGEX_ASSERT(fields[0]=="");
   1638     REGEX_ASSERT(fields[1]=="Now");
   1639     REGEX_ASSERT(fields[2]=="is");
   1640     REGEX_ASSERT(fields[3]=="the");
   1641     REGEX_ASSERT(fields[4]=="time");
   1642     REGEX_ASSERT(fields[5]=="");
   1643     // Input consisting only of delimiter text: two empty fields are expected.
   1644     n = pat1->split("     ", fields, 10, status);
   1645     REGEX_CHECK_STATUS;
   1646     REGEX_ASSERT(n==2);
   1647     REGEX_ASSERT(fields[0]=="");
   1648     REGEX_ASSERT(fields[1]=="");
   1649     // Empty input: zero fields are expected and the "foo" sentinel in fields[0] must be untouched.
   1650     fields[0] = "foo";
   1651     n = pat1->split("", fields, 10, status);
   1652     REGEX_CHECK_STATUS;
   1653     REGEX_ASSERT(n==0);
   1654     REGEX_ASSERT(fields[0]=="foo");
   1655 
   1656     delete pat1;
   1657 
   1658     //  split, with a pattern with (capture): the captured delimiter text is expected as extra fields.
   1659     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1660     REGEX_CHECK_STATUS;
   1661 
   1662     status = U_ZERO_ERROR;
   1663     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1664     REGEX_CHECK_STATUS;
   1665     REGEX_ASSERT(n==7);
   1666     REGEX_ASSERT(fields[0]=="");
   1667     REGEX_ASSERT(fields[1]=="a");
   1668     REGEX_ASSERT(fields[2]=="Now is ");
   1669     REGEX_ASSERT(fields[3]=="b");
   1670     REGEX_ASSERT(fields[4]=="the time");
   1671     REGEX_ASSERT(fields[5]=="c");
   1672     REGEX_ASSERT(fields[6]=="");
   1673     REGEX_ASSERT(status==U_ZERO_ERROR);
   1674     // Same input with leading text before the first delimiter.
   1675     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1676     REGEX_CHECK_STATUS;
   1677     REGEX_ASSERT(n==7);
   1678     REGEX_ASSERT(fields[0]=="  ");
   1679     REGEX_ASSERT(fields[1]=="a");
   1680     REGEX_ASSERT(fields[2]=="Now is ");
   1681     REGEX_ASSERT(fields[3]=="b");
   1682     REGEX_ASSERT(fields[4]=="the time");
   1683     REGEX_ASSERT(fields[5]=="c");
   1684     REGEX_ASSERT(fields[6]=="");
   1685     // Capacity of 6: the "foo" sentinel just past the capacity must not be overwritten.
   1686     status = U_ZERO_ERROR;
   1687     fields[6] = "foo";
   1688     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1689     REGEX_CHECK_STATUS;
   1690     REGEX_ASSERT(n==6);
   1691     REGEX_ASSERT(fields[0]=="  ");
   1692     REGEX_ASSERT(fields[1]=="a");
   1693     REGEX_ASSERT(fields[2]=="Now is ");
   1694     REGEX_ASSERT(fields[3]=="b");
   1695     REGEX_ASSERT(fields[4]=="the time");
   1696     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1697     REGEX_ASSERT(fields[6]=="foo");
   1698     // Capacity of 5: the final slot keeps the unsplit remainder, including the trailing "<c>".
   1699     status = U_ZERO_ERROR;
   1700     fields[5] = "foo";
   1701     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1702     REGEX_CHECK_STATUS;
   1703     REGEX_ASSERT(n==5);
   1704     REGEX_ASSERT(fields[0]=="  ");
   1705     REGEX_ASSERT(fields[1]=="a");
   1706     REGEX_ASSERT(fields[2]=="Now is ");
   1707     REGEX_ASSERT(fields[3]=="b");
   1708     REGEX_ASSERT(fields[4]=="the time<c>");
   1709     REGEX_ASSERT(fields[5]=="foo");
   1710     // Capacity of 5, input without a trailing delimiter.
   1711     status = U_ZERO_ERROR;
   1712     fields[5] = "foo";
   1713     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1714     REGEX_CHECK_STATUS;
   1715     REGEX_ASSERT(n==5);
   1716     REGEX_ASSERT(fields[0]=="  ");
   1717     REGEX_ASSERT(fields[1]=="a");
   1718     REGEX_ASSERT(fields[2]=="Now is ");
   1719     REGEX_ASSERT(fields[3]=="b");
   1720     REGEX_ASSERT(fields[4]=="the time");
   1721     REGEX_ASSERT(fields[5]=="foo");
   1722     // Capacity of 4: the last slot holds the remainder; the "b" capture is not reported as a field.
   1723     status = U_ZERO_ERROR;
   1724     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1725     REGEX_CHECK_STATUS;
   1726     REGEX_ASSERT(n==4);
   1727     REGEX_ASSERT(fields[0]=="  ");
   1728     REGEX_ASSERT(fields[1]=="a");
   1729     REGEX_ASSERT(fields[2]=="Now is ");
   1730     REGEX_ASSERT(fields[3]=="the time<c>");
   1731     status = U_ZERO_ERROR;
   1732     delete pat1;
   1733     // Delimiter that is entirely a capture group: each delimiter character is expected as its own field.
   1734     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1735     REGEX_CHECK_STATUS;
   1736     n = pat1->split("1-10,20", fields, 10, status);
   1737     REGEX_CHECK_STATUS;
   1738     REGEX_ASSERT(n==5);
   1739     REGEX_ASSERT(fields[0]=="1");
   1740     REGEX_ASSERT(fields[1]=="-");
   1741     REGEX_ASSERT(fields[2]=="10");
   1742     REGEX_ASSERT(fields[3]==",");
   1743     REGEX_ASSERT(fields[4]=="20");
   1744     delete pat1;
   1745 
   1746     // Test split of string with empty trailing fields
   1747     pat1 = RegexPattern::compile(",", pe, status);
   1748     REGEX_CHECK_STATUS;
   1749     n = pat1->split("a,b,c,", fields, 10, status);
   1750     REGEX_CHECK_STATUS;
   1751     REGEX_ASSERT(n==4);
   1752     REGEX_ASSERT(fields[0]=="a");
   1753     REGEX_ASSERT(fields[1]=="b");
   1754     REGEX_ASSERT(fields[2]=="c");
   1755     REGEX_ASSERT(fields[3]=="");
   1756     // Consecutive delimiters: each one is expected to yield an empty field.
   1757     n = pat1->split("a,,,", fields, 10, status);
   1758     REGEX_CHECK_STATUS;
   1759     REGEX_ASSERT(n==4);
   1760     REGEX_ASSERT(fields[0]=="a");
   1761     REGEX_ASSERT(fields[1]=="");
   1762     REGEX_ASSERT(fields[2]=="");
   1763     REGEX_ASSERT(fields[3]=="");
   1764     delete pat1;
   1765 
   1766     // Split Separator with zero length match.
   1767     pat1 = RegexPattern::compile(":?", pe, status);
   1768     REGEX_CHECK_STATUS;
   1769     n = pat1->split("abc", fields, 10, status);
   1770     REGEX_CHECK_STATUS;
   1771     REGEX_ASSERT(n==5);
   1772     REGEX_ASSERT(fields[0]=="");
   1773     REGEX_ASSERT(fields[1]=="a");
   1774     REGEX_ASSERT(fields[2]=="b");
   1775     REGEX_ASSERT(fields[3]=="c");
   1776     REGEX_ASSERT(fields[4]=="");
   1777 
   1778     delete pat1;
   1779 
   1780     //
   1781     // RegexPattern::pattern()
   1782     //   A default-constructed pattern reports an empty string; a compiled one reports its source text.
   1783     pat1 = new RegexPattern();
   1784     REGEX_ASSERT(pat1->pattern() == "");
   1785     delete pat1;
   1786 
   1787     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1788     REGEX_CHECK_STATUS;
   1789     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1790     delete pat1;
   1791 
   1792 
   1793     //
   1794     // classID functions
   1795     //   Dynamic and static IDs must agree per class, be non-NULL, and differ between RegexPattern and RegexMatcher.
   1796     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1797     REGEX_CHECK_STATUS;
   1798     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1799     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1800     UnicodeString Hello("Hello, world.");
   1801     RegexMatcher *m = pat1->matcher(Hello, status);
   1802     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1803     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1804     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1805     delete m;
   1806     delete pat1;
   1807 
   1808 }
   1809 
   1810 //---------------------------------------------------------------------------
   1811 //
   1812 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1813 //                       is present and working, but excluding functions
   1814 //                       implementing replace operations.
   1815 //
   1816 //---------------------------------------------------------------------------
   1817 void RegexTest::API_Match_UTF8() {
   1818     UParseError         pe;
   1819     UErrorCode          status=U_ZERO_ERROR;
   1820     int32_t             flags = 0;
   1821 
   1822     //
   1823     // Debug - slide failing test cases early
   1824     //
   1825 #if 0
   1826     {
   1827     }
   1828     return;
   1829 #endif
   1830 
   1831     //
   1832     // Simple pattern compilation
   1833     //
   1834     {
   1835         UText               re = UTEXT_INITIALIZER;
   1836         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1837         REGEX_VERBOSE_TEXT(&re);
   1838         RegexPattern        *pat2;
   1839         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1840         REGEX_CHECK_STATUS;
   1841 
   1842         UText input1 = UTEXT_INITIALIZER;
   1843         UText input2 = UTEXT_INITIALIZER;
   1844         UText empty  = UTEXT_INITIALIZER;
   1845         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1846         REGEX_VERBOSE_TEXT(&input1);
   1847         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1848         REGEX_VERBOSE_TEXT(&input2);
   1849         utext_openUChars(&empty, NULL, 0, &status);
   1850 
   1851         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1852         int32_t input2Len = strlen("not abc");
   1853 
   1854 
   1855         //
   1856         // Matcher creation and reset.
   1857         //
   1858         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1859         REGEX_CHECK_STATUS;
   1860         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1861         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1862         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1863         m1->reset(&input2);
   1864         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1865         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1866         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1867         m1->reset(&input1);
   1868         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1869         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1870         m1->reset(&empty);
   1871         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1872         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1873 
   1874         //
   1875         //  reset(pos, status)
   1876         //
   1877         m1->reset(&input1);
   1878         m1->reset(4, status);
   1879         REGEX_CHECK_STATUS;
   1880         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1881         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1882 
   1883         m1->reset(-1, status);
   1884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1885         status = U_ZERO_ERROR;
   1886 
   1887         m1->reset(0, status);
   1888         REGEX_CHECK_STATUS;
   1889         status = U_ZERO_ERROR;
   1890 
   1891         m1->reset(input1Len-1, status);
   1892         REGEX_CHECK_STATUS;
   1893         status = U_ZERO_ERROR;
   1894 
   1895         m1->reset(input1Len, status);
   1896         REGEX_CHECK_STATUS;
   1897         status = U_ZERO_ERROR;
   1898 
   1899         m1->reset(input1Len+1, status);
   1900         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1901         status = U_ZERO_ERROR;
   1902 
   1903         //
   1904         // match(pos, status)
   1905         //
   1906         m1->reset(&input2);
   1907         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1908         m1->reset();
   1909         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1910         m1->reset();
   1911         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1913         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1914         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1915 
   1916         // Match() at end of string should fail, but should not
   1917         //  be an error.
   1918         status = U_ZERO_ERROR;
   1919         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1920         REGEX_CHECK_STATUS;
   1921 
   1922         // Match beyond end of string should fail with an error.
   1923         status = U_ZERO_ERROR;
   1924         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1926 
   1927         // Successful match at end of string.
   1928         {
   1929             status = U_ZERO_ERROR;
   1930             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1931             REGEX_CHECK_STATUS;
   1932             m.reset(&input1);
   1933             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1934             REGEX_CHECK_STATUS;
   1935             m.reset(&empty);
   1936             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1937             REGEX_CHECK_STATUS;
   1938         }
   1939 
   1940 
   1941         //
   1942         // lookingAt(pos, status)
   1943         //
   1944         status = U_ZERO_ERROR;
   1945         m1->reset(&input2);  // "not abc"
   1946         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1947         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1948         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1949         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1950         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1952         status = U_ZERO_ERROR;
   1953         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1954         REGEX_CHECK_STATUS;
   1955         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1957 
   1958         delete m1;
   1959         delete pat2;
   1960 
   1961         utext_close(&re);
   1962         utext_close(&input1);
   1963         utext_close(&input2);
   1964         utext_close(&empty);
   1965     }
   1966 
   1967 
   1968     //
   1969     // Capture Group.
   1970     //     RegexMatcher::start();
   1971     //     RegexMatcher::end();
   1972     //     RegexMatcher::groupCount();
   1973     //
   1974     {
   1975         int32_t             flags=0;
   1976         UParseError         pe;
   1977         UErrorCode          status=U_ZERO_ERROR;
   1978         UText               re=UTEXT_INITIALIZER;
   1979         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   1980         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   1981 
   1982         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1983         REGEX_CHECK_STATUS;
   1984 
   1985         UText input = UTEXT_INITIALIZER;
   1986         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1987         utext_openUTF8(&input, str_0123456789, -1, &status);
   1988 
   1989         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   1990         REGEX_CHECK_STATUS;
   1991         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   1992         static const int32_t matchStarts[] = {0,  2, 4, 8};
   1993         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   1994         int32_t i;
   1995         for (i=0; i<4; i++) {
   1996             int32_t actualStart = matcher->start(i, status);
   1997             REGEX_CHECK_STATUS;
   1998             if (actualStart != matchStarts[i]) {
   1999                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2000                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2001             }
   2002             int32_t actualEnd = matcher->end(i, status);
   2003             REGEX_CHECK_STATUS;
   2004             if (actualEnd != matchEnds[i]) {
   2005                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2006                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2007             }
   2008         }
   2009 
   2010         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2011         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2012 
   2013         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2014         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2015         matcher->reset();
   2016         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2017 
   2018         matcher->lookingAt(status);
   2019 
   2020         UnicodeString dest;
   2021         UText destText = UTEXT_INITIALIZER;
   2022         utext_openUnicodeString(&destText, &dest, &status);
   2023         UText *result;
   2024         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2025         //	Test shallow-clone API
   2026         int64_t   group_len;
   2027         result = matcher->group((UText *)NULL, group_len, status);
   2028         REGEX_CHECK_STATUS;
   2029         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2030         utext_close(result);
   2031         result = matcher->group(0, &destText, group_len, status);
   2032         REGEX_CHECK_STATUS;
   2033         REGEX_ASSERT(result == &destText);
   2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2035         //  destText is now immutable, reopen it
   2036         utext_close(&destText);
   2037         utext_openUnicodeString(&destText, &dest, &status);
   2038 
   2039         result = matcher->group(0, NULL, status);
   2040         REGEX_CHECK_STATUS;
   2041         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2042         utext_close(result);
   2043         result = matcher->group(0, &destText, status);
   2044         REGEX_CHECK_STATUS;
   2045         REGEX_ASSERT(result == &destText);
   2046         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2047 
   2048         result = matcher->group(1, NULL, status);
   2049         REGEX_CHECK_STATUS;
   2050         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
   2051         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2052         utext_close(result);
   2053         result = matcher->group(1, &destText, status);
   2054         REGEX_CHECK_STATUS;
   2055         REGEX_ASSERT(result == &destText);
   2056         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2057 
   2058         result = matcher->group(2, NULL, status);
   2059         REGEX_CHECK_STATUS;
   2060         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
   2061         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2062         utext_close(result);
   2063         result = matcher->group(2, &destText, status);
   2064         REGEX_CHECK_STATUS;
   2065         REGEX_ASSERT(result == &destText);
   2066         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2067 
   2068         result = matcher->group(3, NULL, status);
   2069         REGEX_CHECK_STATUS;
   2070         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
   2071         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2072         utext_close(result);
   2073         result = matcher->group(3, &destText, status);
   2074         REGEX_CHECK_STATUS;
   2075         REGEX_ASSERT(result == &destText);
   2076         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2077 
   2078         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2079         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2080         matcher->reset();
   2081         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2082 
   2083         delete matcher;
   2084         delete pat;
   2085 
   2086         utext_close(&destText);
   2087         utext_close(&input);
   2088         utext_close(&re);
   2089     }
   2090 
   2091     //
   2092     //  find
   2093     //
   2094     {
   2095         int32_t             flags=0;
   2096         UParseError         pe;
   2097         UErrorCode          status=U_ZERO_ERROR;
   2098         UText               re=UTEXT_INITIALIZER;
   2099         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2100         utext_openUTF8(&re, str_abc, -1, &status);
   2101 
   2102         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2103         REGEX_CHECK_STATUS;
   2104         UText input = UTEXT_INITIALIZER;
   2105         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2106         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2107         //                      012345678901234567
   2108 
   2109         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2110         REGEX_CHECK_STATUS;
   2111         REGEX_ASSERT(matcher->find());
   2112         REGEX_ASSERT(matcher->start(status) == 1);
   2113         REGEX_ASSERT(matcher->find());
   2114         REGEX_ASSERT(matcher->start(status) == 6);
   2115         REGEX_ASSERT(matcher->find());
   2116         REGEX_ASSERT(matcher->start(status) == 12);
   2117         REGEX_ASSERT(matcher->find() == FALSE);
   2118         REGEX_ASSERT(matcher->find() == FALSE);
   2119 
   2120         matcher->reset();
   2121         REGEX_ASSERT(matcher->find());
   2122         REGEX_ASSERT(matcher->start(status) == 1);
   2123 
   2124         REGEX_ASSERT(matcher->find(0, status));
   2125         REGEX_ASSERT(matcher->start(status) == 1);
   2126         REGEX_ASSERT(matcher->find(1, status));
   2127         REGEX_ASSERT(matcher->start(status) == 1);
   2128         REGEX_ASSERT(matcher->find(2, status));
   2129         REGEX_ASSERT(matcher->start(status) == 6);
   2130         REGEX_ASSERT(matcher->find(12, status));
   2131         REGEX_ASSERT(matcher->start(status) == 12);
   2132         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2133         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2134         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2135         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2136 
   2137         status = U_ZERO_ERROR;
   2138         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2139         status = U_ZERO_ERROR;
   2140         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2141 
   2142         REGEX_ASSERT(matcher->groupCount() == 0);
   2143 
   2144         delete matcher;
   2145         delete pat;
   2146 
   2147         utext_close(&input);
   2148         utext_close(&re);
   2149     }
   2150 
   2151 
   2152     //
   2153     //  find, with \G in pattern (true if at the end of a previous match).
   2154     //
   2155     {
   2156         int32_t             flags=0;
   2157         UParseError         pe;
   2158         UErrorCode          status=U_ZERO_ERROR;
   2159         UText               re=UTEXT_INITIALIZER;
   2160         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2161         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2162 
   2163         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2164 
   2165         REGEX_CHECK_STATUS;
   2166         UText input = UTEXT_INITIALIZER;
   2167         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2168         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2169         //                      012345678901234567
   2170 
   2171         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2172         REGEX_CHECK_STATUS;
   2173         REGEX_ASSERT(matcher->find());
   2174         REGEX_ASSERT(matcher->start(status) == 0);
   2175         REGEX_ASSERT(matcher->start(1, status) == -1);
   2176         REGEX_ASSERT(matcher->start(2, status) == 1);
   2177 
   2178         REGEX_ASSERT(matcher->find());
   2179         REGEX_ASSERT(matcher->start(status) == 4);
   2180         REGEX_ASSERT(matcher->start(1, status) == 4);
   2181         REGEX_ASSERT(matcher->start(2, status) == -1);
   2182         REGEX_CHECK_STATUS;
   2183 
   2184         delete matcher;
   2185         delete pat;
   2186 
   2187         utext_close(&input);
   2188         utext_close(&re);
   2189     }
   2190 
   2191     //
   2192     //   find with zero length matches, match position should bump ahead
   2193     //     to prevent loops.
   2194     //
   2195     {
   2196         int32_t                 i;
   2197         UErrorCode          status=U_ZERO_ERROR;
   2198         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2199                                                       //   using an always-true look-ahead.
   2200         REGEX_CHECK_STATUS;
   2201         UText s = UTEXT_INITIALIZER;
   2202         utext_openUTF8(&s, "    ", -1, &status);
   2203         m.reset(&s);
   2204         for (i=0; ; i++) {
   2205             if (m.find() == FALSE) {
   2206                 break;
   2207             }
   2208             REGEX_ASSERT(m.start(status) == i);
   2209             REGEX_ASSERT(m.end(status) == i);
   2210         }
   2211         REGEX_ASSERT(i==5);
   2212 
   2213         // Check that the bump goes over characters outside the BMP OK
   2214         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2215         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2216         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2217         m.reset(&s);
   2218         for (i=0; ; i+=4) {
   2219             if (m.find() == FALSE) {
   2220                 break;
   2221             }
   2222             REGEX_ASSERT(m.start(status) == i);
   2223             REGEX_ASSERT(m.end(status) == i);
   2224         }
   2225         REGEX_ASSERT(i==20);
   2226 
   2227         utext_close(&s);
   2228     }
   2229     {
   2230         // find() loop breaking test.
   2231         //        with pattern of /.?/, should see a series of one char matches, then a single
   2232         //        match of zero length at the end of the input string.
   2233         int32_t                 i;
   2234         UErrorCode          status=U_ZERO_ERROR;
   2235         RegexMatcher        m(".?", 0, status);
   2236         REGEX_CHECK_STATUS;
   2237         UText s = UTEXT_INITIALIZER;
   2238         utext_openUTF8(&s, "    ", -1, &status);
   2239         m.reset(&s);
   2240         for (i=0; ; i++) {
   2241             if (m.find() == FALSE) {
   2242                 break;
   2243             }
   2244             REGEX_ASSERT(m.start(status) == i);
   2245             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2246         }
   2247         REGEX_ASSERT(i==5);
   2248 
   2249         utext_close(&s);
   2250     }
   2251 
   2252 
   2253     //
   2254     // Matchers with no input string behave as if they had an empty input string.
   2255     //
   2256 
   2257     {
   2258         UErrorCode status = U_ZERO_ERROR;
   2259         RegexMatcher  m(".?", 0, status);
   2260         REGEX_CHECK_STATUS;
   2261         REGEX_ASSERT(m.find());
   2262         REGEX_ASSERT(m.start(status) == 0);
   2263         REGEX_ASSERT(m.input() == "");
   2264     }
   2265     {
   2266         UErrorCode status = U_ZERO_ERROR;
   2267         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2268         RegexMatcher  *m = p->matcher(status);
   2269         REGEX_CHECK_STATUS;
   2270 
   2271         REGEX_ASSERT(m->find() == FALSE);
   2272         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2273         delete m;
   2274         delete p;
   2275     }
   2276 
   2277     //
   2278     // Regions
   2279     //
   2280     {
   2281         UErrorCode status = U_ZERO_ERROR;
   2282         UText testPattern = UTEXT_INITIALIZER;
   2283         UText testText    = UTEXT_INITIALIZER;
   2284         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2285         REGEX_VERBOSE_TEXT(&testPattern);
   2286         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2287         REGEX_VERBOSE_TEXT(&testText);
   2288 
   2289         RegexMatcher m(&testPattern, &testText, 0, status);
   2290         REGEX_CHECK_STATUS;
   2291         REGEX_ASSERT(m.regionStart() == 0);
   2292         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2293         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2294         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2295 
   2296         m.region(2,4, status);
   2297         REGEX_CHECK_STATUS;
   2298         REGEX_ASSERT(m.matches(status));
   2299         REGEX_ASSERT(m.start(status)==2);
   2300         REGEX_ASSERT(m.end(status)==4);
   2301         REGEX_CHECK_STATUS;
   2302 
   2303         m.reset();
   2304         REGEX_ASSERT(m.regionStart() == 0);
   2305         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2306 
   2307         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2308         REGEX_VERBOSE_TEXT(&testText);
   2309         m.reset(&testText);
   2310         REGEX_ASSERT(m.regionStart() == 0);
   2311         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2312 
   2313         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2314         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2315         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2316         REGEX_ASSERT(&m == &m.reset());
   2317         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2318 
   2319         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2320         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2321         REGEX_ASSERT(&m == &m.reset());
   2322         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2323 
   2324         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2325         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2326         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2327         REGEX_ASSERT(&m == &m.reset());
   2328         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2329 
   2330         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2331         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2332         REGEX_ASSERT(&m == &m.reset());
   2333         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2334 
   2335         utext_close(&testText);
   2336         utext_close(&testPattern);
   2337     }
   2338 
   2339     //
   2340     // hitEnd() and requireEnd()
   2341     //
   2342     {
   2343         UErrorCode status = U_ZERO_ERROR;
   2344         UText testPattern = UTEXT_INITIALIZER;
   2345         UText testText    = UTEXT_INITIALIZER;
   2346         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2347         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2348         utext_openUTF8(&testPattern, str_, -1, &status);
   2349         utext_openUTF8(&testText, str_aabb, -1, &status);
   2350 
   2351         RegexMatcher m1(&testPattern, &testText,  0, status);
   2352         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2353         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2354         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2355         REGEX_CHECK_STATUS;
   2356 
   2357         status = U_ZERO_ERROR;
   2358         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2359         utext_openUTF8(&testPattern, str_a, -1, &status);
   2360         RegexMatcher m2(&testPattern, &testText, 0, status);
   2361         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2362         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2363         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2364         REGEX_CHECK_STATUS;
   2365 
   2366         status = U_ZERO_ERROR;
   2367         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2368         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2369         RegexMatcher m3(&testPattern, &testText, 0, status);
   2370         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2371         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2372         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2373         REGEX_CHECK_STATUS;
   2374 
   2375         utext_close(&testText);
   2376         utext_close(&testPattern);
   2377     }
   2378 }
   2379 
   2380 
   2381 //---------------------------------------------------------------------------
   2382 //
   2383 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2384 //                         Replace family of functions.
   2385 //
        //      Exercises replaceFirst() and replaceAll() on UText input opened over
        //      UTF-8 byte arrays, plus appendReplacement()/appendTail() (Bug 4057).
        //      Each replace case is run twice: once with a NULL destination, where
        //      the test closes the returned UText, and once with the caller-owned
        //      destText, where the test asserts that the same UText is returned.
        //
        //      Strings are spelled as hex byte arrays with the intended text in a
        //      trailing comment; see the ASCII/EBCDIC note at the top of this file.
        //
   2386 //---------------------------------------------------------------------------
   2387 void RegexTest::API_Replace_UTF8() {
   2388     //
   2389     //  Replace
   2390     //
   2391     int32_t             flags=0;
   2392     UParseError         pe;
   2393     UErrorCode          status=U_ZERO_ERROR;
   2394 
            // Compile the pattern "abc" from a UText.
   2395     UText               re=UTEXT_INITIALIZER;
   2396     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2397     REGEX_VERBOSE_TEXT(&re);
   2398     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2399     REGEX_CHECK_STATUS;
   2400 
   2401     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2402     //             012345678901234567
   2403     UText dataText = UTEXT_INITIALIZER;
   2404     utext_openUTF8(&dataText, data, -1, &status);
   2405     REGEX_CHECK_STATUS;
   2406     REGEX_VERBOSE_TEXT(&dataText);
            // NOTE(review): the pointer returned by matcher(status) is dereferenced
            // here before status is checked again. The matcher is deleted at the end
            // of this function.
   2407     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2408 
   2409     //
   2410     //  Plain vanilla matches.
   2411     //
            // destText wraps the UnicodeString dest and is reused as the
            // caller-supplied destination for the rest of the function; result
            // receives whatever each replace call returns.
   2412     UnicodeString  dest;
   2413     UText destText = UTEXT_INITIALIZER;
   2414     utext_openUnicodeString(&destText, &dest, &status);
   2415     UText *result;
   2416 
   2417     UText replText = UTEXT_INITIALIZER;
   2418 
   2419     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2420     utext_openUTF8(&replText, str_yz, -1, &status);
   2421     REGEX_VERBOSE_TEXT(&replText);
            // NULL destination: the call hands back a UText, which is closed below.
   2422     result = matcher->replaceFirst(&replText, NULL, status);
   2423     REGEX_CHECK_STATUS;
   2424     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2425     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2426     utext_close(result);
            // Caller-supplied destination: the returned pointer must be &destText.
   2427     result = matcher->replaceFirst(&replText, &destText, status);
   2428     REGEX_CHECK_STATUS;
   2429     REGEX_ASSERT(result == &destText);
   2430     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2431 
   2432     result = matcher->replaceAll(&replText, NULL, status);
   2433     REGEX_CHECK_STATUS;
   2434     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2435     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2436     utext_close(result);
   2437 
            // Empty destText (replace its whole native range with zero-length text)
            // before reusing it as a destination.
   2438     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2439     result = matcher->replaceAll(&replText, &destText, status);
   2440     REGEX_CHECK_STATUS;
   2441     REGEX_ASSERT(result == &destText);
   2442     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2443 
   2444     //
   2445     //  Plain vanilla non-matches.
   2446     //
            // The input contains "abx" instead of "abc", so the pattern never matches
            // and every result is expected to equal the input.
   2447     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2448     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2449     matcher->reset(&dataText);
   2450 
   2451     result = matcher->replaceFirst(&replText, NULL, status);
   2452     REGEX_CHECK_STATUS;
   2453     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2454     utext_close(result);
            // NOTE(review): destText is not emptied before this call and still holds
            // the previous replaceAll output; the assertion below expects only the
            // input text, so the no-match path presumably overwrites the destination
            // rather than appending - confirm against RegexMatcher::replaceFirst.
   2455     result = matcher->replaceFirst(&replText, &destText, status);
   2456     REGEX_CHECK_STATUS;
   2457     REGEX_ASSERT(result == &destText);
   2458     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2459 
   2460     result = matcher->replaceAll(&replText, NULL, status);
   2461     REGEX_CHECK_STATUS;
   2462     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2463     utext_close(result);
   2464     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2465     result = matcher->replaceAll(&replText, &destText, status);
   2466     REGEX_CHECK_STATUS;
   2467     REGEX_ASSERT(result == &destText);
   2468     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2469 
   2470     //
   2471     // Empty source string
   2472     //
            // dataText is reopened over a NULL pointer with length 0; all four calls
            // are expected to produce an empty result.
   2473     utext_openUTF8(&dataText, NULL, 0, &status);
   2474     matcher->reset(&dataText);
   2475 
   2476     result = matcher->replaceFirst(&replText, NULL, status);
   2477     REGEX_CHECK_STATUS;
   2478     REGEX_ASSERT_UTEXT_UTF8("", result);
   2479     utext_close(result);
   2480     result = matcher->replaceFirst(&replText, &destText, status);
   2481     REGEX_CHECK_STATUS;
   2482     REGEX_ASSERT(result == &destText);
   2483     REGEX_ASSERT_UTEXT_UTF8("", result);
   2484 
   2485     result = matcher->replaceAll(&replText, NULL, status);
   2486     REGEX_CHECK_STATUS;
   2487     REGEX_ASSERT_UTEXT_UTF8("", result);
   2488     utext_close(result);
   2489     result = matcher->replaceAll(&replText, &destText, status);
   2490     REGEX_CHECK_STATUS;
   2491     REGEX_ASSERT(result == &destText);
   2492     REGEX_ASSERT_UTEXT_UTF8("", result);
   2493 
   2494     //
   2495     // Empty substitution string
   2496     //
   2497     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2498     matcher->reset(&dataText);
   2499 
            // Reopen replText as an empty replacement, so each match is simply
            // removed from the output (see the expected strings below).
   2500     utext_openUTF8(&replText, NULL, 0, &status);
   2501     result = matcher->replaceFirst(&replText, NULL, status);
   2502     REGEX_CHECK_STATUS;
   2503     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2504     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2505     utext_close(result);
            // destText is not cleared here; the preceding section asserted it to be "".
   2506     result = matcher->replaceFirst(&replText, &destText, status);
   2507     REGEX_CHECK_STATUS;
   2508     REGEX_ASSERT(result == &destText);
   2509     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2510 
   2511     result = matcher->replaceAll(&replText, NULL, status);
   2512     REGEX_CHECK_STATUS;
   2513     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2514     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2515     utext_close(result);
   2516     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2517     result = matcher->replaceAll(&replText, &destText, status);
   2518     REGEX_CHECK_STATUS;
   2519     REGEX_ASSERT(result == &destText);
   2520     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2521 
   2522     //
   2523     // match whole string
   2524     //
            // Input is exactly "abc", so the whole input is replaced by "xyz".
   2525     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2526     utext_openUTF8(&dataText, str_abc, -1, &status);
   2527     matcher->reset(&dataText);
   2528 
   2529     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2530     utext_openUTF8(&replText, str_xyz, -1, &status);
   2531     result = matcher->replaceFirst(&replText, NULL, status);
   2532     REGEX_CHECK_STATUS;
   2533     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2534     utext_close(result);
   2535     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2536     result = matcher->replaceFirst(&replText, &destText, status);
   2537     REGEX_CHECK_STATUS;
   2538     REGEX_ASSERT(result == &destText);
   2539     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2540 
   2541     result = matcher->replaceAll(&replText, NULL, status);
   2542     REGEX_CHECK_STATUS;
   2543     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2544     utext_close(result);
   2545     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2546     result = matcher->replaceAll(&replText, &destText, status);
   2547     REGEX_CHECK_STATUS;
   2548     REGEX_ASSERT(result == &destText);
   2549     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2550 
   2551     //
   2552     // Capture Group, simple case
   2553     //
            // Second pattern a(..) has one capture group; the replacement strings
            // below refer to it as $1. pat2/matcher2 are deleted at the end of the
            // function.
   2554     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2555     utext_openUTF8(&re, str_add, -1, &status);
   2556     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2557     REGEX_CHECK_STATUS;
   2558 
   2559     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2560     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2561     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2562     REGEX_CHECK_STATUS;
   2563 
   2564     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2565     utext_openUTF8(&replText, str_11, -1, &status);
   2566     result = matcher2->replaceFirst(&replText, NULL, status);
   2567     REGEX_CHECK_STATUS;
   2568     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2569     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2570     utext_close(result);
   2571     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2572     result = matcher2->replaceFirst(&replText, &destText, status);
   2573     REGEX_CHECK_STATUS;
   2574     REGEX_ASSERT(result == &destText);
   2575     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2576 
            // Backslash-escaped "\$1" is expected to come out as a literal "$1",
            // while the unescaped "$1" is expanded to the group text "bc".
   2577     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2578     utext_openUTF8(&replText, str_v, -1, &status);
   2579     REGEX_VERBOSE_TEXT(&replText);
   2580     result = matcher2->replaceFirst(&replText, NULL, status);
   2581     REGEX_CHECK_STATUS;
   2582     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2583     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2584     utext_close(result);
   2585     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2586     result = matcher2->replaceFirst(&replText, &destText, status);
   2587     REGEX_CHECK_STATUS;
   2588     REGEX_ASSERT(result == &destText);
   2589     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2590 
            // A '$' that is not followed by a group number is expected to be copied
            // to the output unchanged (compare the expected string below).
   2591     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
   2592     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2593     result = matcher2->replaceFirst(&replText, NULL, status);
   2594     REGEX_CHECK_STATUS;
   2595     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2596     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2597     utext_close(result);
   2598     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2599     result = matcher2->replaceFirst(&replText, &destText, status);
   2600     REGEX_CHECK_STATUS;
   2601     REGEX_ASSERT(result == &destText);
   2602     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2603 
            // The "xxxx" placeholder (bytes 22-25) is overwritten below with
            // F0 9D 9F 8F, the UTF-8 encoding of U+1D7CF, so the replacement holds
            // '$' followed by a supplementary-plane digit one. The expected output
            // shows that this is treated as a reference to group 1.
   2604     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2605     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2606     //                                 012345678901234567890123456
   2607     supplDigitChars[22] = 0xF0;
   2608     supplDigitChars[23] = 0x9D;
   2609     supplDigitChars[24] = 0x9F;
   2610     supplDigitChars[25] = 0x8F;
   2611     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2612 
   2613     result = matcher2->replaceFirst(&replText, NULL, status);
   2614     REGEX_CHECK_STATUS;
   2615     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2616     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2617     utext_close(result);
   2618     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2619     result = matcher2->replaceFirst(&replText, &destText, status);
   2620     REGEX_CHECK_STATUS;
   2621     REGEX_ASSERT(result == &destText);
   2622     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
            // The pattern a(..) has no group 5, so the replace is expected to report
            // U_INDEX_OUTOFBOUNDS_ERROR.
            // NOTE(review): REGEX_ASSERT_FAIL is defined outside this section; the
            // REGEX_CHECK_STATUS calls further down suggest it does not leave the
            // function's own status in a failed state - confirm in the macro.
   2623     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2624     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2625     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2626 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2627     utext_close(result);
   2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2629     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2630     REGEX_ASSERT(result == &destText);
   2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2632 
   2633     //
   2634     // Replacement String with \u hex escapes
   2635     //
            // These two blocks go back to the first matcher (pattern "abc").
   2636     {
              // The replacement text contains the six characters \u0043; the
              // expected output contains 'C' (0x43) in their place.
   2637       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2638       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2639         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2640         utext_openUTF8(&replText, str_u0043, -1, &status);
   2641         matcher->reset(&dataText);
   2642 
   2643         result = matcher->replaceAll(&replText, NULL, status);
   2644         REGEX_CHECK_STATUS;
   2645         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2646         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2647         utext_close(result);
   2648         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2649         result = matcher->replaceAll(&replText, &destText, status);
   2650         REGEX_CHECK_STATUS;
   2651         REGEX_ASSERT(result == &destText);
   2652         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2653     }
   2654     {
              // This str_abc ("abc !") shadows the function-level str_abc ("abc")
              // inside this block.
   2655       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2656         utext_openUTF8(&dataText, str_abc, -1, &status);
   2657         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2658         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2659         matcher->reset(&dataText);
   2660 
                // Bytes 2-5 of the expected output are patched to F0 90 80 80, the
                // UTF-8 encoding of U+10000.
   2661         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2662         //                          0123456789
   2663         expected[2] = 0xF0;
   2664         expected[3] = 0x90;
   2665         expected[4] = 0x80;
   2666         expected[5] = 0x80;
   2667 
   2668         result = matcher->replaceAll(&replText, NULL, status);
   2669         REGEX_CHECK_STATUS;
   2670         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2671         utext_close(result);
   2672         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2673         result = matcher->replaceAll(&replText, &destText, status);
   2674         REGEX_CHECK_STATUS;
   2675         REGEX_ASSERT(result == &destText);
   2676         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2677     }
   2678     // TODO:  need more thorough testing of capture substitutions.
   2679 
   2680     // Bug 4057
   2681     //
            // Uses a separate matcher m (pattern ss(.*?)ee) and drives
            // appendReplacement()/appendTail() directly. In all three scenarios the
            // expected output starts at the beginning of the input and ends with the
            // replacement "ooh" in place of the second match.
   2682     {
   2683         status = U_ZERO_ERROR;
   2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2687         utext_openUTF8(&re, str_ssee, -1, &status);
   2688         utext_openUTF8(&dataText, str_blah, -1, &status);
   2689         utext_openUTF8(&replText, str_ooh, -1, &status);
   2690 
   2691         RegexMatcher m(&re, 0, status);
   2692         REGEX_CHECK_STATUS;
   2693 
                // This UnicodeString result shadows the function-level UText *result
                // within this block; resultText is the UText opened over it.
   2694         UnicodeString result;
   2695         UText resultText = UTEXT_INITIALIZER;
   2696         utext_openUnicodeString(&resultText, &result, &status);
   2697 
   2698         // Multiple finds do NOT bump up the previous appendReplacement position.
   2699         m.reset(&dataText);
   2700         m.find();
   2701         m.find();
   2702         m.appendReplacement(&resultText, &replText, status);
   2703         REGEX_CHECK_STATUS;
   2704         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2705         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2706 
   2707         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2708         status = U_ZERO_ERROR;
                // Empty the backing string, then reopen resultText over it.
   2709         result.truncate(0);
   2710         utext_openUnicodeString(&resultText, &result, &status);
   2711         m.reset(10, status);
   2712         m.find();
   2713         m.find();
   2714         m.appendReplacement(&resultText, &replText, status);
   2715         REGEX_CHECK_STATUS;
   2716         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2717         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2718 
   2719         // find() at interior of string, appendReplacement still starts at beginning.
   2720         status = U_ZERO_ERROR;
   2721         result.truncate(0);
   2722         utext_openUnicodeString(&resultText, &result, &status);
   2723         m.reset();
   2724         m.find(10, status);
   2725         m.find();
   2726         m.appendReplacement(&resultText, &replText, status);
   2727         REGEX_CHECK_STATUS;
   2728         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2729         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2730 
                // appendTail is expected to add the rest of the input (" fin") after
                // the last match.
   2731         m.appendTail(&resultText, status);
   2732         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2733         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2734 
   2735         utext_close(&resultText);
   2736     }
   2737 
            // Release the heap-allocated matchers and patterns, then close the
            // stack UText objects that were reopened throughout the test.
   2738     delete matcher2;
   2739     delete pat2;
   2740     delete matcher;
   2741     delete pat;
   2742 
   2743     utext_close(&dataText);
   2744     utext_close(&replText);
   2745     utext_close(&destText);
   2746     utext_close(&re);
   2747 }
   2748 
   2749 
   2750 //---------------------------------------------------------------------------
   2751 //
   2752 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2753 //                        present and nominally working.
   2754 //
   2755 //---------------------------------------------------------------------------
   2756 void RegexTest::API_Pattern_UTF8() {
   2757     RegexPattern        pata;    // Test default constructor to not crash.
   2758     RegexPattern        patb;
   2759 
   2760     REGEX_ASSERT(pata == patb);
   2761     REGEX_ASSERT(pata == pata);
   2762 
   2763     UText         re1 = UTEXT_INITIALIZER;
   2764     UText         re2 = UTEXT_INITIALIZER;
   2765     UErrorCode    status = U_ZERO_ERROR;
   2766     UParseError   pe;
   2767 
   2768     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2769     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2770     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2771     utext_openUTF8(&re2, str_def, -1, &status);
   2772 
   2773     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2774     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2775     REGEX_CHECK_STATUS;
   2776     REGEX_ASSERT(*pat1 == *pat1);
   2777     REGEX_ASSERT(*pat1 != pata);
   2778 
   2779     // Assign
   2780     patb = *pat1;
   2781     REGEX_ASSERT(patb == *pat1);
   2782 
   2783     // Copy Construct
   2784     RegexPattern patc(*pat1);
   2785     REGEX_ASSERT(patc == *pat1);
   2786     REGEX_ASSERT(patb == patc);
   2787     REGEX_ASSERT(pat1 != pat2);
   2788     patb = *pat2;
   2789     REGEX_ASSERT(patb != patc);
   2790     REGEX_ASSERT(patb == *pat2);
   2791 
   2792     // Compile with no flags.
   2793     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2794     REGEX_ASSERT(*pat1a == *pat1);
   2795 
   2796     REGEX_ASSERT(pat1a->flags() == 0);
   2797 
   2798     // Compile with different flags should be not equal
   2799     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2800     REGEX_CHECK_STATUS;
   2801 
   2802     REGEX_ASSERT(*pat1b != *pat1a);
   2803     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2804     REGEX_ASSERT(pat1a->flags() == 0);
   2805     delete pat1b;
   2806 
   2807     // clone
   2808     RegexPattern *pat1c = pat1->clone();
   2809     REGEX_ASSERT(*pat1c == *pat1);
   2810     REGEX_ASSERT(*pat1c != *pat2);
   2811 
   2812     delete pat1c;
   2813     delete pat1a;
   2814     delete pat1;
   2815     delete pat2;
   2816 
   2817     utext_close(&re1);
   2818     utext_close(&re2);
   2819 
   2820 
   2821     //
   2822     //   Verify that a matcher created from a cloned pattern works.
   2823     //     (Jitterbug 3423)
   2824     //
   2825     {
   2826         UErrorCode     status     = U_ZERO_ERROR;
   2827         UText          pattern    = UTEXT_INITIALIZER;
   2828         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2829         utext_openUTF8(&pattern, str_pL, -1, &status);
   2830 
   2831         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2832         RegexPattern  *pClone     = pSource->clone();
   2833         delete         pSource;
   2834         RegexMatcher  *mFromClone = pClone->matcher(status);
   2835         REGEX_CHECK_STATUS;
   2836 
   2837         UText          input      = UTEXT_INITIALIZER;
   2838         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2839         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2840         mFromClone->reset(&input);
   2841         REGEX_ASSERT(mFromClone->find() == TRUE);
   2842         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2843         REGEX_ASSERT(mFromClone->find() == TRUE);
   2844         REGEX_ASSERT(mFromClone->group(status) == "World");
   2845         REGEX_ASSERT(mFromClone->find() == FALSE);
   2846         delete mFromClone;
   2847         delete pClone;
   2848 
   2849         utext_close(&input);
   2850         utext_close(&pattern);
   2851     }
   2852 
   2853     //
   2854     //   matches convenience API
   2855     //
   2856     {
   2857         UErrorCode status  = U_ZERO_ERROR;
   2858         UText      pattern = UTEXT_INITIALIZER;
   2859         UText      input   = UTEXT_INITIALIZER;
   2860 
   2861         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2862         utext_openUTF8(&input, str_randominput, -1, &status);
   2863 
   2864         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2865         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2866         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2867         REGEX_CHECK_STATUS;
   2868 
   2869         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2870         utext_openUTF8(&pattern, str_abc, -1, &status);
   2871         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2872         REGEX_CHECK_STATUS;
   2873 
   2874         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2875         utext_openUTF8(&pattern, str_nput, -1, &status);
   2876         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2877         REGEX_CHECK_STATUS;
   2878 
   2879         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2880         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2881         REGEX_CHECK_STATUS;
   2882 
   2883         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2884         utext_openUTF8(&pattern, str_u, -1, &status);
   2885         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2886         REGEX_CHECK_STATUS;
   2887 
   2888         utext_openUTF8(&input, str_abc, -1, &status);
   2889         utext_openUTF8(&pattern, str_abc, -1, &status);
   2890         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2891         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2892         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2893 
   2894         utext_close(&input);
   2895         utext_close(&pattern);
   2896     }
   2897 
   2898 
   2899     //
   2900     // Split()
   2901     //
   2902     status = U_ZERO_ERROR;
   2903     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2904     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2905     pat1 = RegexPattern::compile(&re1, pe, status);
   2906     REGEX_CHECK_STATUS;
   2907     UnicodeString  fields[10];
   2908 
   2909     int32_t n;
   2910     n = pat1->split("Now is the time", fields, 10, status);
   2911     REGEX_CHECK_STATUS;
   2912     REGEX_ASSERT(n==4);
   2913     REGEX_ASSERT(fields[0]=="Now");
   2914     REGEX_ASSERT(fields[1]=="is");
   2915     REGEX_ASSERT(fields[2]=="the");
   2916     REGEX_ASSERT(fields[3]=="time");
   2917     REGEX_ASSERT(fields[4]=="");
   2918 
   2919     n = pat1->split("Now is the time", fields, 2, status);
   2920     REGEX_CHECK_STATUS;
   2921     REGEX_ASSERT(n==2);
   2922     REGEX_ASSERT(fields[0]=="Now");
   2923     REGEX_ASSERT(fields[1]=="is the time");
   2924     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2925 
   2926     fields[1] = "*";
   2927     status = U_ZERO_ERROR;
   2928     n = pat1->split("Now is the time", fields, 1, status);
   2929     REGEX_CHECK_STATUS;
   2930     REGEX_ASSERT(n==1);
   2931     REGEX_ASSERT(fields[0]=="Now is the time");
   2932     REGEX_ASSERT(fields[1]=="*");
   2933     status = U_ZERO_ERROR;
   2934 
   2935     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2936     REGEX_CHECK_STATUS;
   2937     REGEX_ASSERT(n==6);
   2938     REGEX_ASSERT(fields[0]=="");
   2939     REGEX_ASSERT(fields[1]=="Now");
   2940     REGEX_ASSERT(fields[2]=="is");
   2941     REGEX_ASSERT(fields[3]=="the");
   2942     REGEX_ASSERT(fields[4]=="time");
   2943     REGEX_ASSERT(fields[5]=="");
   2944     REGEX_ASSERT(fields[6]=="");
   2945 
   2946     fields[2] = "*";
   2947     n = pat1->split("     ", fields, 10, status);
   2948     REGEX_CHECK_STATUS;
   2949     REGEX_ASSERT(n==2);
   2950     REGEX_ASSERT(fields[0]=="");
   2951     REGEX_ASSERT(fields[1]=="");
   2952     REGEX_ASSERT(fields[2]=="*");
   2953 
   2954     fields[0] = "foo";
   2955     n = pat1->split("", fields, 10, status);
   2956     REGEX_CHECK_STATUS;
   2957     REGEX_ASSERT(n==0);
   2958     REGEX_ASSERT(fields[0]=="foo");
   2959 
   2960     delete pat1;
   2961 
   2962     //  split, with a pattern with (capture)
   2963     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   2964     pat1 = RegexPattern::compile(&re1,  pe, status);
   2965     REGEX_CHECK_STATUS;
   2966 
   2967     status = U_ZERO_ERROR;
   2968     fields[6] = fields[7] = "*";
   2969     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2970     REGEX_CHECK_STATUS;
   2971     REGEX_ASSERT(n==7);
   2972     REGEX_ASSERT(fields[0]=="");
   2973     REGEX_ASSERT(fields[1]=="a");
   2974     REGEX_ASSERT(fields[2]=="Now is ");
   2975     REGEX_ASSERT(fields[3]=="b");
   2976     REGEX_ASSERT(fields[4]=="the time");
   2977     REGEX_ASSERT(fields[5]=="c");
   2978     REGEX_ASSERT(fields[6]=="");
   2979     REGEX_ASSERT(fields[7]=="*");
   2980     REGEX_ASSERT(status==U_ZERO_ERROR);
   2981 
   2982     fields[6] = fields[7] = "*";
   2983     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2984     REGEX_CHECK_STATUS;
   2985     REGEX_ASSERT(n==7);
   2986     REGEX_ASSERT(fields[0]=="  ");
   2987     REGEX_ASSERT(fields[1]=="a");
   2988     REGEX_ASSERT(fields[2]=="Now is ");
   2989     REGEX_ASSERT(fields[3]=="b");
   2990     REGEX_ASSERT(fields[4]=="the time");
   2991     REGEX_ASSERT(fields[5]=="c");
   2992     REGEX_ASSERT(fields[6]=="");
   2993     REGEX_ASSERT(fields[7]=="*");
   2994 
   2995     status = U_ZERO_ERROR;
   2996     fields[6] = "foo";
   2997     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   2998     REGEX_CHECK_STATUS;
   2999     REGEX_ASSERT(n==6);
   3000     REGEX_ASSERT(fields[0]=="  ");
   3001     REGEX_ASSERT(fields[1]=="a");
   3002     REGEX_ASSERT(fields[2]=="Now is ");
   3003     REGEX_ASSERT(fields[3]=="b");
   3004     REGEX_ASSERT(fields[4]=="the time");
   3005     REGEX_ASSERT(fields[5]==" ");
   3006     REGEX_ASSERT(fields[6]=="foo");
   3007 
   3008     status = U_ZERO_ERROR;
   3009     fields[5] = "foo";
   3010     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3011     REGEX_CHECK_STATUS;
   3012     REGEX_ASSERT(n==5);
   3013     REGEX_ASSERT(fields[0]=="  ");
   3014     REGEX_ASSERT(fields[1]=="a");
   3015     REGEX_ASSERT(fields[2]=="Now is ");
   3016     REGEX_ASSERT(fields[3]=="b");
   3017     REGEX_ASSERT(fields[4]=="the time<c>");
   3018     REGEX_ASSERT(fields[5]=="foo");
   3019 
   3020     status = U_ZERO_ERROR;
   3021     fields[5] = "foo";
   3022     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3023     REGEX_CHECK_STATUS;
   3024     REGEX_ASSERT(n==5);
   3025     REGEX_ASSERT(fields[0]=="  ");
   3026     REGEX_ASSERT(fields[1]=="a");
   3027     REGEX_ASSERT(fields[2]=="Now is ");
   3028     REGEX_ASSERT(fields[3]=="b");
   3029     REGEX_ASSERT(fields[4]=="the time");
   3030     REGEX_ASSERT(fields[5]=="foo");
   3031 
   3032     status = U_ZERO_ERROR;
   3033     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3034     REGEX_CHECK_STATUS;
   3035     REGEX_ASSERT(n==4);
   3036     REGEX_ASSERT(fields[0]=="  ");
   3037     REGEX_ASSERT(fields[1]=="a");
   3038     REGEX_ASSERT(fields[2]=="Now is ");
   3039     REGEX_ASSERT(fields[3]=="the time<c>");
   3040     status = U_ZERO_ERROR;
   3041     delete pat1;
   3042 
   3043     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3044     pat1 = RegexPattern::compile(&re1, pe, status);
   3045     REGEX_CHECK_STATUS;
   3046     n = pat1->split("1-10,20", fields, 10, status);
   3047     REGEX_CHECK_STATUS;
   3048     REGEX_ASSERT(n==5);
   3049     REGEX_ASSERT(fields[0]=="1");
   3050     REGEX_ASSERT(fields[1]=="-");
   3051     REGEX_ASSERT(fields[2]=="10");
   3052     REGEX_ASSERT(fields[3]==",");
   3053     REGEX_ASSERT(fields[4]=="20");
   3054     delete pat1;
   3055 
   3056 
   3057     //
   3058     // RegexPattern::pattern() and patternText()
   3059     //
   3060     pat1 = new RegexPattern();
   3061     REGEX_ASSERT(pat1->pattern() == "");
   3062     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3063     delete pat1;
   3064     const char *helloWorldInvariant = "(Hello, world)*";
   3065     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3066     pat1 = RegexPattern::compile(&re1, pe, status);
   3067     REGEX_CHECK_STATUS;
   3068     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
   3069     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3070     delete pat1;
   3071 
   3072     utext_close(&re1);
   3073 }
   3074 
   3075 
   3076 //---------------------------------------------------------------------------
   3077 //
   3078 //      Extended       A more thorough check for features of regex patterns
   3079 //                     The test cases are in a separate data file,
   3080 //                       source/test/testdata/regextst.txt
   3081 //                     A description of the test data format is included in that file.
   3082 //
   3083 //---------------------------------------------------------------------------
   3084 
   3085 const char *
   3086 RegexTest::getPath(char buffer[2048], const char *filename) {
   3087     UErrorCode status=U_ZERO_ERROR;
   3088     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3089     if (U_FAILURE(status)) {
   3090         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3091         return NULL;
   3092     }
   3093 
   3094     strcpy(buffer, testDataDirectory);
   3095     strcat(buffer, filename);
   3096     return buffer;
   3097 }
   3098 
   3099 void RegexTest::Extended() {
   3100     char tdd[2048];
   3101     const char *srcPath;
   3102     UErrorCode  status  = U_ZERO_ERROR;
   3103     int32_t     lineNum = 0;
   3104 
   3105     //
   3106     //  Open and read the test data file.
   3107     //
   3108     srcPath=getPath(tdd, "regextst.txt");
   3109     if(srcPath==NULL) {
   3110         return; /* something went wrong, error already output */
   3111     }
   3112 
   3113     int32_t    len;
   3114     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3115     if (U_FAILURE(status)) {
   3116         return; /* something went wrong, error already output */
   3117     }
   3118 
   3119     //
   3120     //  Put the test data into a UnicodeString
   3121     //
   3122     UnicodeString testString(FALSE, testData, len);
   3123 
   3124     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3125     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3126     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3127 
   3128     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3129     UnicodeString   testPattern;   // The pattern for test from the test file.
   3130     UnicodeString   testFlags;     // the flags   for a test.
   3131     UnicodeString   matchString;   // The marked up string to be used as input
   3132 
   3133     if (U_FAILURE(status)){
   3134         dataerrln("Construct RegexMatcher() error.");
   3135         delete [] testData;
   3136         return;
   3137     }
   3138 
   3139     //
   3140     //  Loop over the test data file, once per line.
   3141     //
   3142     while (lineMat.find()) {
   3143         lineNum++;
   3144         if (U_FAILURE(status)) {
   3145           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3146         }
   3147 
   3148         status = U_ZERO_ERROR;
   3149         UnicodeString testLine = lineMat.group(1, status);
   3150         if (testLine.length() == 0) {
   3151             continue;
   3152         }
   3153 
   3154         //
   3155         // Parse the test line.  Skip blank and comment only lines.
   3156         // Separate out the three main fields - pattern, flags, target.
   3157         //
   3158 
   3159         commentMat.reset(testLine);
   3160         if (commentMat.lookingAt(status)) {
   3161             // This line is a comment, or blank.
   3162             continue;
   3163         }
   3164 
   3165         //
   3166         //  Pull out the pattern field, remove it from the test file line.
   3167         //
   3168         quotedStuffMat.reset(testLine);
   3169         if (quotedStuffMat.lookingAt(status)) {
   3170             testPattern = quotedStuffMat.group(2, status);
   3171             testLine.remove(0, quotedStuffMat.end(0, status));
   3172         } else {
   3173             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3174             continue;
   3175         }
   3176 
   3177 
   3178         //
   3179         //  Pull out the flags from the test file line.
   3180         //
   3181         flagsMat.reset(testLine);
   3182         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3183         testFlags = flagsMat.group(1, status);
   3184         if (flagsMat.group(2, status).length() > 0) {
   3185             errln("Bad Match flag at line %d. Scanning %c\n",
   3186                 lineNum, flagsMat.group(2, status).charAt(0));
   3187             continue;
   3188         }
   3189         testLine.remove(0, flagsMat.end(0, status));
   3190 
   3191         //
   3192         //  Pull out the match string, as a whole.
   3193         //    We'll process the <tags> later.
   3194         //
   3195         quotedStuffMat.reset(testLine);
   3196         if (quotedStuffMat.lookingAt(status)) {
   3197             matchString = quotedStuffMat.group(2, status);
   3198             testLine.remove(0, quotedStuffMat.end(0, status));
   3199         } else {
   3200             errln("Bad match string at test file line %d", lineNum);
   3201             continue;
   3202         }
   3203 
   3204         //
   3205         //  The only thing left from the input line should be an optional trailing comment.
   3206         //
   3207         commentMat.reset(testLine);
   3208         if (commentMat.lookingAt(status) == FALSE) {
   3209             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3210             continue;
   3211         }
   3212 
   3213         //
   3214         //  Run the test
   3215         //
   3216         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3217     }
   3218 
   3219     delete [] testData;
   3220 
   3221 }
   3222 
   3223 
   3224 
   3225 //---------------------------------------------------------------------------
   3226 //
   3227 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
   3228 //
   3229 //         Function to run a single test from the Extended (data driven) tests.
   3230 //         See file test/testdata/regextst.txt for a description of the
   3231 //         pattern and inputString fields, and the allowed flags.
   3232 //         srcPath is the path of regextst.txt; lineNumber is the source line of the test in it.
   3233 //
   3234 //---------------------------------------------------------------------------
   3235 
   3236 
   3237 //  Set a value into a UVector at position specified by a decimal number in
   3238 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3239 //   which follows.
   3240 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3241     UErrorCode  status=U_ZERO_ERROR;
   3242     int32_t  idx = 0;
   3243     for (int32_t i=0; i<index.length(); i++) {
   3244         int32_t d=u_charDigitValue(index.charAt(i));
   3245         if (d<0) {return;}
   3246         idx = idx*10 + d;
   3247     }
   3248     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3249     vec.setElementAt(val, idx);
   3250 }
   3251 
   3252 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3253     UErrorCode  status=U_ZERO_ERROR;
   3254     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3255     vec.setElementAt(val, idx);
   3256 }
   3257 
   3258 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3259 {
   3260     UBool couldFind = TRUE;
   3261     UTEXT_SETNATIVEINDEX(utext, 0);
   3262     int32_t i = 0;
   3263     while (i < unistrOffset) {
   3264         UChar32 c = UTEXT_NEXT32(utext);
   3265         if (c != U_SENTINEL) {
   3266             i += U16_LENGTH(c);
   3267         } else {
   3268             couldFind = FALSE;
   3269             break;
   3270         }
   3271     }
   3272     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3273     return couldFind;
   3274 }
   3275 
   3276 
   3277 void RegexTest::regex_find(const UnicodeString &pattern,
   3278                            const UnicodeString &flags,
   3279                            const UnicodeString &inputString,
   3280                            const char *srcPath,
   3281                            int32_t line) {
   3282     UnicodeString       unEscapedInput;
   3283     UnicodeString       deTaggedInput;
   3284 
   3285     int32_t             patternUTF8Length,      inputUTF8Length;
   3286     char                *patternChars  = NULL, *inputChars = NULL;
   3287     UText               patternText    = UTEXT_INITIALIZER;
   3288     UText               inputText      = UTEXT_INITIALIZER;
   3289     UConverter          *UTF8Converter = NULL;
   3290 
   3291     UErrorCode          status         = U_ZERO_ERROR;
   3292     UParseError         pe;
   3293     RegexPattern        *parsePat      = NULL;
   3294     RegexMatcher        *parseMatcher  = NULL;
   3295     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3296     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3297     UVector             groupStarts(status);
   3298     UVector             groupEnds(status);
   3299     UVector             groupStartsUTF8(status);
   3300     UVector             groupEndsUTF8(status);
   3301     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3302     UBool               failed         = FALSE;
   3303     int32_t             numFinds;
   3304     int32_t             i;
   3305     UBool               useMatchesFunc   = FALSE;
   3306     UBool               useLookingAtFunc = FALSE;
   3307     int32_t             regionStart      = -1;
   3308     int32_t             regionEnd        = -1;
   3309     int32_t             regionStartUTF8  = -1;
   3310     int32_t             regionEndUTF8    = -1;
   3311 
   3312 
   3313     //
   3314     //  Compile the caller's pattern
   3315     //
   3316     uint32_t bflags = 0;
   3317     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3318         bflags |= UREGEX_CASE_INSENSITIVE;
   3319     }
   3320     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3321         bflags |= UREGEX_COMMENTS;
   3322     }
   3323     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3324         bflags |= UREGEX_DOTALL;
   3325     }
   3326     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3327         bflags |= UREGEX_MULTILINE;
   3328     }
   3329 
   3330     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3331         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3332     }
   3333     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3334         bflags |= UREGEX_UNIX_LINES;
   3335     }
   3336     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3337         bflags |= UREGEX_LITERAL;
   3338     }
   3339 
   3340 
   3341     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3342     if (status != U_ZERO_ERROR) {
   3343         #if UCONFIG_NO_BREAK_ITERATION==1
   3344         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3345         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3346         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3347             goto cleanupAndReturn;
   3348         }
   3349         #endif
   3350         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3351             // Expected pattern compilation error.
   3352             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3353                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3354             }
   3355             goto cleanupAndReturn;
   3356         } else {
   3357             // Unexpected pattern compilation error.
   3358             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3359             goto cleanupAndReturn;
   3360         }
   3361     }
   3362 
   3363     UTF8Converter = ucnv_open("UTF8", &status);
   3364     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3365 
   3366     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3367     status = U_ZERO_ERROR; // buffer overflow
   3368     patternChars = new char[patternUTF8Length+1];
   3369     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3370     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3371 
   3372     if (status == U_ZERO_ERROR) {
   3373         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3374 
   3375         if (status != U_ZERO_ERROR) {
   3376 #if UCONFIG_NO_BREAK_ITERATION==1
   3377             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3378             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3379             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3380                 goto cleanupAndReturn;
   3381             }
   3382 #endif
   3383             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3384                 // Expected pattern compilation error.
   3385                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3386                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3387                 }
   3388                 goto cleanupAndReturn;
   3389             } else {
   3390                 // Unexpected pattern compilation error.
   3391                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3392                 goto cleanupAndReturn;
   3393             }
   3394         }
   3395     }
   3396 
   3397     if (UTF8Pattern == NULL) {
   3398         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3399         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3400         status = U_ZERO_ERROR;
   3401     }
   3402 
   3403     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3404         RegexPatternDump(callerPattern);
   3405     }
   3406 
   3407     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3408         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3409         goto cleanupAndReturn;
   3410     }
   3411 
   3412 
   3413     //
   3414     // Number of times find() should be called on the test string, default to 1
   3415     //
   3416     numFinds = 1;
   3417     for (i=2; i<=9; i++) {
   3418         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3419             if (numFinds != 1) {
   3420                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3421                 goto cleanupAndReturn;
   3422             }
   3423             numFinds = i;
   3424         }
   3425     }
   3426 
   3427     // 'M' flag.  Use matches() instead of find()
   3428     if (flags.indexOf((UChar)0x4d) >= 0) {
   3429         useMatchesFunc = TRUE;
   3430     }
   3431     if (flags.indexOf((UChar)0x4c) >= 0) {
   3432         useLookingAtFunc = TRUE;
   3433     }
   3434 
   3435     //
   3436     //  Find the tags in the input data, remove them, and record the group boundary
   3437     //    positions.
   3438     //
   3439     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3440     REGEX_CHECK_STATUS_L(line);
   3441 
   3442     unEscapedInput = inputString.unescape();
   3443     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3444     REGEX_CHECK_STATUS_L(line);
   3445     while(parseMatcher->find()) {
   3446         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3447         REGEX_CHECK_STATUS;
   3448         UnicodeString groupNum = parseMatcher->group(2, status);
   3449         if (groupNum == "r") {
   3450             // <r> or </r>, a region specification within the string
   3451             if (parseMatcher->group(1, status) == "/") {
   3452                 regionEnd = deTaggedInput.length();
   3453             } else {
   3454                 regionStart = deTaggedInput.length();
   3455             }
   3456         } else {
   3457             // <digits> or </digits>, a group match boundary tag.
   3458             if (parseMatcher->group(1, status) == "/") {
   3459                 set(groupEnds, deTaggedInput.length(), groupNum);
   3460             } else {
   3461                 set(groupStarts, deTaggedInput.length(), groupNum);
   3462             }
   3463         }
   3464     }
   3465     parseMatcher->appendTail(deTaggedInput);
   3466     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3467     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3468       errln("mismatched <r> tags");
   3469       failed = TRUE;
   3470       goto cleanupAndReturn;
   3471     }
   3472 
   3473     //
   3474     //  Configure the matcher according to the flags specified with this test.
   3475     //
   3476     matcher = callerPattern->matcher(deTaggedInput, status);
   3477     REGEX_CHECK_STATUS_L(line);
   3478     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3479         matcher->setTrace(TRUE);
   3480     }
   3481 
   3482     if (UTF8Pattern != NULL) {
   3483         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3484         status = U_ZERO_ERROR; // buffer overflow
   3485         inputChars = new char[inputUTF8Length+1];
   3486         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3487         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3488 
   3489         if (status == U_ZERO_ERROR) {
   3490             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3491             REGEX_CHECK_STATUS_L(line);
   3492         }
   3493 
   3494         if (UTF8Matcher == NULL) {
   3495             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3496           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3497             status = U_ZERO_ERROR;
   3498         }
   3499     }
   3500 
   3501     //
   3502     //  Generate native indices for UTF8 versions of region and capture group info
   3503     //
   3504     if (UTF8Matcher != NULL) {
   3505         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3506         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3507 
   3508         //  Fill out the native index UVector info.
   3509         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3510         for (i=0; i<groupStarts.size(); i++) {
   3511             int32_t  start = groupStarts.elementAti(i);
   3512             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3513             if (start >= 0) {
   3514                 int32_t  startUTF8;
   3515                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3516                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3517                     failed = TRUE;
   3518                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3519                 }
   3520                 setInt(groupStartsUTF8, startUTF8, i);
   3521             }
   3522 
   3523             int32_t  end = groupEnds.elementAti(i);
   3524             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3525             if (end >= 0) {
   3526                 int32_t  endUTF8;
   3527                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3528                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3529                     failed = TRUE;
   3530                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3531                 }
   3532                 setInt(groupEndsUTF8, endUTF8, i);
   3533             }
   3534         }
   3535     }
   3536 
   3537     if (regionStart>=0) {
   3538        matcher->region(regionStart, regionEnd, status);
   3539        REGEX_CHECK_STATUS_L(line);
   3540        if (UTF8Matcher != NULL) {
   3541            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3542            REGEX_CHECK_STATUS_L(line);
   3543        }
   3544     }
   3545     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3546         matcher->useAnchoringBounds(FALSE);
   3547         if (UTF8Matcher != NULL) {
   3548             UTF8Matcher->useAnchoringBounds(FALSE);
   3549         }
   3550     }
   3551     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3552         matcher->useTransparentBounds(TRUE);
   3553         if (UTF8Matcher != NULL) {
   3554             UTF8Matcher->useTransparentBounds(TRUE);
   3555         }
   3556     }
   3557 
   3558 
   3559 
   3560     //
   3561     // Do a find on the de-tagged input using the caller's pattern
   3562     //     TODO: error on count>1 and not find().
   3563     //           error on both matches() and lookingAt().
   3564     //
   3565     for (i=0; i<numFinds; i++) {
   3566         if (useMatchesFunc) {
   3567             isMatch = matcher->matches(status);
   3568             if (UTF8Matcher != NULL) {
   3569                isUTF8Match = UTF8Matcher->matches(status);
   3570             }
   3571         } else  if (useLookingAtFunc) {
   3572             isMatch = matcher->lookingAt(status);
   3573             if (UTF8Matcher != NULL) {
   3574                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3575             }
   3576         } else {
   3577             isMatch = matcher->find();
   3578             if (UTF8Matcher != NULL) {
   3579                 isUTF8Match = UTF8Matcher->find();
   3580             }
   3581         }
   3582     }
   3583     matcher->setTrace(FALSE);
   3584 
   3585     //
   3586     // Match up the groups from the find() with the groups from the tags
   3587     //
   3588 
   3589     // number of tags should match number of groups from find operation.
   3590     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3591     //   G option in test means that capture group data is not available in the
   3592     //     expected results, so the check needs to be suppressed.
   3593     if (isMatch == FALSE && groupStarts.size() != 0) {
   3594         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3595         failed = TRUE;
   3596         goto cleanupAndReturn;
   3597     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3598         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3599         failed = TRUE;
   3600         goto cleanupAndReturn;
   3601     }
   3602 
   3603     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3604         // Only check for match / no match.  Don't check capture groups.
   3605         if (isMatch && groupStarts.size() == 0) {
   3606             errln("Error at line %d:  No match expected, but one found.", line);
   3607             failed = TRUE;
   3608         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3609             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3610             failed = TRUE;
   3611         }
   3612         goto cleanupAndReturn;
   3613     }
   3614 
   3615     REGEX_CHECK_STATUS_L(line);
   3616     for (i=0; i<=matcher->groupCount(); i++) {
   3617         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3618         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3619         if (matcher->start(i, status) != expectedStart) {
   3620             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3621                 line, i, expectedStart, matcher->start(i, status));
   3622             failed = TRUE;
   3623             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3624         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3625             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3626                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3627             failed = TRUE;
   3628             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3629         }
   3630 
   3631         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3632         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3633         if (matcher->end(i, status) != expectedEnd) {
   3634             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3635                 line, i, expectedEnd, matcher->end(i, status));
   3636             failed = TRUE;
   3637             // Error on end position;  keep going; real error is probably yet to come as group
   3638             //   end positions work from end of the input data towards the front.
   3639         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3640             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3641                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3642             failed = TRUE;
   3643             // Error on end position;  keep going; real error is probably yet to come as group
   3644             //   end positions work from end of the input data towards the front.
   3645         }
   3646     }
   3647     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3648         errln("Error at line %d: Expected %d capture groups, found %d.",
   3649             line, groupStarts.size()-1, matcher->groupCount());
   3650         failed = TRUE;
   3651         }
   3652     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3653         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3654               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3655         failed = TRUE;
   3656     }
   3657 
   3658     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3659         matcher->requireEnd() == TRUE) {
   3660         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3661         failed = TRUE;
   3662     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3663         UTF8Matcher->requireEnd() == TRUE) {
   3664         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3665         failed = TRUE;
   3666     }
   3667 
   3668     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3669         matcher->requireEnd() == FALSE) {
   3670         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3671         failed = TRUE;
   3672     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3673         UTF8Matcher->requireEnd() == FALSE) {
   3674         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3675         failed = TRUE;
   3676     }
   3677 
   3678     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3679         matcher->hitEnd() == TRUE) {
   3680         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3681         failed = TRUE;
   3682     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3683                UTF8Matcher->hitEnd() == TRUE) {
   3684         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3685         failed = TRUE;
   3686     }
   3687 
   3688     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3689         matcher->hitEnd() == FALSE) {
   3690         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3691         failed = TRUE;
   3692     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3693                UTF8Matcher->hitEnd() == FALSE) {
   3694         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3695         failed = TRUE;
   3696     }
   3697 
   3698 
   3699 cleanupAndReturn:
   3700     if (failed) {
   3701         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3702             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3703         // callerPattern->dump();
   3704     }
   3705     delete parseMatcher;
   3706     delete parsePat;
   3707     delete UTF8Matcher;
   3708     delete UTF8Pattern;
   3709     delete matcher;
   3710     delete callerPattern;
   3711 
   3712     utext_close(&inputText);
   3713     delete[] inputChars;
   3714     utext_close(&patternText);
   3715     delete[] patternChars;
   3716     ucnv_close(UTF8Converter);
   3717 }
   3718 
   3719 
   3720 
   3721 
   3722 //---------------------------------------------------------------------------
   3723 //
   3724 //      Errors     Check for error handling in patterns.
   3725 //
   3726 //---------------------------------------------------------------------------
   3727 void RegexTest::Errors() {
   3728     // \escape sequences that aren't implemented yet.
   3729     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3730 
   3731     // Missing close parentheses
   3732     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3733     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3734     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3735 
   3736     // Extra close paren
   3737     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3738     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3739     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3740 
   3741     // Look-ahead, Look-behind
   3742     //  TODO:  add tests for unbounded length look-behinds.
   3743     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3744 
   3745     // Attempt to use non-default flags
   3746     {
   3747         UParseError   pe;
   3748         UErrorCode    status = U_ZERO_ERROR;
   3749         int32_t       flags  = UREGEX_CANON_EQ |
   3750                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3751                                UREGEX_MULTILINE;
   3752         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3753         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3754         delete pat1;
   3755     }
   3756 
   3757 
   3758     // Quantifiers are allowed only after something that can be quantified.
   3759     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3760     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3761     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3762 
   3763     // Mal-formed {min,max} quantifiers
   3764     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3765     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3766     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3767     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3768     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3769     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3770     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3771     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3772     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3773 
   3774     // Ticket 5389
   3775     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3776 
   3777     // Invalid Back Reference \0
   3778     //    For ICU 3.8 and earlier
   3779     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3780     //
   3781     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3782 
   3783 }
   3784 
   3785 
   3786 //-------------------------------------------------------------------------------
   3787 //
   3788 //  Read a text data file, convert it to UChars, and return the data
   3789 //    in one big UChar * buffer, which the caller must delete.
   3790 //
   3791 //--------------------------------------------------------------------------------
   3792 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3793                                      const char *defEncoding, UErrorCode &status) {
   3794     UChar       *retPtr  = NULL;
   3795     char        *fileBuf = NULL;
   3796     UConverter* conv     = NULL;
   3797     FILE        *f       = NULL;
   3798 
   3799     ulen = 0;
   3800     if (U_FAILURE(status)) {
   3801         return retPtr;
   3802     }
   3803 
   3804     //
   3805     //  Open the file.
   3806     //
   3807     f = fopen(fileName, "rb");
   3808     if (f == 0) {
   3809         dataerrln("Error opening test data file %s\n", fileName);
   3810         status = U_FILE_ACCESS_ERROR;
   3811         return NULL;
   3812     }
   3813     //
   3814     //  Read it in
   3815     //
   3816     int32_t            fileSize;
   3817     int32_t            amt_read;
   3818 
   3819     fseek( f, 0, SEEK_END);
   3820     fileSize = ftell(f);
   3821     fileBuf = new char[fileSize];
   3822     fseek(f, 0, SEEK_SET);
   3823     amt_read = fread(fileBuf, 1, fileSize, f);
   3824     if (amt_read != fileSize || fileSize <= 0) {
   3825         errln("Error reading test data file.");
   3826         goto cleanUpAndReturn;
   3827     }
   3828 
   3829     //
   3830     // Look for a Unicode Signature (BOM) on the data just read
   3831     //
   3832     int32_t        signatureLength;
   3833     const char *   fileBufC;
   3834     const char*    encoding;
   3835 
   3836     fileBufC = fileBuf;
   3837     encoding = ucnv_detectUnicodeSignature(
   3838         fileBuf, fileSize, &signatureLength, &status);
   3839     if(encoding!=NULL ){
   3840         fileBufC  += signatureLength;
   3841         fileSize  -= signatureLength;
   3842     } else {
   3843         encoding = defEncoding;
   3844         if (strcmp(encoding, "utf-8") == 0) {
   3845             errln("file %s is missing its BOM", fileName);
   3846         }
   3847     }
   3848 
   3849     //
   3850     // Open a converter to take the rule file to UTF-16
   3851     //
   3852     conv = ucnv_open(encoding, &status);
   3853     if (U_FAILURE(status)) {
   3854         goto cleanUpAndReturn;
   3855     }
   3856 
   3857     //
   3858     // Convert the rules to UChar.
   3859     //  Preflight first to determine required buffer size.
   3860     //
   3861     ulen = ucnv_toUChars(conv,
   3862         NULL,           //  dest,
   3863         0,              //  destCapacity,
   3864         fileBufC,
   3865         fileSize,
   3866         &status);
   3867     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3868         // Buffer Overflow is expected from the preflight operation.
   3869         status = U_ZERO_ERROR;
   3870 
   3871         retPtr = new UChar[ulen+1];
   3872         ucnv_toUChars(conv,
   3873             retPtr,       //  dest,
   3874             ulen+1,
   3875             fileBufC,
   3876             fileSize,
   3877             &status);
   3878     }
   3879 
   3880 cleanUpAndReturn:
   3881     fclose(f);
   3882     delete[] fileBuf;
   3883     ucnv_close(conv);
   3884     if (U_FAILURE(status)) {
   3885         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3886         delete []retPtr;
   3887         retPtr = 0;
   3888         ulen   = 0;
   3889     };
   3890     return retPtr;
   3891 }
   3892 
   3893 
   3894 //-------------------------------------------------------------------------------
   3895 //
   3896 //   PerlTests  - Run Perl's regular expression tests
   3897 //                The input file for this test is re_tests, the standard regular
   3898 //                expression test data distributed with the Perl source code.
   3899 //
   3900 //                Here is Perl's description of the test data file:
   3901 //
   3902 //        # The tests are in a separate file 't/op/re_tests'.
   3903 //        # Each line in that file is a separate test.
   3904 //        # There are five columns, separated by tabs.
   3905 //        #
   3906 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3907 //        # Modifiers can be put after the closing C<'>.
   3908 //        #
   3909 //        # Column 2 contains the string to be matched.
   3910 //        #
   3911 //        # Column 3 contains the expected result:
   3912 //        #     y   expect a match
   3913 //        #     n   expect no match
   3914 //        #     c   expect an error
   3915 //        # B   test exposes a known bug in Perl, should be skipped
   3916 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3917 //        #
   3918 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3919 //        #
   3920 //        # Column 4 contains a string, usually C<$&>.
   3921 //        #
   3922 //        # Column 5 contains the expected result of double-quote
   3923 //        # interpolating that string after the match, or start of error message.
   3924 //        #
   3925 //        # Column 6, if present, contains a reason why the test is skipped.
   3926 //        # This is printed with "skipped", for harness to pick up.
   3927 //        #
   3928 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3929 //        #
   3930 //        # If you want to add a regular expression test that can't be expressed
   3931 //        # in this format, don't add it here: put it in op/pat.t instead.
   3932 //
   3933 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   //        Such a test exposes some known incompatibility between ICU and Perl regexps.
   3935 //        (The i is in addition to whatever was there before.)
   3936 //
   3937 //-------------------------------------------------------------------------------
   3938 void RegexTest::PerlTests() {
   3939     char tdd[2048];
   3940     const char *srcPath;
   3941     UErrorCode  status = U_ZERO_ERROR;
   3942     UParseError pe;
   3943 
   3944     //
   3945     //  Open and read the test data file.
   3946     //
   3947     srcPath=getPath(tdd, "re_tests.txt");
   3948     if(srcPath==NULL) {
   3949         return; /* something went wrong, error already output */
   3950     }
   3951 
   3952     int32_t    len;
   3953     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3954     if (U_FAILURE(status)) {
   3955         return; /* something went wrong, error already output */
   3956     }
   3957 
   3958     //
   3959     //  Put the test data into a UnicodeString
   3960     //
   3961     UnicodeString testDataString(FALSE, testData, len);
   3962 
   3963     //
   3964     //  Regex to break the input file into lines, and strip the new lines.
   3965     //     One line per match, capture group one is the desired data.
   3966     //
   3967     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3968     if (U_FAILURE(status)) {
   3969         dataerrln("RegexPattern::compile() error");
   3970         return;
   3971     }
   3972     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3973 
   3974     //
   3975     //  Regex to split a test file line into fields.
   3976     //    There are six fields, separated by tabs.
   3977     //
   3978     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3979 
   3980     //
   3981     //  Regex to identify test patterns with flag settings, and to separate them.
   3982     //    Test patterns with flags look like 'pattern'i
   3983     //    Test patterns without flags are not quoted:   pattern
   3984     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3985     //
   3986     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3987     RegexMatcher* flagMat = flagPat->matcher(status);
   3988 
   3989     //
   3990     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3991     //   in the test data.  Not being perl, this must be done explicitly.  Here
   3992     //   are string constants and REs for these constructs.
   3993     //
   3994     UnicodeString nulnulSrc("${nulnul}");
   3995     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   3996     nulnul = nulnul.unescape();
   3997 
   3998     UnicodeString ffffSrc("${ffff}");
   3999     UnicodeString ffff("\\uffff", -1, US_INV);
   4000     ffff = ffff.unescape();
   4001 
   4002     //  regexp for $-[0], $+[2], etc.
   4003     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4004     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4005 
   4006     //  regexp for $0, $1, $2, etc.
   4007     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4008     RegexMatcher *cgMat = cgPat->matcher(status);
   4009 
   4010 
   4011     //
   4012     // Main Loop for the Perl Tests, runs once per line from the
   4013     //   test data file.
   4014     //
   4015     int32_t  lineNum = 0;
   4016     int32_t  skippedUnimplementedCount = 0;
   4017     while (lineMat->find()) {
   4018         lineNum++;
   4019 
   4020         //
   4021         //  Get a line, break it into its fields, do the Perl
   4022         //    variable substitutions.
   4023         //
   4024         UnicodeString line = lineMat->group(1, status);
   4025         UnicodeString fields[7];
   4026         fieldPat->split(line, fields, 7, status);
   4027 
   4028         flagMat->reset(fields[0]);
   4029         flagMat->matches(status);
   4030         UnicodeString pattern  = flagMat->group(2, status);
   4031         pattern.findAndReplace("${bang}", "!");
   4032         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4033         pattern.findAndReplace(ffffSrc, ffff);
   4034 
   4035         //
   4036         //  Identify patterns that include match flag settings,
   4037         //    split off the flags, remove the extra quotes.
   4038         //
   4039         UnicodeString flagStr = flagMat->group(3, status);
   4040         if (U_FAILURE(status)) {
   4041             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4042             return;
   4043         }
   4044         int32_t flags = 0;
   4045         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4046         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4047         const UChar UChar_m = 0x6d;
   4048         const UChar UChar_x = 0x78;
   4049         const UChar UChar_y = 0x79;
   4050         if (flagStr.indexOf(UChar_i) != -1) {
   4051             flags |= UREGEX_CASE_INSENSITIVE;
   4052         }
   4053         if (flagStr.indexOf(UChar_m) != -1) {
   4054             flags |= UREGEX_MULTILINE;
   4055         }
   4056         if (flagStr.indexOf(UChar_x) != -1) {
   4057             flags |= UREGEX_COMMENTS;
   4058         }
   4059 
   4060         //
   4061         // Compile the test pattern.
   4062         //
   4063         status = U_ZERO_ERROR;
   4064         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4065         if (status == U_REGEX_UNIMPLEMENTED) {
   4066             //
   4067             // Test of a feature that is planned for ICU, but not yet implemented.
   4068             //   skip the test.
   4069             skippedUnimplementedCount++;
   4070             delete testPat;
   4071             status = U_ZERO_ERROR;
   4072             continue;
   4073         }
   4074 
   4075         if (U_FAILURE(status)) {
   4076             // Some tests are supposed to generate errors.
   4077             //   Only report an error for tests that are supposed to succeed.
   4078             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4079                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4080             {
   4081                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4082             }
   4083             status = U_ZERO_ERROR;
   4084             delete testPat;
   4085             continue;
   4086         }
   4087 
   4088         if (fields[2].indexOf(UChar_i) >= 0) {
   4089             // ICU should skip this test.
   4090             delete testPat;
   4091             continue;
   4092         }
   4093 
   4094         if (fields[2].indexOf(UChar_c) >= 0) {
   4095             // This pattern should have caused a compilation error, but didn't/
   4096             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4097             delete testPat;
   4098             continue;
   4099         }
   4100 
   4101         //
   4102         // replace the Perl variables that appear in some of the
   4103         //   match data strings.
   4104         //
   4105         UnicodeString matchString = fields[1];
   4106         matchString.findAndReplace(nulnulSrc, nulnul);
   4107         matchString.findAndReplace(ffffSrc,   ffff);
   4108 
   4109         // Replace any \n in the match string with an actual new-line char.
   4110         //  Don't do full unescape, as this unescapes more than Perl does, which
   4111         //  causes other spurious failures in the tests.
   4112         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4113 
   4114 
   4115 
   4116         //
   4117         // Run the test, check for expected match/don't match result.
   4118         //
   4119         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4120         UBool found = testMat->find();
   4121         UBool expected = FALSE;
   4122         if (fields[2].indexOf(UChar_y) >=0) {
   4123             expected = TRUE;
   4124         }
   4125         if (expected != found) {
   4126             errln("line %d: Expected %smatch, got %smatch",
   4127                 lineNum, expected?"":"no ", found?"":"no " );
   4128             continue;
   4129         }
   4130 
   4131         // Don't try to check expected results if there is no match.
   4132         //   (Some have stuff in the expected fields)
   4133         if (!found) {
   4134             delete testMat;
   4135             delete testPat;
   4136             continue;
   4137         }
   4138 
   4139         //
   4140         // Interpret the Perl expression from the fourth field of the data file,
   4141         // building up an ICU string from the results of the ICU match.
   4142         //   The Perl expression will contain references to the results of
   4143         //     a regex match, including the matched string, capture group strings,
   4144         //     group starting and ending indicies, etc.
   4145         //
   4146         UnicodeString resultString;
   4147         UnicodeString perlExpr = fields[3];
   4148 #if SUPPORT_MUTATING_INPUT_STRING
   4149         groupsMat->reset(perlExpr);
   4150         cgMat->reset(perlExpr);
   4151 #endif
   4152 
   4153         while (perlExpr.length() > 0) {
   4154 #if !SUPPORT_MUTATING_INPUT_STRING
   4155             //  Perferred usage.  Reset after any modification to input string.
   4156             groupsMat->reset(perlExpr);
   4157             cgMat->reset(perlExpr);
   4158 #endif
   4159 
   4160             if (perlExpr.startsWith("$&")) {
   4161                 resultString.append(testMat->group(status));
   4162                 perlExpr.remove(0, 2);
   4163             }
   4164 
   4165             else if (groupsMat->lookingAt(status)) {
   4166                 // $-[0]   $+[2]  etc.
   4167                 UnicodeString digitString = groupsMat->group(2, status);
   4168                 int32_t t = 0;
   4169                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4170                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4171                 int32_t matchPosition;
   4172                 if (plusOrMinus.compare("+") == 0) {
   4173                     matchPosition = testMat->end(groupNum, status);
   4174                 } else {
   4175                     matchPosition = testMat->start(groupNum, status);
   4176                 }
   4177                 if (matchPosition != -1) {
   4178                     ICU_Utility::appendNumber(resultString, matchPosition);
   4179                 }
   4180                 perlExpr.remove(0, groupsMat->end(status));
   4181             }
   4182 
   4183             else if (cgMat->lookingAt(status)) {
   4184                 // $1, $2, $3, etc.
   4185                 UnicodeString digitString = cgMat->group(1, status);
   4186                 int32_t t = 0;
   4187                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4188                 if (U_SUCCESS(status)) {
   4189                     resultString.append(testMat->group(groupNum, status));
   4190                     status = U_ZERO_ERROR;
   4191                 }
   4192                 perlExpr.remove(0, cgMat->end(status));
   4193             }
   4194 
   4195             else if (perlExpr.startsWith("@-")) {
   4196                 int32_t i;
   4197                 for (i=0; i<=testMat->groupCount(); i++) {
   4198                     if (i>0) {
   4199                         resultString.append(" ");
   4200                     }
   4201                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4202                 }
   4203                 perlExpr.remove(0, 2);
   4204             }
   4205 
   4206             else if (perlExpr.startsWith("@+")) {
   4207                 int32_t i;
   4208                 for (i=0; i<=testMat->groupCount(); i++) {
   4209                     if (i>0) {
   4210                         resultString.append(" ");
   4211                     }
   4212                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4213                 }
   4214                 perlExpr.remove(0, 2);
   4215             }
   4216 
   4217             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4218                                                      //           or as an escaped sequence (e.g. \n)
   4219                 if (perlExpr.length() > 1) {
   4220                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4221                 }
   4222                 UChar c = perlExpr.charAt(0);
   4223                 switch (c) {
   4224                 case 'n':   c = '\n'; break;
   4225                 // add any other escape sequences that show up in the test expected results.
   4226                 }
   4227                 resultString.append(c);
   4228                 perlExpr.remove(0, 1);
   4229             }
   4230 
   4231             else  {
   4232                 // Any characters from the perl expression that we don't explicitly
   4233                 //  recognize before here are assumed to be literals and copied
   4234                 //  as-is to the expected results.
   4235                 resultString.append(perlExpr.charAt(0));
   4236                 perlExpr.remove(0, 1);
   4237             }
   4238 
   4239             if (U_FAILURE(status)) {
   4240                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4241                 break;
   4242             }
   4243         }
   4244 
   4245         //
   4246         // Expected Results Compare
   4247         //
   4248         UnicodeString expectedS(fields[4]);
   4249         expectedS.findAndReplace(nulnulSrc, nulnul);
   4250         expectedS.findAndReplace(ffffSrc,   ffff);
   4251         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4252 
   4253 
   4254         if (expectedS.compare(resultString) != 0) {
   4255             err("Line %d: Incorrect perl expression results.", lineNum);
   4256             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4257         }
   4258 
   4259         delete testMat;
   4260         delete testPat;
   4261     }
   4262 
   4263     //
   4264     // All done.  Clean up allocated stuff.
   4265     //
   4266     delete cgMat;
   4267     delete cgPat;
   4268 
   4269     delete groupsMat;
   4270     delete groupsPat;
   4271 
   4272     delete flagMat;
   4273     delete flagPat;
   4274 
   4275     delete lineMat;
   4276     delete linePat;
   4277 
   4278     delete fieldPat;
   4279     delete [] testData;
   4280 
   4281 
   4282     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4283 
   4284 }
   4285 
   4286 
   4287 //-------------------------------------------------------------------------------
   4288 //
   4289 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4290 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4291 //                  The input file for this test is re_tests, the standard regular
   4292 //                  expression test data distributed with the Perl source code.
   4293 //                  See PerlTests() for more information.
   4294 //
   4295 //-------------------------------------------------------------------------------
   4296 void RegexTest::PerlTestsUTF8() {
   4297     char tdd[2048];
   4298     const char *srcPath;
   4299     UErrorCode  status = U_ZERO_ERROR;
   4300     UParseError pe;
   4301     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4302     UText       patternText = UTEXT_INITIALIZER;
   4303     char       *patternChars = NULL;
   4304     int32_t     patternLength;
   4305     int32_t     patternCapacity = 0;
   4306     UText       inputText = UTEXT_INITIALIZER;
   4307     char       *inputChars = NULL;
   4308     int32_t     inputLength;
   4309     int32_t     inputCapacity = 0;
   4310 
   4311     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4312 
   4313     //
   4314     //  Open and read the test data file.
   4315     //
   4316     srcPath=getPath(tdd, "re_tests.txt");
   4317     if(srcPath==NULL) {
   4318         return; /* something went wrong, error already output */
   4319     }
   4320 
   4321     int32_t    len;
   4322     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4323     if (U_FAILURE(status)) {
   4324         return; /* something went wrong, error already output */
   4325     }
   4326 
   4327     //
   4328     //  Put the test data into a UnicodeString
   4329     //
   4330     UnicodeString testDataString(FALSE, testData, len);
   4331 
   4332     //
   4333     //  Regex to break the input file into lines, and strip the new lines.
   4334     //     One line per match, capture group one is the desired data.
   4335     //
   4336     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4337     if (U_FAILURE(status)) {
   4338         dataerrln("RegexPattern::compile() error");
   4339         return;
   4340     }
   4341     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4342 
   4343     //
   4344     //  Regex to split a test file line into fields.
   4345     //    There are six fields, separated by tabs.
   4346     //
   4347     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4348 
   4349     //
   4350     //  Regex to identify test patterns with flag settings, and to separate them.
   4351     //    Test patterns with flags look like 'pattern'i
   4352     //    Test patterns without flags are not quoted:   pattern
   4353     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4354     //
   4355     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4356     RegexMatcher* flagMat = flagPat->matcher(status);
   4357 
   4358     //
   4359     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4360     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4361     //   are string constants and REs for these constructs.
   4362     //
   4363     UnicodeString nulnulSrc("${nulnul}");
   4364     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4365     nulnul = nulnul.unescape();
   4366 
   4367     UnicodeString ffffSrc("${ffff}");
   4368     UnicodeString ffff("\\uffff", -1, US_INV);
   4369     ffff = ffff.unescape();
   4370 
   4371     //  regexp for $-[0], $+[2], etc.
   4372     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4373     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4374 
   4375     //  regexp for $0, $1, $2, etc.
   4376     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4377     RegexMatcher *cgMat = cgPat->matcher(status);
   4378 
   4379 
   4380     //
   4381     // Main Loop for the Perl Tests, runs once per line from the
   4382     //   test data file.
   4383     //
   4384     int32_t  lineNum = 0;
   4385     int32_t  skippedUnimplementedCount = 0;
   4386     while (lineMat->find()) {
   4387         lineNum++;
   4388 
   4389         //
   4390         //  Get a line, break it into its fields, do the Perl
   4391         //    variable substitutions.
   4392         //
   4393         UnicodeString line = lineMat->group(1, status);
   4394         UnicodeString fields[7];
   4395         fieldPat->split(line, fields, 7, status);
   4396 
   4397         flagMat->reset(fields[0]);
   4398         flagMat->matches(status);
   4399         UnicodeString pattern  = flagMat->group(2, status);
   4400         pattern.findAndReplace("${bang}", "!");
   4401         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4402         pattern.findAndReplace(ffffSrc, ffff);
   4403 
   4404         //
   4405         //  Identify patterns that include match flag settings,
   4406         //    split off the flags, remove the extra quotes.
   4407         //
   4408         UnicodeString flagStr = flagMat->group(3, status);
   4409         if (U_FAILURE(status)) {
   4410             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4411             return;
   4412         }
   4413         int32_t flags = 0;
   4414         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4415         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4416         const UChar UChar_m = 0x6d;
   4417         const UChar UChar_x = 0x78;
   4418         const UChar UChar_y = 0x79;
   4419         if (flagStr.indexOf(UChar_i) != -1) {
   4420             flags |= UREGEX_CASE_INSENSITIVE;
   4421         }
   4422         if (flagStr.indexOf(UChar_m) != -1) {
   4423             flags |= UREGEX_MULTILINE;
   4424         }
   4425         if (flagStr.indexOf(UChar_x) != -1) {
   4426             flags |= UREGEX_COMMENTS;
   4427         }
   4428 
   4429         //
   4430         // Put the pattern in a UTF-8 UText
   4431         //
   4432         status = U_ZERO_ERROR;
   4433         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4434         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4435             status = U_ZERO_ERROR;
   4436             delete[] patternChars;
   4437             patternCapacity = patternLength + 1;
   4438             patternChars = new char[patternCapacity];
   4439             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4440         }
   4441         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4442 
   4443         //
   4444         // Compile the test pattern.
   4445         //
   4446         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4447         if (status == U_REGEX_UNIMPLEMENTED) {
   4448             //
   4449             // Test of a feature that is planned for ICU, but not yet implemented.
   4450             //   skip the test.
   4451             skippedUnimplementedCount++;
   4452             delete testPat;
   4453             status = U_ZERO_ERROR;
   4454             continue;
   4455         }
   4456 
   4457         if (U_FAILURE(status)) {
   4458             // Some tests are supposed to generate errors.
   4459             //   Only report an error for tests that are supposed to succeed.
   4460             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4461                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4462             {
   4463                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4464             }
   4465             status = U_ZERO_ERROR;
   4466             delete testPat;
   4467             continue;
   4468         }
   4469 
   4470         if (fields[2].indexOf(UChar_i) >= 0) {
   4471             // ICU should skip this test.
   4472             delete testPat;
   4473             continue;
   4474         }
   4475 
   4476         if (fields[2].indexOf(UChar_c) >= 0) {
   4477             // This pattern should have caused a compilation error, but didn't/
   4478             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4479             delete testPat;
   4480             continue;
   4481         }
   4482 
   4483 
   4484         //
   4485         // replace the Perl variables that appear in some of the
   4486         //   match data strings.
   4487         //
   4488         UnicodeString matchString = fields[1];
   4489         matchString.findAndReplace(nulnulSrc, nulnul);
   4490         matchString.findAndReplace(ffffSrc,   ffff);
   4491 
   4492         // Replace any \n in the match string with an actual new-line char.
   4493         //  Don't do full unescape, as this unescapes more than Perl does, which
   4494         //  causes other spurious failures in the tests.
   4495         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4496 
   4497         //
   4498         // Put the input in a UTF-8 UText
   4499         //
   4500         status = U_ZERO_ERROR;
   4501         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4502         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4503             status = U_ZERO_ERROR;
   4504             delete[] inputChars;
   4505             inputCapacity = inputLength + 1;
   4506             inputChars = new char[inputCapacity];
   4507             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4508         }
   4509         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4510 
   4511         //
   4512         // Run the test, check for expected match/don't match result.
   4513         //
   4514         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4515         UBool found = testMat->find();
   4516         UBool expected = FALSE;
   4517         if (fields[2].indexOf(UChar_y) >=0) {
   4518             expected = TRUE;
   4519         }
   4520         if (expected != found) {
   4521             errln("line %d: Expected %smatch, got %smatch",
   4522                 lineNum, expected?"":"no ", found?"":"no " );
   4523             continue;
   4524         }
   4525 
   4526         // Don't try to check expected results if there is no match.
   4527         //   (Some have stuff in the expected fields)
   4528         if (!found) {
   4529             delete testMat;
   4530             delete testPat;
   4531             continue;
   4532         }
   4533 
   4534         //
   4535         // Interpret the Perl expression from the fourth field of the data file,
   4536         // building up an ICU string from the results of the ICU match.
   4537         //   The Perl expression will contain references to the results of
   4538         //     a regex match, including the matched string, capture group strings,
   4539         //     group starting and ending indicies, etc.
   4540         //
   4541         UnicodeString resultString;
   4542         UnicodeString perlExpr = fields[3];
   4543 
   4544         while (perlExpr.length() > 0) {
   4545             groupsMat->reset(perlExpr);
   4546             cgMat->reset(perlExpr);
   4547 
   4548             if (perlExpr.startsWith("$&")) {
   4549                 resultString.append(testMat->group(status));
   4550                 perlExpr.remove(0, 2);
   4551             }
   4552 
   4553             else if (groupsMat->lookingAt(status)) {
   4554                 // $-[0]   $+[2]  etc.
   4555                 UnicodeString digitString = groupsMat->group(2, status);
   4556                 int32_t t = 0;
   4557                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4558                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4559                 int32_t matchPosition;
   4560                 if (plusOrMinus.compare("+") == 0) {
   4561                     matchPosition = testMat->end(groupNum, status);
   4562                 } else {
   4563                     matchPosition = testMat->start(groupNum, status);
   4564                 }
   4565                 if (matchPosition != -1) {
   4566                     ICU_Utility::appendNumber(resultString, matchPosition);
   4567                 }
   4568                 perlExpr.remove(0, groupsMat->end(status));
   4569             }
   4570 
   4571             else if (cgMat->lookingAt(status)) {
   4572                 // $1, $2, $3, etc.
   4573                 UnicodeString digitString = cgMat->group(1, status);
   4574                 int32_t t = 0;
   4575                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4576                 if (U_SUCCESS(status)) {
   4577                     resultString.append(testMat->group(groupNum, status));
   4578                     status = U_ZERO_ERROR;
   4579                 }
   4580                 perlExpr.remove(0, cgMat->end(status));
   4581             }
   4582 
   4583             else if (perlExpr.startsWith("@-")) {
   4584                 int32_t i;
   4585                 for (i=0; i<=testMat->groupCount(); i++) {
   4586                     if (i>0) {
   4587                         resultString.append(" ");
   4588                     }
   4589                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4590                 }
   4591                 perlExpr.remove(0, 2);
   4592             }
   4593 
   4594             else if (perlExpr.startsWith("@+")) {
   4595                 int32_t i;
   4596                 for (i=0; i<=testMat->groupCount(); i++) {
   4597                     if (i>0) {
   4598                         resultString.append(" ");
   4599                     }
   4600                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4601                 }
   4602                 perlExpr.remove(0, 2);
   4603             }
   4604 
   4605             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4606                                                      //           or as an escaped sequence (e.g. \n)
   4607                 if (perlExpr.length() > 1) {
   4608                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4609                 }
   4610                 UChar c = perlExpr.charAt(0);
   4611                 switch (c) {
   4612                 case 'n':   c = '\n'; break;
   4613                 // add any other escape sequences that show up in the test expected results.
   4614                 }
   4615                 resultString.append(c);
   4616                 perlExpr.remove(0, 1);
   4617             }
   4618 
   4619             else  {
   4620                 // Any characters from the perl expression that we don't explicitly
   4621                 //  recognize before here are assumed to be literals and copied
   4622                 //  as-is to the expected results.
   4623                 resultString.append(perlExpr.charAt(0));
   4624                 perlExpr.remove(0, 1);
   4625             }
   4626 
   4627             if (U_FAILURE(status)) {
   4628                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4629                 break;
   4630             }
   4631         }
   4632 
   4633         //
   4634         // Expected Results Compare
   4635         //
   4636         UnicodeString expectedS(fields[4]);
   4637         expectedS.findAndReplace(nulnulSrc, nulnul);
   4638         expectedS.findAndReplace(ffffSrc,   ffff);
   4639         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4640 
   4641 
   4642         if (expectedS.compare(resultString) != 0) {
   4643             err("Line %d: Incorrect perl expression results.", lineNum);
   4644             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4645         }
   4646 
   4647         delete testMat;
   4648         delete testPat;
   4649     }
   4650 
   4651     //
   4652     // All done.  Clean up allocated stuff.
   4653     //
   4654     delete cgMat;
   4655     delete cgPat;
   4656 
   4657     delete groupsMat;
   4658     delete groupsPat;
   4659 
   4660     delete flagMat;
   4661     delete flagPat;
   4662 
   4663     delete lineMat;
   4664     delete linePat;
   4665 
   4666     delete fieldPat;
   4667     delete [] testData;
   4668 
   4669     utext_close(&patternText);
   4670     utext_close(&inputText);
   4671 
   4672     delete [] patternChars;
   4673     delete [] inputChars;
   4674 
   4675 
   4676     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4677 
   4678 }
   4679 
   4680 
   4681 //--------------------------------------------------------------
   4682 //
   4683 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4684 //             Use this pattern,
   4685 //                 "(a?){1,}"
   4686 //             The zero-length match will repeat forever.
   4687 //                (That this goes into a loop is another bug)
   4688 //
   4689 //---------------------------------------------------------------
   4690 void RegexTest::Bug6149() {
   4691     UnicodeString pattern("(a?){1,}");
   4692     UnicodeString s("xyz");
   4693     uint32_t flags = 0;
   4694     UErrorCode status = U_ZERO_ERROR;
   4695 
   4696     RegexMatcher  matcher(pattern, s, flags, status);
   4697     UBool result = false;
   4698     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4699     REGEX_ASSERT(result == FALSE);
   4700  }
   4701 
   4702 
   4703 //
   4704 //   Callbacks()    Test the callback function.
   4705 //                  When set, callbacks occur periodically during matching operations,
   4706 //                  giving the application code the ability to abort the operation
   4707 //                  before it's normal completion.
   4708 //
   4709 
   4710 struct callBackContext {
   4711     RegexTest        *test;
   4712     int32_t          maxCalls;
   4713     int32_t          numCalls;
   4714     int32_t          lastSteps;
   4715     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4716 };
   4717 
   4718 U_CDECL_BEGIN
   4719 static UBool U_CALLCONV
   4720 testCallBackFn(const void *context, int32_t steps) {
   4721     callBackContext  *info = (callBackContext *)context;
   4722     if (info->lastSteps+1 != steps) {
   4723         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4724     }
   4725     info->lastSteps = steps;
   4726     info->numCalls++;
   4727     return (info->numCalls < info->maxCalls);
   4728 }
   4729 U_CDECL_END
   4730 
   4731 void RegexTest::Callbacks() {
   4732    {
   4733         // Getter returns NULLs if no callback has been set
   4734 
   4735         //   The variables that the getter will fill in.
   4736         //   Init to non-null values so that the action of the getter can be seen.
   4737         const void          *returnedContext = &returnedContext;
   4738         URegexMatchCallback *returnedFn = &testCallBackFn;
   4739 
   4740         UErrorCode status = U_ZERO_ERROR;
   4741         RegexMatcher matcher("x", 0, status);
   4742         REGEX_CHECK_STATUS;
   4743         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4744         REGEX_CHECK_STATUS;
   4745         REGEX_ASSERT(returnedFn == NULL);
   4746         REGEX_ASSERT(returnedContext == NULL);
   4747     }
   4748 
   4749    {
   4750         // Set and Get work
   4751         callBackContext cbInfo = {this, 0, 0, 0};
   4752         const void          *returnedContext;
   4753         URegexMatchCallback *returnedFn;
   4754         UErrorCode status = U_ZERO_ERROR;
   4755         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4756         REGEX_CHECK_STATUS;
   4757         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4758         REGEX_CHECK_STATUS;
   4759         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4760         REGEX_CHECK_STATUS;
   4761         REGEX_ASSERT(returnedFn == testCallBackFn);
   4762         REGEX_ASSERT(returnedContext == &cbInfo);
   4763 
   4764         // A short-running match shouldn't invoke the callback
   4765         status = U_ZERO_ERROR;
   4766         cbInfo.reset(1);
   4767         UnicodeString s = "xxx";
   4768         matcher.reset(s);
   4769         REGEX_ASSERT(matcher.matches(status));
   4770         REGEX_CHECK_STATUS;
   4771         REGEX_ASSERT(cbInfo.numCalls == 0);
   4772 
   4773         // A medium-length match that runs long enough to invoke the
   4774         //   callback, but not so long that the callback aborts it.
   4775         status = U_ZERO_ERROR;
   4776         cbInfo.reset(4);
   4777         s = "aaaaaaaaaaaaaaaaaaab";
   4778         matcher.reset(s);
   4779         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4780         REGEX_CHECK_STATUS;
   4781         REGEX_ASSERT(cbInfo.numCalls > 0);
   4782 
   4783         // A longer running match that the callback function will abort.
   4784         status = U_ZERO_ERROR;
   4785         cbInfo.reset(4);
   4786         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4787         matcher.reset(s);
   4788         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4789         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4790         REGEX_ASSERT(cbInfo.numCalls == 4);
   4791     }
   4792 
   4793 
   4794 }
   4795 
   4796 
   4797 //
   4798 //   FindProgressCallbacks()    Test the find "progress" callback function.
   4799 //                  When set, the find progress callback will be invoked during a find operations
   4800 //                  after each return from a match attempt, giving the application the opportunity
   4801 //                  to terminate a long-running find operation before it's normal completion.
   4802 //
   4803 
   4804 struct progressCallBackContext {
   4805     RegexTest        *test;
   4806     int64_t          lastIndex;
   4807     int32_t          maxCalls;
   4808     int32_t          numCalls;
   4809     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4810 };
   4811 
   4812 U_CDECL_BEGIN
   4813 static UBool U_CALLCONV
   4814 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4815     progressCallBackContext  *info = (progressCallBackContext *)context;
   4816     info->numCalls++;
   4817     info->lastIndex = matchIndex;
   4818 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4819     return (info->numCalls < info->maxCalls);
   4820 }
   4821 U_CDECL_END
   4822 
   4823 void RegexTest::FindProgressCallbacks() {
   4824    {
   4825         // Getter returns NULLs if no callback has been set
   4826 
   4827         //   The variables that the getter will fill in.
   4828         //   Init to non-null values so that the action of the getter can be seen.
   4829         const void                  *returnedContext = &returnedContext;
   4830         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4831 
   4832         UErrorCode status = U_ZERO_ERROR;
   4833         RegexMatcher matcher("x", 0, status);
   4834         REGEX_CHECK_STATUS;
   4835         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4836         REGEX_CHECK_STATUS;
   4837         REGEX_ASSERT(returnedFn == NULL);
   4838         REGEX_ASSERT(returnedContext == NULL);
   4839     }
   4840 
   4841    {
   4842         // Set and Get work
   4843         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4844         const void                  *returnedContext;
   4845         URegexFindProgressCallback  *returnedFn;
   4846         UErrorCode status = U_ZERO_ERROR;
   4847         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4848         REGEX_CHECK_STATUS;
   4849         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4850         REGEX_CHECK_STATUS;
   4851         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4852         REGEX_CHECK_STATUS;
   4853         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4854         REGEX_ASSERT(returnedContext == &cbInfo);
   4855 
   4856         // A short-running match should NOT invoke the callback.
   4857         status = U_ZERO_ERROR;
   4858         cbInfo.reset(100);
   4859         UnicodeString s = "abxxx";
   4860         matcher.reset(s);
   4861 #if 0
   4862         matcher.setTrace(TRUE);
   4863 #endif
   4864         REGEX_ASSERT(matcher.find(0, status));
   4865         REGEX_CHECK_STATUS;
   4866         REGEX_ASSERT(cbInfo.numCalls == 0);
   4867 
   4868         // A medium running match that causes matcher.find() to invoke our callback for each index.
   4869         status = U_ZERO_ERROR;
   4870         s = "aaaaaaaaaaaaaaaaaaab";
   4871         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4872         matcher.reset(s);
   4873         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4874         REGEX_CHECK_STATUS;
   4875         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4876 
   4877         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4878         status = U_ZERO_ERROR;
   4879         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4880         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4881         matcher.reset(s1);
   4882         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4883         REGEX_CHECK_STATUS;
   4884         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4885 
   4886 #if 0
   4887         // Now a match that will succeed, but after an interruption
   4888         status = U_ZERO_ERROR;
   4889         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4890         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4891         matcher.reset(s2);
   4892         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4893         REGEX_CHECK_STATUS;
   4894         // Now retry the match from where left off
   4895         cbInfo.maxCalls = 100; //  No callback limit
   4896         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   4897         REGEX_CHECK_STATUS;
   4898 #endif
   4899     }
   4900 
   4901 
   4902 }
   4903 
   4904 
   4905 //---------------------------------------------------------------------------
   4906 //
   4907 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4908 //                             UTexts. The pure-C implementation of UText
   4909 //                             has no mutable backing stores, but we can
   4910 //                             use UnicodeString here to test the functionality.
   4911 //
   4912 //---------------------------------------------------------------------------
   4913 void RegexTest::PreAllocatedUTextCAPI () {
   4914     UErrorCode           status = U_ZERO_ERROR;
   4915     URegularExpression  *re;
   4916     UText                patternText = UTEXT_INITIALIZER;
   4917     UnicodeString        buffer;
   4918     UText                bufferText = UTEXT_INITIALIZER;
   4919 
   4920     utext_openUnicodeString(&bufferText, &buffer, &status);
   4921 
   4922     /*
   4923      *  getText() and getUText()
   4924      */
   4925     {
   4926         UText  text1 = UTEXT_INITIALIZER;
   4927         UText  text2 = UTEXT_INITIALIZER;
   4928         UChar  text2Chars[20];
   4929         UText  *resultText;
   4930 
   4931         status = U_ZERO_ERROR;
   4932         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   4933         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   4934         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4935         utext_openUChars(&text2, text2Chars, -1, &status);
   4936 
   4937         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   4938         re = uregex_openUText(&patternText, 0, NULL, &status);
   4939 
   4940         /* First set a UText */
   4941         uregex_setUText(re, &text1, &status);
   4942         resultText = uregex_getUText(re, &bufferText, &status);
   4943         REGEX_CHECK_STATUS;
   4944         REGEX_ASSERT(resultText == &bufferText);
   4945         utext_setNativeIndex(resultText, 0);
   4946         utext_setNativeIndex(&text1, 0);
   4947         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4948 
   4949         resultText = uregex_getUText(re, &bufferText, &status);
   4950         REGEX_CHECK_STATUS;
   4951         REGEX_ASSERT(resultText == &bufferText);
   4952         utext_setNativeIndex(resultText, 0);
   4953         utext_setNativeIndex(&text1, 0);
   4954         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4955 
   4956         /* Then set a UChar * */
   4957         uregex_setText(re, text2Chars, 7, &status);
   4958         resultText = uregex_getUText(re, &bufferText, &status);
   4959         REGEX_CHECK_STATUS;
   4960         REGEX_ASSERT(resultText == &bufferText);
   4961         utext_setNativeIndex(resultText, 0);
   4962         utext_setNativeIndex(&text2, 0);
   4963         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   4964 
   4965         uregex_close(re);
   4966         utext_close(&text1);
   4967         utext_close(&text2);
   4968     }
   4969 
   4970     /*
   4971      *  group()
   4972      */
   4973     {
   4974         UChar    text1[80];
   4975         UText   *actual;
   4976         UBool    result;
   4977         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
   4978 
   4979         status = U_ZERO_ERROR;
   4980         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   4981         REGEX_CHECK_STATUS;
   4982 
   4983         uregex_setText(re, text1, -1, &status);
   4984         result = uregex_find(re, 0, &status);
   4985         REGEX_ASSERT(result==TRUE);
   4986 
   4987         /*  Capture Group 0, the full match.  Should succeed.  */
   4988         status = U_ZERO_ERROR;
   4989         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
   4990         REGEX_CHECK_STATUS;
   4991         REGEX_ASSERT(actual == &bufferText);
   4992         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
   4993 
   4994         /*  Capture group #1.  Should succeed. */
   4995         status = U_ZERO_ERROR;
   4996         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
   4997         REGEX_CHECK_STATUS;
   4998         REGEX_ASSERT(actual == &bufferText);
   4999         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
   5000 
   5001         /*  Capture group out of range.  Error. */
   5002         status = U_ZERO_ERROR;
   5003         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
   5004         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5005         REGEX_ASSERT(actual == &bufferText);
   5006 
   5007         uregex_close(re);
   5008 
   5009     }
   5010 
   5011     /*
   5012      *  replaceFirst()
   5013      */
   5014     {
   5015         UChar    text1[80];
   5016         UChar    text2[80];
   5017         UText    replText = UTEXT_INITIALIZER;
   5018         UText   *result;
   5019 
   5020         status = U_ZERO_ERROR;
   5021         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5022         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5023         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5024 
   5025         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5026         REGEX_CHECK_STATUS;
   5027 
   5028         /*  Normal case, with match */
   5029         uregex_setText(re, text1, -1, &status);
   5030         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5031         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5032         REGEX_CHECK_STATUS;
   5033         REGEX_ASSERT(result == &bufferText);
   5034         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5035 
   5036         /* No match.  Text should copy to output with no changes.  */
   5037         uregex_setText(re, text2, -1, &status);
   5038         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5039         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5040         REGEX_CHECK_STATUS;
   5041         REGEX_ASSERT(result == &bufferText);
   5042         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5043 
   5044         /* Unicode escapes */
   5045         uregex_setText(re, text1, -1, &status);
   5046         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
   5047         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5048         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5049         REGEX_CHECK_STATUS;
   5050         REGEX_ASSERT(result == &bufferText);
   5051         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5052 
   5053         uregex_close(re);
   5054         utext_close(&replText);
   5055     }
   5056 
   5057 
   5058     /*
   5059      *  replaceAll()
   5060      */
   5061     {
   5062         UChar    text1[80];
   5063         UChar    text2[80];
   5064         UText    replText = UTEXT_INITIALIZER;
   5065         UText   *result;
   5066 
   5067         status = U_ZERO_ERROR;
   5068         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5069         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5070         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5071 
   5072         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5073         REGEX_CHECK_STATUS;
   5074 
   5075         /*  Normal case, with match */
   5076         uregex_setText(re, text1, -1, &status);
   5077         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5078         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5079         REGEX_CHECK_STATUS;
   5080         REGEX_ASSERT(result == &bufferText);
   5081         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5082 
   5083         /* No match.  Text should copy to output with no changes.  */
   5084         uregex_setText(re, text2, -1, &status);
   5085         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5086         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5087         REGEX_CHECK_STATUS;
   5088         REGEX_ASSERT(result == &bufferText);
   5089         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5090 
   5091         uregex_close(re);
   5092         utext_close(&replText);
   5093     }
   5094 
   5095 
   5096     /*
   5097      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5098      *   so we don't need to test it here.
   5099      */
   5100 
   5101     utext_close(&bufferText);
   5102     utext_close(&patternText);
   5103 }
   5104 
   5105 //--------------------------------------------------------------
   5106 //
   5107 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5108 //
   5109 //---------------------------------------------------------------
   5110 void RegexTest::Bug7651() {
   5111     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5112     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5113     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5114     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5115     UnicodeString s("#ff @abcd This is test");
   5116     RegexPattern  *REPattern = NULL;
   5117     RegexMatcher  *REMatcher = NULL;
   5118     UErrorCode status = U_ZERO_ERROR;
   5119     UParseError pe;
   5120 
   5121     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5122     REGEX_CHECK_STATUS;
   5123     REMatcher = REPattern->matcher(s, status);
   5124     REGEX_CHECK_STATUS;
   5125     REGEX_ASSERT(REMatcher->find());
   5126     REGEX_ASSERT(REMatcher->start(status) == 0);
   5127     delete REPattern;
   5128     delete REMatcher;
   5129     status = U_ZERO_ERROR;
   5130 
   5131     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5132     REGEX_CHECK_STATUS;
   5133     REMatcher = REPattern->matcher(s, status);
   5134     REGEX_CHECK_STATUS;
   5135     REGEX_ASSERT(REMatcher->find());
   5136     REGEX_ASSERT(REMatcher->start(status) == 0);
   5137     delete REPattern;
   5138     delete REMatcher;
   5139     status = U_ZERO_ERROR;
   5140  }
   5141 
   5142 void RegexTest::Bug7740() {
   5143     UErrorCode status = U_ZERO_ERROR;
   5144     UnicodeString pattern = "(a)";
   5145     UnicodeString text = "abcdef";
   5146     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5147     REGEX_CHECK_STATUS;
   5148     REGEX_ASSERT(m->lookingAt(status));
   5149     REGEX_CHECK_STATUS;
   5150     status = U_ILLEGAL_ARGUMENT_ERROR;
   5151     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5152     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5153     REGEX_ASSERT(s == "");
   5154     delete m;
   5155 }
   5156 
   5157 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5158 
   5159 void RegexTest::Bug8479() {
   5160     UErrorCode status = U_ZERO_ERROR;
   5161 
   5162     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5163     REGEX_CHECK_STATUS;
   5164     if (U_SUCCESS(status))
   5165     {
   5166         UnicodeString str;
   5167         str.setToBogus();
   5168         pMatcher->reset(str);
   5169         status = U_ZERO_ERROR;
   5170         pMatcher->matches(status);
   5171         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5172         delete pMatcher;
   5173     }
   5174 }
   5175 
   5176 
   5177 // Bug 7029
   5178 void RegexTest::Bug7029() {
   5179     UErrorCode status = U_ZERO_ERROR;
   5180 
   5181     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5182     UnicodeString text = "abc.def";
   5183     UnicodeString splits[10];
   5184     REGEX_CHECK_STATUS;
   5185     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5186     REGEX_CHECK_STATUS;
   5187     REGEX_ASSERT(numFields == 8);
   5188     delete pMatcher;
   5189 }
   5190 
   5191 // Bug 9283
   5192 //   This test is checking for the existance of any supplemental characters that case-fold
   5193 //   to a bmp character.
   5194 //
   5195 //   At the time of this writing there are none. If any should appear in a subsequent release
   5196 //   of Unicode, the code in regular expressions compilation that determines the longest
   5197 //   posssible match for a literal string  will need to be enhanced.
   5198 //
   5199 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5200 //   for details on what to do in case of a failure of this test.
   5201 //
   5202 void RegexTest::Bug9283() {
   5203     UErrorCode status = U_ZERO_ERROR;
   5204     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5205     REGEX_CHECK_STATUS;
   5206     int32_t index;
   5207     UChar32 c;
   5208     for (index=0; ; index++) {
   5209         c = supplementalsWithCaseFolding.charAt(index);
   5210         if (c == -1) {
   5211             break;
   5212         }
   5213         UnicodeString cf = UnicodeString(c).foldCase();
   5214         REGEX_ASSERT(cf.length() >= 2);
   5215     }
   5216 }
   5217 
   5218 
   5219 void RegexTest::CheckInvBufSize() {
   5220   if(inv_next>=INV_BUFSIZ) {
   5221     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5222           __FILE__, INV_BUFSIZ, inv_next);
   5223   } else {
   5224     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5225   }
   5226 }
   5227 
   5228 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5229 
   5230