Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2013, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 /*
     14      NOTE!!
     15 
     16      PLEASE be careful about ASCII assumptions in this test.
     17      This test is one of the worst repeat offenders.
     18      If you have questions, contact someone on the ICU PMC
     19      who has access to an EBCDIC system.
     20 
     21  */
     22 
     23 #include "intltest.h"
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     25 
     26 #include "unicode/regex.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/ucnv.h"
     29 #include "unicode/uniset.h"
     30 #include "unicode/ustring.h"
     31 #include "regextst.h"
     32 #include "uvector.h"
     33 #include "util.h"
     34 #include <stdlib.h>
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include "cstring.h"
     38 #include "uinvchar.h"
     39 
     40 #define SUPPORT_MUTATING_INPUT_STRING   0
     41 
     42 //---------------------------------------------------------------------------
     43 //
     44 //  Test class boilerplate
     45 //
     46 //---------------------------------------------------------------------------
     47 RegexTest::RegexTest()
     48 {
     49 }
     50 
     51 
     52 RegexTest::~RegexTest()
     53 {
     54 }
     55 
     56 
     57 
     58 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     59 {
     60     if (exec) logln("TestSuite RegexTest: ");
     61     switch (index) {
     62 
     63         case 0: name = "Basic";
     64             if (exec) Basic();
     65             break;
     66         case 1: name = "API_Match";
     67             if (exec) API_Match();
     68             break;
     69         case 2: name = "API_Replace";
     70             if (exec) API_Replace();
     71             break;
     72         case 3: name = "API_Pattern";
     73             if (exec) API_Pattern();
     74             break;
     75         case 4:
     76 #if !UCONFIG_NO_FILE_IO
     77             name = "Extended";
     78             if (exec) Extended();
     79 #else
     80             name = "skip";
     81 #endif
     82             break;
     83         case 5: name = "Errors";
     84             if (exec) Errors();
     85             break;
     86         case 6: name = "PerlTests";
     87             if (exec) PerlTests();
     88             break;
     89         case 7: name = "Callbacks";
     90             if (exec) Callbacks();
     91             break;
     92         case 8: name = "FindProgressCallbacks";
     93             if (exec) FindProgressCallbacks();
     94             break;
     95         case 9: name = "Bug 6149";
     96              if (exec) Bug6149();
     97              break;
     98         case 10: name = "UTextBasic";
     99           if (exec) UTextBasic();
    100           break;
    101         case 11: name = "API_Match_UTF8";
    102           if (exec) API_Match_UTF8();
    103           break;
    104         case 12: name = "API_Replace_UTF8";
    105           if (exec) API_Replace_UTF8();
    106           break;
    107         case 13: name = "API_Pattern_UTF8";
    108           if (exec) API_Pattern_UTF8();
    109           break;
    110         case 14: name = "PerlTestsUTF8";
    111           if (exec) PerlTestsUTF8();
    112           break;
    113         case 15: name = "PreAllocatedUTextCAPI";
    114           if (exec) PreAllocatedUTextCAPI();
    115           break;
    116         case 16: name = "Bug 7651";
    117              if (exec) Bug7651();
    118              break;
    119         case 17: name = "Bug 7740";
    120             if (exec) Bug7740();
    121             break;
    122         case 18: name = "Bug 8479";
    123             if (exec) Bug8479();
    124             break;
    125         case 19: name = "Bug 7029";
    126             if (exec) Bug7029();
    127             break;
    128         case 20: name = "CheckInvBufSize";
    129             if (exec) CheckInvBufSize();
    130             break;
    131         case 21: name = "Bug 9283";
    132             if (exec) Bug9283();
    133             break;
    134 
    135         default: name = "";
    136             break; //needed to end loop
    137     }
    138 }
    139 
    140 
    141 
    142 /**
    143  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    144  * into ASCII.
         *
         * @param ut     UText to open; handed through to utext_openUTF8
         * @param inv    invariant-character text in the compilation codepage
         * @param length number of chars of inv to use, or -1 to use strlen(inv)
         * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR when the
         *               conversion buffer (non-ASCII platforms only) is too small
         * @return the result of utext_openUTF8, or NULL when the conversion buffer is too small
    145  * @see utext_openUTF8
    146  */
    147 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    148 
    149 //---------------------------------------------------------------------------
    150 //
    151 //   Error Checking / Reporting macros used in all of the tests.
    152 //
    153 //---------------------------------------------------------------------------
    154 
    155 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    156   int64_t oldIndex = utext_getNativeIndex(text);
    157   utext_setNativeIndex(text, 0);
    158   char *bufPtr = buf;
    159   UChar32 c = utext_next32From(text, 0);
    160   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    161     if (0x000020<=c && c<0x00007e) {
    162       *bufPtr = c;
    163     } else {
    164 #if 0
    165       sprintf(bufPtr,"U+%04X", c);
    166       bufPtr+= strlen(bufPtr)-1;
    167 #else
    168       *bufPtr = '%';
    169 #endif
    170     }
    171     bufPtr++;
    172     c = UTEXT_NEXT32(text);
    173   }
    174   *bufPtr = 0;
    175 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    176   char *ebuf = (char*)malloc(bufLen);
    177   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    178   uprv_strncpy(buf, ebuf, bufLen);
    179   free((void*)ebuf);
    180 #endif
    181   utext_setNativeIndex(text, oldIndex);
    182 }
    183 
    184 
    185 static char ASSERT_BUF[1024];
    186 
    187 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    188   if(message.length()==0) {
    189     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    190   } else {
    191     UnicodeString buf;
    192     IntlTest::prettify(message,buf);
    193     if(buf.length()==0) {
    194       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    195     } else {
    196       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    197       if(ASSERT_BUF[0]==0) {
    198         ASSERT_BUF[0]=0;
    199         for(int32_t i=0;i<buf.length();i++) {
    200           UChar ch = buf[i];
    201           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    202         }
    203       }
    204     }
    205   }
    206   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    207   return ASSERT_BUF;
    208 }
    209 
    210 
        // All of these macros expand to a braced block.  Those that mention `status`
        // use whatever variable of that name is visible where the macro is invoked.

        // Logs (logln) a printable rendering of the UText `text`, produced by
        // utextToPrintable into a 200-char local buffer, with file and line.
    211 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
    212 
        // If `status` is a failure: reports it with dataerrln and returns from the
        // enclosing function.
    213 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
    214                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
    215 
        // Reports with errln when `expr` compares equal to FALSE.  Does not return.
    216 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
    217 
        // Declares a fresh local `status`, evaluates `expr` (which may refer to that
        // `status`), and reports with dataerrln unless status ends up equal to `errcode`.
    218 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    219 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    220     __LINE__, u_errorName(errcode), u_errorName(status));};}
    221 
        // Variant of the status check that also prints a caller-supplied `line`.
        // Reports with errln; does not return.
    222 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    223     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
    224 
        // Variant of the assertion that also prints a caller-supplied `line`.
        // Reports with errln and returns from the enclosing function.
    225 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    226     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
    227 
        // Reports with errln unless `ustr==inv` holds; the message shows `ustr`
        // rendered through extractToAssertBuf.  Does not return.
    228 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
    229 
    230 
    231 static UBool testUTextEqual(UText *uta, UText *utb) {
    232     UChar32 ca = 0;
    233     UChar32 cb = 0;
    234     utext_setNativeIndex(uta, 0);
    235     utext_setNativeIndex(utb, 0);
    236     do {
    237         ca = utext_next32(uta);
    238         cb = utext_next32(utb);
    239         if (ca != cb) {
    240             break;
    241         }
    242     } while (ca != U_SENTINEL);
    243     return ca == cb;
    244 }
    245 
    246 
    247 /**
    248  * @param expected expected text in UTF-8 (not platform) codepage
    249  */
    250 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    251     UErrorCode status = U_ZERO_ERROR;
    252     UText expectedText = UTEXT_INITIALIZER;
    253     utext_openUTF8(&expectedText, expected, -1, &status);
    254     if(U_FAILURE(status)) {
    255       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    256       return;
    257     }
    258     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    259       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    260       return;
    261     }
    262     utext_setNativeIndex(actual, 0);
    263     if (!testUTextEqual(&expectedText, actual)) {
    264         char buf[201 /*21*/];
    265         char expectedBuf[201];
    266         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    267         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    268         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    269     }
    270     utext_close(&expectedText);
    271 }
    272 /**
    273  * @param expected invariant (platform local text) input
    274  */
    275 
    276 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    277     UErrorCode status = U_ZERO_ERROR;
    278     UText expectedText = UTEXT_INITIALIZER;
    279     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    280     if(U_FAILURE(status)) {
    281       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    282       return;
    283     }
    284     utext_setNativeIndex(actual, 0);
    285     if (!testUTextEqual(&expectedText, actual)) {
    286         char buf[201 /*21*/];
    287         char expectedBuf[201];
    288         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    289         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    290         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    291     }
    292     utext_close(&expectedText);
    293 }
    294 
    295 /**
    296  * Checks a UText against expected text via assertUText, passing the
         * invoking file and line for the failure message.
         * Assumes utf-8 input
    297  */
    298 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
    299 /**
    300  * Checks a UText against expected text via assertUTextInvariant, passing
         * the invoking file and line for the failure message.
         * Assumes Invariant input
    301  */
    302 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    303 
    304 /**
    305  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    306  * passed into utext_openUTF8. An error will be given if
    307  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    308  */
    309 
    310 #define INV_BUFSIZ 2048 /* increase this if too small */
    311 
    312 static int64_t inv_next=0;
    313 
    314 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    315 static char inv_buf[INV_BUFSIZ];
    316 #endif
    317 
    318 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    319   if(length==-1) length=strlen(inv);
    320 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    321   inv_next+=length;
    322   return utext_openUTF8(ut, inv, length, status);
    323 #else
    324   if(inv_next+length+1>INV_BUFSIZ) {
    325     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    326             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    327     *status = U_MEMORY_ALLOCATION_ERROR;
    328     return NULL;
    329   }
    330 
    331   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    332   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    333   inv_next+=length;
    334 
    335 #if 0
    336   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    337 #endif
    338 
    339   return utext_openUTF8(ut, (const char*)buf, length, status);
    340 #endif
    341 }
    342 
    343 
    344 //---------------------------------------------------------------------------
    345 //
    346 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    347 //                       for the LookingAt() and  Match() functions.
    348 //
    349 //       usage:
    350 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    351 //
    352 //          The expected results are UBool - TRUE or FALSE.
    353 //          The input text is unescaped.  The pattern is not.
    354 //
    355 //
    356 //---------------------------------------------------------------------------
    357 
    358 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    359 
    360 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    361     const UnicodeString pattern(pat, -1, US_INV);
    362     const UnicodeString inputText(text, -1, US_INV);
    363     UErrorCode          status  = U_ZERO_ERROR;
    364     UParseError         pe;
    365     RegexPattern        *REPattern = NULL;
    366     RegexMatcher        *REMatcher = NULL;
    367     UBool               retVal     = TRUE;
    368 
    369     UnicodeString patString(pat, -1, US_INV);
    370     REPattern = RegexPattern::compile(patString, 0, pe, status);
    371     if (U_FAILURE(status)) {
    372         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    373             line, u_errorName(status));
    374         return FALSE;
    375     }
    376     if (line==376) { RegexPatternDump(REPattern);}
    377 
    378     UnicodeString inputString(inputText);
    379     UnicodeString unEscapedInput = inputString.unescape();
    380     REMatcher = REPattern->matcher(unEscapedInput, status);
    381     if (U_FAILURE(status)) {
    382         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    383             line, u_errorName(status));
    384         return FALSE;
    385     }
    386 
    387     UBool actualmatch;
    388     actualmatch = REMatcher->lookingAt(status);
    389     if (U_FAILURE(status)) {
    390         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    391             line, u_errorName(status));
    392         retVal =  FALSE;
    393     }
    394     if (actualmatch != looking) {
    395         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    396         retVal = FALSE;
    397     }
    398 
    399     status = U_ZERO_ERROR;
    400     actualmatch = REMatcher->matches(status);
    401     if (U_FAILURE(status)) {
    402         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    403             line, u_errorName(status));
    404         retVal = FALSE;
    405     }
    406     if (actualmatch != match) {
    407         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    408         retVal = FALSE;
    409     }
    410 
    411     if (retVal == FALSE) {
    412         RegexPatternDump(REPattern);
    413     }
    414 
    415     delete REPattern;
    416     delete REMatcher;
    417     return retVal;
    418 }
    419 
    420 
    421 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    422     UText               pattern    = UTEXT_INITIALIZER;
    423     int32_t             inputUTF8Length;
    424     char                *textChars = NULL;
    425     UText               inputText  = UTEXT_INITIALIZER;
    426     UErrorCode          status     = U_ZERO_ERROR;
    427     UParseError         pe;
    428     RegexPattern        *REPattern = NULL;
    429     RegexMatcher        *REMatcher = NULL;
    430     UBool               retVal     = TRUE;
    431 
    432     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    433     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    434     if (U_FAILURE(status)) {
    435         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    436             line, u_errorName(status));
    437         return FALSE;
    438     }
    439 
    440     UnicodeString inputString(text, -1, US_INV);
    441     UnicodeString unEscapedInput = inputString.unescape();
    442     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    443     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    444 
    445     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    446     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    447         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    448         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    449         return TRUE; // not a failure of the Regex engine
    450     }
    451     status = U_ZERO_ERROR; // buffer overflow
    452     textChars = new char[inputUTF8Length+1];
    453     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    454     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    455 
    456     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    457     if (U_FAILURE(status)) {
    458         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    459             line, u_errorName(status));
    460         return FALSE;
    461     }
    462 
    463     UBool actualmatch;
    464     actualmatch = REMatcher->lookingAt(status);
    465     if (U_FAILURE(status)) {
    466         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    467             line, u_errorName(status));
    468         retVal =  FALSE;
    469     }
    470     if (actualmatch != looking) {
    471         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    472         retVal = FALSE;
    473     }
    474 
    475     status = U_ZERO_ERROR;
    476     actualmatch = REMatcher->matches(status);
    477     if (U_FAILURE(status)) {
    478         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    479             line, u_errorName(status));
    480         retVal = FALSE;
    481     }
    482     if (actualmatch != match) {
    483         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    484         retVal = FALSE;
    485     }
    486 
    487     if (retVal == FALSE) {
    488         RegexPatternDump(REPattern);
    489     }
    490 
    491     delete REPattern;
    492     delete REMatcher;
    493     utext_close(&inputText);
    494     utext_close(&pattern);
    495     delete[] textChars;
    496     return retVal;
    497 }
    498 
    499 
    500 
    501 //---------------------------------------------------------------------------
    502 //
    503 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    504 //                       regex tests for incorrect patterns
    505 //
    506 //       usage:
    507 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    508 //
    509 //---------------------------------------------------------------------------
    510 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    511 
    512 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    513                           UErrorCode expectedStatus, int32_t line) {
    514     UnicodeString       pattern(pat);
    515 
    516     UErrorCode          status         = U_ZERO_ERROR;
    517     UParseError         pe;
    518     RegexPattern        *callerPattern = NULL;
    519 
    520     //
    521     //  Compile the caller's pattern
    522     //
    523     UnicodeString patString(pat);
    524     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    525     if (status != expectedStatus) {
    526         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    527     } else {
    528         if (status != U_ZERO_ERROR) {
    529             if (pe.line != errLine || pe.offset != errCol) {
    530                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    531                     line, errLine, errCol, pe.line, pe.offset);
    532             }
    533         }
    534     }
    535 
    536     delete callerPattern;
    537 
    538     //
    539     //  Compile again, using a UTF-8-based UText
    540     //
    541     UText patternText = UTEXT_INITIALIZER;
    542     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    543     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    544     if (status != expectedStatus) {
    545         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    546     } else {
    547         if (status != U_ZERO_ERROR) {
    548             if (pe.line != errLine || pe.offset != errCol) {
    549                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    550                     line, errLine, errCol, pe.line, pe.offset);
    551             }
    552         }
    553     }
    554 
    555     delete callerPattern;
    556     utext_close(&patternText);
    557 }
    558 
    559 
    560 
    561 //---------------------------------------------------------------------------
    562 //
    563 //      Basic      Check for basic functionality of regex pattern matching.
    564 //                 Avoid the use of REGEX_FIND test macro, which has
    565 //                 substantial dependencies on basic Regex functionality.
        //
        //                 Every case is a REGEX_TESTLM(pattern, input, lookingAt
        //                 expected, matches expected) line.  Failure messages carry
        //                 the source line of the case.
    566 //
    567 //---------------------------------------------------------------------------
    568 void RegexTest::Basic() {
    569 
    570 
    571 //
    572 // Debug - slide failing test cases early
        //         (disabled; when enabled, the block ends the process with exit(1))
    573 //
    574 #if 0
    575     {
    576         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    577         UParseError pe;
    578         UErrorCode  status = U_ZERO_ERROR;
    579         RegexPattern *pattern;
    580         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
    581         RegexPatternDump(pattern);
    582         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
    583         UBool result = m->find();
    584         printf("result = %d\n", result);
    585         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    586         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    587     }
    588     exit(1);
    589 #endif
    590 
    591 
    592     //
    593     // Pattern with parentheses
    594     //
    595     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    596     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    597     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    598 
    599     //
    600     // Patterns with *
    601     //
    602     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    603     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    604     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    605     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    606     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    607 
    608     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    609     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    610 
    611 
    612     //
    613     //  Patterns with "."
    614     //
    615     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    616     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    617     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    618     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    619     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    620     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    621     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    622     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    623 
    624     //
    625     //  Patterns with * applied to chars at end of literal string
    626     //
    627     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    628     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    629 
    630     //
    631     //  Supplemental chars match as single chars, not a pair of surrogates.
    632     //
    633     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    634     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    635     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    636 
    637 
    638     //
    639     //  UnicodeSets in the pattern
    640     //
    641     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    642     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    643     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    644     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    645     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);   // same case as the line above
    646     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    647 
    648     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    649     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    650     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    651     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    652     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    653 
    654     //
    655     //   OR operator in patterns
    656     //
    657     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    658     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    659     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    660     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    661 
    662     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    663     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    664     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    665     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    666     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    667     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    668 
    669     //
    670     //  +
    671     //
    672     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    673     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    674     REGEX_TESTLM("b+", "", FALSE, FALSE);
    675     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    676     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    677     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    678 
    679     //
    680     //   ?
    681     //
    682     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    683     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    684     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    685     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    686     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    687     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    688     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    689     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    690     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    691 
    692     //
    693     //  Escape sequences that become single literal chars, handled internally
    694     //   by ICU's Unescape.
    695     //
    696 
    697     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    698     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    699     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    700     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    701     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    702     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    703     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    704     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    705     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    706     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    707 
    708     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    709     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    710 
    711     // Escape of special chars in patterns
    712     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    713 }
    714 
    715 
    716 //---------------------------------------------------------------------------
    717 //
    718 //    UTextBasic   Check for quirks that are specific to the UText
    719 //                 implementation.
    720 //
    721 //---------------------------------------------------------------------------
    722 void RegexTest::UTextBasic() {
    723     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    724     UErrorCode status = U_ZERO_ERROR;
    725     UText pattern = UTEXT_INITIALIZER;
    726     utext_openUTF8(&pattern, str_abc, -1, &status);
    727     RegexMatcher matcher(&pattern, 0, status);
    728     REGEX_CHECK_STATUS;
    729 
    730     UText input = UTEXT_INITIALIZER;
    731     utext_openUTF8(&input, str_abc, -1, &status);
    732     REGEX_CHECK_STATUS;
    733     matcher.reset(&input);
    734     REGEX_CHECK_STATUS;
    735     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    736 
    737     matcher.reset(matcher.inputText());
    738     REGEX_CHECK_STATUS;
    739     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    740 
    741     utext_close(&pattern);
    742     utext_close(&input);
    743 }
    744 
    745 
    746 //---------------------------------------------------------------------------
    747 //
    748 //      API_Match   Test that the API for class RegexMatcher
    749 //                  is present and nominally working, but excluding functions
    750 //                  implementing replace operations.
        //
        //                  Sections, in order: matcher creation and reset; bounds
        //                  handling of reset(pos), matches(pos) and lookingAt(pos);
        //                  capture group start()/end()/group(); find(), including \G
        //                  and zero-length matches; matchers with no input; regions
        //                  and bounds flags; hitEnd()/requireEnd(); time limits;
        //                  stack limits.
    751 //
    752 //---------------------------------------------------------------------------
    753 void RegexTest::API_Match() {
            // These three are used only by the first test block below; every later
            // block declares its own status (and some their own pe and flags).
    754     UParseError         pe;
    755     UErrorCode          status=U_ZERO_ERROR;
    756     int32_t             flags = 0;
    757 
    758     //
    759     // Debug - paste a failing test case into the empty block below and enable
    760     //         the #if to run it first; the return then skips everything else.
    761 #if 0
    762     {
    763     }
    764     return;
    765 #endif
    766 
    767     //
    768     // Simple pattern compilation
    769     //
    770     {
    771         UnicodeString       re("abc");
    772         RegexPattern        *pat2;
    773         pat2 = RegexPattern::compile(re, flags, pe, status);
    774         REGEX_CHECK_STATUS;
    775 
    776         UnicodeString inStr1 = "abcdef this is a test";   // "abc" at index 0
    777         UnicodeString instr2 = "not abc";                 // "abc" at index 4, running to the end
    778         UnicodeString empty  = "";
    779 
    780 
    781         //
    782         // Matcher creation and reset.
    783         //   After each reset(newInput), input() must return the new string and
    784         //   lookingAt() must reflect whether that string starts with "abc".
    784         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    785         REGEX_CHECK_STATUS;
    786         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    787         REGEX_ASSERT(m1->input() == inStr1);
    788         m1->reset(instr2);
    789         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    790         REGEX_ASSERT(m1->input() == instr2);
    791         m1->reset(inStr1);
    792         REGEX_ASSERT(m1->input() == inStr1);
    793         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    794         m1->reset(empty);
    795         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    796         REGEX_ASSERT(m1->input() == empty);
    797         REGEX_ASSERT(&m1->pattern() == pat2);     // same pattern object, compared by address
    798 
    799         //
    800         //  reset(pos, status)
    801         //    Positions 0 through len are accepted; -1 and len+1 must set
        //    U_INDEX_OUTOFBOUNDS_ERROR.
    802         m1->reset(inStr1);
    803         m1->reset(4, status);
    804         REGEX_CHECK_STATUS;
    805         REGEX_ASSERT(m1->input() == inStr1);
                // Index 4 of inStr1 is 'e', so this passes only if lookingAt() does not
                // start from the reset position (presumably it starts at the region start).
    806         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    807 
    808         m1->reset(-1, status);
    809         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    810         status = U_ZERO_ERROR;
    811 
    812         m1->reset(0, status);
    813         REGEX_CHECK_STATUS;
    814         status = U_ZERO_ERROR;
    815 
    816         int32_t len = m1->input().length();
    817         m1->reset(len-1, status);
    818         REGEX_CHECK_STATUS;
    819         status = U_ZERO_ERROR;
    820 
    821         m1->reset(len, status);
    822         REGEX_CHECK_STATUS;
    823         status = U_ZERO_ERROR;
    824 
    825         m1->reset(len+1, status);
    826         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    827         status = U_ZERO_ERROR;
    828 
    829         //
    830         // matches(pos, status)
    831         //   Input is "not abc": only a start position of 4 lets "abc" match through to the end.
    832         m1->reset(instr2);
    833         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    834         m1->reset();
    835         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    836         m1->reset();
    837         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    838         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    839         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    840         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    841 
    842         // matches() at end of string should fail, but should not
    843         //  be an error.
    844         status = U_ZERO_ERROR;
    845         len = m1->input().length();
    846         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    847         REGEX_CHECK_STATUS;
    848 
    849         // Match beyond end of string should fail with an error.
    850         status = U_ZERO_ERROR;
    851         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    852         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    853 
    854         // Successful match at end of string.
    855         {
    856             status = U_ZERO_ERROR;
    857             RegexMatcher m("A?", 0, status);  // will match zero length string.
    858             REGEX_CHECK_STATUS;
    859             m.reset(inStr1);
    860             len = inStr1.length();
    861             REGEX_ASSERT(m.matches(len, status) == TRUE);     // empty match at the very end of a non-empty input
    862             REGEX_CHECK_STATUS;
    863             m.reset(empty);
    864             REGEX_ASSERT(m.matches(0, status) == TRUE);       // empty match on an empty input
    865             REGEX_CHECK_STATUS;
    866         }
    867 
    868 
    869         //
    870         // lookingAt(pos, status)
    871         //   Same position checks as for matches(pos, status) above.
    872         status = U_ZERO_ERROR;
    873         m1->reset(instr2);  // "not abc"
    874         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    875         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    876         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    877         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    878         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    879         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    880         status = U_ZERO_ERROR;
    881         len = m1->input().length();
    882         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);    // at end of input: no match, but no error
    883         REGEX_CHECK_STATUS;
    884         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);  // past end of input: error
    885         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    886 
    887         delete m1;
    888         delete pat2;
    889     }
    890 
    891 
    892     //
    893     // Capture Group.
    894     //     RegexMatcher::start();
    895     //     RegexMatcher::end();
    896     //     RegexMatcher::groupCount();
    897     //
    898     {
    899         int32_t             flags=0;
    900         UParseError         pe;
    901         UErrorCode          status=U_ZERO_ERROR;
    902 
    903         UnicodeString       re("01(23(45)67)(.*)");
    904         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    905         REGEX_CHECK_STATUS;
    906         UnicodeString data = "0123456789";
    907 
    908         RegexMatcher *matcher = pat->matcher(data, status);
    909         REGEX_CHECK_STATUS;
    910         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected start/end for group 0 (whole match) and capture groups 1..3.
    911         static const int32_t matchStarts[] = {0,  2, 4, 8};
    912         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    913         int32_t i;
    914         for (i=0; i<4; i++) {
    915             int32_t actualStart = matcher->start(i, status);
    916             REGEX_CHECK_STATUS;
    917             if (actualStart != matchStarts[i]) {
    918                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    919                     __LINE__, i, matchStarts[i], actualStart);
    920             }
    921             int32_t actualEnd = matcher->end(i, status);
    922             REGEX_CHECK_STATUS;
    923             if (actualEnd != matchEnds[i]) {
    924                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    925                     __LINE__, i, matchEnds[i], actualEnd);
    926             }
    927         }
    928 
                // The no-argument start()/end() must agree with group 0.
    929         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    930         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    931 
                // Group numbers outside 0..3 are an index error; asking for a group
                // after reset(), with no match in effect, is an invalid-state error.
    932         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    933         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    934         matcher->reset();
    935         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    936 
                // Same checks for group(): captured text, then the two error cases.
    937         matcher->lookingAt(status);
    938         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    939         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    940         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    941         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    942         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    943         REGEX_CHECK_STATUS;
    944         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    945         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    946         matcher->reset();
    947         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    948 
    949         delete matcher;
    950         delete pat;
    951 
    952     }
    953 
    954     //
    955     //  find
    956     //    "abc" occurs at indexes 1, 6 and 12 of the 17-character test string.
    957     {
    958         int32_t             flags=0;
    959         UParseError         pe;
    960         UErrorCode          status=U_ZERO_ERROR;
    961 
    962         UnicodeString       re("abc");
    963         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    964         REGEX_CHECK_STATUS;
    965         UnicodeString data = ".abc..abc...abc..";
    966         //                    012345678901234567
    967 
    968         RegexMatcher *matcher = pat->matcher(data, status);
    969         REGEX_CHECK_STATUS;
                // Successive find() calls walk through the three occurrences, then keep failing.
    970         REGEX_ASSERT(matcher->find());
    971         REGEX_ASSERT(matcher->start(status) == 1);
    972         REGEX_ASSERT(matcher->find());
    973         REGEX_ASSERT(matcher->start(status) == 6);
    974         REGEX_ASSERT(matcher->find());
    975         REGEX_ASSERT(matcher->start(status) == 12);
    976         REGEX_ASSERT(matcher->find() == FALSE);
    977         REGEX_ASSERT(matcher->find() == FALSE);
    978 
    979         matcher->reset();
    980         REGEX_ASSERT(matcher->find());
    981         REGEX_ASSERT(matcher->start(status) == 1);
    982 
                // find(pos, status) searches from pos onward, independent of earlier finds.
    983         REGEX_ASSERT(matcher->find(0, status));
    984         REGEX_ASSERT(matcher->start(status) == 1);
    985         REGEX_ASSERT(matcher->find(1, status));
    986         REGEX_ASSERT(matcher->start(status) == 1);
    987         REGEX_ASSERT(matcher->find(2, status));
    988         REGEX_ASSERT(matcher->start(status) == 6);
    989         REGEX_ASSERT(matcher->find(12, status));
    990         REGEX_ASSERT(matcher->start(status) == 12);
    991         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    992         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    993         REGEX_ASSERT(matcher->find(17, status) == FALSE);      // 17 == length: no match, but not an error
    994         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);   // no match in effect after a failed find
    995 
    996         status = U_ZERO_ERROR;
    997         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    998         status = U_ZERO_ERROR;
    999         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);   // 18 == length + 1
   1000 
   1001         REGEX_ASSERT(matcher->groupCount() == 0);             // "abc" has no capture groups
   1002 
   1003         delete matcher;
   1004         delete pat;
   1005     }
   1006 
   1007 
   1008     //
   1009     //  find, with \G in pattern (true if at the end of a previous match).
   1010     //    Group 1 is the \G-anchored alternative, group 2 the plain one;
        //    a start of -1 means that group did not participate in the match.
   1011     {
   1012         int32_t             flags=0;
   1013         UParseError         pe;
   1014         UErrorCode          status=U_ZERO_ERROR;
   1015 
   1016         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1017         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1018         REGEX_CHECK_STATUS;
   1019         UnicodeString data = ".abcabc.abc..";
   1020         //                    012345678901234567
   1021 
   1022         RegexMatcher *matcher = pat->matcher(data, status);
   1023         REGEX_CHECK_STATUS;
                // First find: expected to match via the plain alternative (group 2 at index 1).
   1024         REGEX_ASSERT(matcher->find());
   1025         REGEX_ASSERT(matcher->start(status) == 0);
   1026         REGEX_ASSERT(matcher->start(1, status) == -1);
   1027         REGEX_ASSERT(matcher->start(2, status) == 1);
   1028 
                // Second find: "abc" begins at index 4, where the first match ended,
                // so the \G alternative (group 1) is the one expected to match.
   1029         REGEX_ASSERT(matcher->find());
   1030         REGEX_ASSERT(matcher->start(status) == 4);
   1031         REGEX_ASSERT(matcher->start(1, status) == 4);
   1032         REGEX_ASSERT(matcher->start(2, status) == -1);
   1033         REGEX_CHECK_STATUS;
   1034 
   1035         delete matcher;
   1036         delete pat;
   1037     }
   1038 
   1039     //
   1040     //   find with zero length matches, match position should bump ahead
   1041     //     to prevent loops.
   1042     //
   1043     {
   1044         int32_t                 i;
   1045         UErrorCode          status=U_ZERO_ERROR;
   1046         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1047                                                       //   using an always-true look-ahead.
   1048         REGEX_CHECK_STATUS;
   1049         UnicodeString s("    ");
   1050         m.reset(s);
                // Four spaces: empty matches are expected at indexes 0..4, so the loop exits with i == 5.
   1051         for (i=0; ; i++) {
   1052             if (m.find() == FALSE) {
   1053                 break;
   1054             }
   1055             REGEX_ASSERT(m.start(status) == i);
   1056             REGEX_ASSERT(m.end(status) == i);
   1057         }
   1058         REGEX_ASSERT(i==5);
   1059 
   1060         // Check that the bump goes over surrogate pairs OK
                //   Four supplementary characters: the expected match indexes advance by 2
                //   (0, 2, 4, 6, 8), so the loop exits with i == 10.
   1061         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1062         s = s.unescape();
   1063         m.reset(s);
   1064         for (i=0; ; i+=2) {
   1065             if (m.find() == FALSE) {
   1066                 break;
   1067             }
   1068             REGEX_ASSERT(m.start(status) == i);
   1069             REGEX_ASSERT(m.end(status) == i);
   1070         }
   1071         REGEX_ASSERT(i==10);
   1072     }
   1073     {
   1074         // find() loop breaking test.
   1075         //        with pattern of /.?/, should see a series of one char matches, then a single
   1076         //        match of zero length at the end of the input string.
   1077         int32_t                 i;
   1078         UErrorCode          status=U_ZERO_ERROR;
   1079         RegexMatcher        m(".?", 0, status);
   1080         REGEX_CHECK_STATUS;
   1081         UnicodeString s("    ");
   1082         m.reset(s);
   1083         for (i=0; ; i++) {
   1084             if (m.find() == FALSE) {
   1085                 break;
   1086             }
   1087             REGEX_ASSERT(m.start(status) == i);
   1088             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // one-char matches for i in 0..3, empty match at i == 4
   1089         }
   1090         REGEX_ASSERT(i==5);
   1091     }
   1092 
   1093 
   1094     //
   1095     // Matchers with no input string behave as if they had an empty input string.
   1096     //
   1097 
   1098     {
                // Matcher constructed directly from a pattern string, never given input.
   1099         UErrorCode status = U_ZERO_ERROR;
   1100         RegexMatcher  m(".?", 0, status);
   1101         REGEX_CHECK_STATUS;
   1102         REGEX_ASSERT(m.find());
   1103         REGEX_ASSERT(m.start(status) == 0);
   1104         REGEX_ASSERT(m.input() == "");
   1105     }
   1106     {
                // Matcher obtained from a compiled pattern via matcher(status), never given input.
   1107         UErrorCode status = U_ZERO_ERROR;
   1108         RegexPattern  *p = RegexPattern::compile(".", 0, status);
                // NOTE(review): p is dereferenced on the next line before status is
                //   checked; if compile() can return NULL on failure this would crash
                //   instead of reporting an error -- confirm, and consider checking first.
   1109         RegexMatcher  *m = p->matcher(status);
   1110         REGEX_CHECK_STATUS;
   1111 
   1112         REGEX_ASSERT(m->find() == FALSE);
   1113         REGEX_ASSERT(m->input() == "");
   1114         delete m;
   1115         delete p;
   1116     }
   1117 
   1118     //
   1119     // Regions
   1120     //   Checks the default region and bounds flags, that reset() restores the
        //   region to the whole input, and that the bounds flags survive reset().
   1121     {
   1122         UErrorCode status = U_ZERO_ERROR;
   1123         UnicodeString testString("This is test data");
   1124         RegexMatcher m(".*", testString,  0, status);
   1125         REGEX_CHECK_STATUS;
                // Defaults: region is the whole input, non-transparent and anchoring bounds.
   1126         REGEX_ASSERT(m.regionStart() == 0);
   1127         REGEX_ASSERT(m.regionEnd() == testString.length());
   1128         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1129         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1130 
                // With the region narrowed to [2, 4), ".*" must match exactly that span.
   1131         m.region(2,4, status);
   1132         REGEX_CHECK_STATUS;
   1133         REGEX_ASSERT(m.matches(status));
   1134         REGEX_ASSERT(m.start(status)==2);
   1135         REGEX_ASSERT(m.end(status)==4);
   1136         REGEX_CHECK_STATUS;
   1137 
   1138         m.reset();
   1139         REGEX_ASSERT(m.regionStart() == 0);
   1140         REGEX_ASSERT(m.regionEnd() == testString.length());
   1141 
   1142         UnicodeString shorterString("short");
   1143         m.reset(shorterString);
   1144         REGEX_ASSERT(m.regionStart() == 0);
   1145         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1146 
                // The setters and reset() must return the matcher itself (address compare),
                // and reset() must leave the anchoring / transparent settings unchanged.
   1147         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1148         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1149         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1150         REGEX_ASSERT(&m == &m.reset());
   1151         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1152 
   1153         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1154         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1155         REGEX_ASSERT(&m == &m.reset());
   1156         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1157 
   1158         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1159         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1160         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1161         REGEX_ASSERT(&m == &m.reset());
   1162         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1163 
   1164         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1165         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1166         REGEX_ASSERT(&m == &m.reset());
   1167         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1168 
   1169     }
   1170 
   1171     //
   1172     // hitEnd() and requireEnd()
   1173     //   Three patterns against "aabb", each with a different expected combination.
   1174     {
   1175         UErrorCode status = U_ZERO_ERROR;
   1176         UnicodeString testString("aabb");
   1177         RegexMatcher m1(".*", testString,  0, status);       // expected: hitEnd, not requireEnd
   1178         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1179         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1180         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1181         REGEX_CHECK_STATUS;
   1182 
   1183         status = U_ZERO_ERROR;
   1184         RegexMatcher m2("a*", testString, 0, status);        // expected: neither hitEnd nor requireEnd
   1185         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1186         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1187         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1188         REGEX_CHECK_STATUS;
   1189 
   1190         status = U_ZERO_ERROR;
   1191         RegexMatcher m3(".*$", testString, 0, status);       // expected: both hitEnd and requireEnd
   1192         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1193         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1194         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1195         REGEX_CHECK_STATUS;
   1196     }
   1197 
   1198 
   1199     //
   1200     // Compilation error on reset with UChar *
   1201     //   These were a hazard that people were stumbling over with runtime errors.
   1202     //   Changed them to compiler errors by adding private methods that more closely
   1203     //   matched the incorrect use of the functions.
   1204     //   (Kept disabled with #if 0: enabling the block is expected to break the build.)
   1205 #if 0
   1206     {
   1207         UErrorCode status = U_ZERO_ERROR;
   1208         UChar ucharString[20];
   1209         RegexMatcher m(".", 0, status);
   1210         m.reset(ucharString);  // should not compile.
   1211 
   1212         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1213         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1214 
   1215         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1216     }
   1217 #endif
   1218 
   1219     //
   1220     //  Time Outs.
   1221     //       Note:  These tests will need to be changed when the regexp engine is
   1222     //              able to detect and cut short the exponential time behavior on
   1223     //              this type of match.
   1224     //
   1225     {
   1226         UErrorCode status = U_ZERO_ERROR;
   1227         //    Enough 'a's in the string to cause the match to time out.
   1228         //       (Each additional 'a' doubles the time)
   1229         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1230         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1231         REGEX_CHECK_STATUS;
   1232         REGEX_ASSERT(matcher.getTimeLimit() == 0);            // limit of a freshly constructed matcher
   1233         matcher.setTimeLimit(100, status);
   1234         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1235         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1236         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1237     }
   1238     {
   1239         UErrorCode status = U_ZERO_ERROR;
   1240         //   Few enough 'a's to slip in under the time limit.
                //   The match still fails (the input has no 'b'), but without an error status.
   1241         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1242         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1243         REGEX_CHECK_STATUS;
   1244         matcher.setTimeLimit(100, status);
   1245         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1246         REGEX_CHECK_STATUS;
   1247     }
   1248 
   1249     //
   1250     //  Stack Limits
   1251     //
   1252     {
   1253         UErrorCode status = U_ZERO_ERROR;
   1254         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1255 
   1256         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1257         //   of the '+', and makes the stack frames larger.
   1258         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1259 
   1260         // With the default stack, this match should fail to run
   1261         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1262         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1263 
   1264         // With unlimited stack, it should run
   1265         status = U_ZERO_ERROR;
   1266         matcher.setStackLimit(0, status);
   1267         REGEX_CHECK_STATUS;
   1268         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1269         REGEX_CHECK_STATUS;
   1270         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1271 
   1272         // With a limited stack, the match should fail
   1273         status = U_ZERO_ERROR;
   1274         matcher.setStackLimit(10000, status);
   1275         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1276         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1277         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1278     }
   1279 
   1280         // A pattern that doesn't save state should work with
   1281         //   a minimal sized stack
   1282     {
   1283         UErrorCode status = U_ZERO_ERROR;
   1284         UnicodeString testString = "abc";
   1285         RegexMatcher matcher("abc", testString, 0, status);
   1286         REGEX_CHECK_STATUS;
   1287         matcher.setStackLimit(30, status);
   1288         REGEX_CHECK_STATUS;
   1289         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1290         REGEX_CHECK_STATUS;
   1291         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1292 
   1293         // Negative stack sizes should fail
                //   and must leave the previously set limit (1000) in place.
   1294         status = U_ZERO_ERROR;
   1295         matcher.setStackLimit(1000, status);
   1296         REGEX_CHECK_STATUS;
   1297         matcher.setStackLimit(-1, status);
   1298         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1299         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1300     }
   1301 
   1302 
   1303 }
   1304 
   1305 
   1306 
   1307 
   1308 
   1309 
   1310 //---------------------------------------------------------------------------
   1311 //
   1312 //      API_Replace        API test for class RegexMatcher, testing the
   1313 //                         Replace family of functions.
        //
        //                         Covers replaceFirst() and replaceAll() on matching,
        //                         non-matching and empty input, "$n" capture group
        //                         references and escapes in the replacement text, and
        //                         appendReplacement()/appendTail() (Bug 4057).
   1314 //
   1315 //---------------------------------------------------------------------------
   1316 void RegexTest::API_Replace() {
   1317     //
   1318     //  Replace
   1319     //
   1320     int32_t             flags=0;
   1321     UParseError         pe;
   1322     UErrorCode          status=U_ZERO_ERROR;
   1323 
   1324     UnicodeString       re("abc");
   1325     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1326     REGEX_CHECK_STATUS;
   1327     UnicodeString data = ".abc..abc...abc..";
   1328     //                    012345678901234567
   1329     RegexMatcher *matcher = pat->matcher(data, status);
            // NOTE(review): status is not checked between creating this matcher and
            //   first dereferencing it below, unlike matcher2 further down, which is
            //   followed by REGEX_CHECK_STATUS -- consider adding the same check here.
   1330 
   1331     //
   1332     //  Plain vanilla matches.
   1333     //
   1334     UnicodeString  dest;
   1335     dest = matcher->replaceFirst("yz", status);
   1336     REGEX_CHECK_STATUS;
   1337     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1338 
   1339     dest = matcher->replaceAll("yz", status);
   1340     REGEX_CHECK_STATUS;
   1341     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1342 
   1343     //
   1344     //  Plain vanilla non-matches.
   1345     //    The result must equal the unmodified input.
   1346     UnicodeString d2 = ".abx..abx...abx..";
   1347     matcher->reset(d2);
   1348     dest = matcher->replaceFirst("yz", status);
   1349     REGEX_CHECK_STATUS;
   1350     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1351 
   1352     dest = matcher->replaceAll("yz", status);
   1353     REGEX_CHECK_STATUS;
   1354     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1355 
   1356     //
   1357     // Empty source string
   1358     //
   1359     UnicodeString d3 = "";
   1360     matcher->reset(d3);
   1361     dest = matcher->replaceFirst("yz", status);
   1362     REGEX_CHECK_STATUS;
   1363     REGEX_ASSERT(dest == "");
   1364 
   1365     dest = matcher->replaceAll("yz", status);
   1366     REGEX_CHECK_STATUS;
   1367     REGEX_ASSERT(dest == "");
   1368 
   1369     //
   1370     // Empty substitution string
   1371     //   Each matched "abc" is simply removed from the result.
   1372     matcher->reset(data);              // ".abc..abc...abc.."
   1373     dest = matcher->replaceFirst("", status);
   1374     REGEX_CHECK_STATUS;
   1375     REGEX_ASSERT(dest == "...abc...abc..");
   1376 
   1377     dest = matcher->replaceAll("", status);
   1378     REGEX_CHECK_STATUS;
   1379     REGEX_ASSERT(dest == "........");
   1380 
   1381     //
   1382     // match whole string
   1383     //
   1384     UnicodeString d4 = "abc";
   1385     matcher->reset(d4);
   1386     dest = matcher->replaceFirst("xyz", status);
   1387     REGEX_CHECK_STATUS;
   1388     REGEX_ASSERT(dest == "xyz");
   1389 
   1390     dest = matcher->replaceAll("xyz", status);
   1391     REGEX_CHECK_STATUS;
   1392     REGEX_ASSERT(dest == "xyz");
   1393 
   1394     //
   1395     // Capture Group, simple case
   1396     //   Pattern "a(..)" on "abcdefg": the match is "abc" and group 1 is "bc".
   1397     UnicodeString       re2("a(..)");
   1398     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1399     REGEX_CHECK_STATUS;
   1400     UnicodeString d5 = "abcdefg";
   1401     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1402     REGEX_CHECK_STATUS;
   1403     dest = matcher2->replaceFirst("$1$1", status);
   1404     REGEX_CHECK_STATUS;
   1405     REGEX_ASSERT(dest == "bcbcdefg");
   1406 
            // A backslash-escaped '$' must come out as a literal "$1", not a group reference.
   1407     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1408     REGEX_CHECK_STATUS;
   1409     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1410 
            // A '$' that is not followed by a group number must be copied through unchanged.
   1411     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1412     REGEX_CHECK_STATUS;
   1413     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1414 
            // The group number is given as the supplementary character U+1D7CF
            // (unescaped into the string first); it is expected to select group 1.
   1415     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1416     replacement = replacement.unescape();
   1417     dest = matcher2->replaceFirst(replacement, status);
   1418     REGEX_CHECK_STATUS;
   1419     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1420 
            // The pattern has only one capture group, so $5 must be rejected.
   1421     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1422 
   1423 
   1424     //
   1425     // Replacement String with \u hex escapes
   1426     //   The replacement text is passed with the backslash escape still in it
        //   (no unescape() call here); the expected results contain the decoded character.
   1427     {
   1428         UnicodeString  src = "abc 1 abc 2 abc 3";
   1429         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1430         matcher->reset(src);
   1431         UnicodeString  result = matcher->replaceAll(substitute, status);
   1432         REGEX_CHECK_STATUS;
   1433         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1434     }
   1435     {
   1436         UnicodeString  src = "abc !";
   1437         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1438         matcher->reset(src);
   1439         UnicodeString  result = matcher->replaceAll(substitute, status);
   1440         REGEX_CHECK_STATUS;
                // Build the expected string by appending the code point U+10000 directly.
   1441         UnicodeString expected = UnicodeString("--");
   1442         expected.append((UChar32)0x10000);
   1443         expected.append("-- !");
   1444         REGEX_ASSERT(result == expected);
   1445     }
   1446     // TODO:  need more thorough testing of capture substitutions.
   1447 
   1448     // Bug 4057
   1449     //   appendReplacement() must copy the input from where the previous
        //   appendReplacement() left off (here: the start of the input), no matter
        //   how the matcher arrived at the current match.
   1450     {
   1451         status = U_ZERO_ERROR;
   1452         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1453         RegexMatcher m("ss(.*?)ee", 0, status);
   1454         REGEX_CHECK_STATUS;
   1455         UnicodeString result;
   1456 
   1457         // Multiple finds do NOT bump up the previous appendReplacement position.
   1458         m.reset(s);
   1459         m.find();
   1460         m.find();
   1461         m.appendReplacement(result, "ooh", status);
   1462         REGEX_CHECK_STATUS;
   1463         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1464 
   1465         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1466         status = U_ZERO_ERROR;
   1467         result.truncate(0);
   1468         m.reset(10, status);
   1469         m.find();
   1470         m.find();
   1471         m.appendReplacement(result, "ooh", status);
   1472         REGEX_CHECK_STATUS;
   1473         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1474 
   1475         // find() at interior of string, appendReplacement still starts at beginning.
   1476         status = U_ZERO_ERROR;
   1477         result.truncate(0);
   1478         m.reset();
   1479         m.find(10, status);
   1480         m.find();
   1481         m.appendReplacement(result, "ooh", status);
   1482         REGEX_CHECK_STATUS;
   1483         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1484 
                // appendTail() adds the remainder of the input after the last replaced match.
   1485         m.appendTail(result);
   1486         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1487 
   1488     }
   1489 
   1490     delete matcher2;
   1491     delete pat2;
   1492     delete matcher;
   1493     delete pat;
   1494 }
   1495 
   1496 
   1497 //---------------------------------------------------------------------------
   1498 //
   1499 //      API_Pattern       Test that the API for class RegexPattern is
   1500 //                        present and nominally working.
   1501 //
   1502 //---------------------------------------------------------------------------
   1503 void RegexTest::API_Pattern() {
   1504     RegexPattern        pata;    // Test default constructor to not crash.
   1505     RegexPattern        patb;
   1506 
   1507     REGEX_ASSERT(pata == patb);
   1508     REGEX_ASSERT(pata == pata);
   1509 
   1510     UnicodeString re1("abc[a-l][m-z]");
   1511     UnicodeString re2("def");
   1512     UErrorCode    status = U_ZERO_ERROR;
   1513     UParseError   pe;
   1514 
   1515     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1516     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1517     REGEX_CHECK_STATUS;
   1518     REGEX_ASSERT(*pat1 == *pat1);
   1519     REGEX_ASSERT(*pat1 != pata);
   1520 
   1521     // Assign
   1522     patb = *pat1;
   1523     REGEX_ASSERT(patb == *pat1);
   1524 
   1525     // Copy Construct
   1526     RegexPattern patc(*pat1);
   1527     REGEX_ASSERT(patc == *pat1);
   1528     REGEX_ASSERT(patb == patc);
   1529     REGEX_ASSERT(pat1 != pat2);
   1530     patb = *pat2;
   1531     REGEX_ASSERT(patb != patc);
   1532     REGEX_ASSERT(patb == *pat2);
   1533 
   1534     // Compile with no flags.
   1535     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1536     REGEX_ASSERT(*pat1a == *pat1);
   1537 
   1538     REGEX_ASSERT(pat1a->flags() == 0);
   1539 
   1540     // Compile with different flags should be not equal
   1541     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1542     REGEX_CHECK_STATUS;
   1543 
   1544     REGEX_ASSERT(*pat1b != *pat1a);
   1545     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1546     REGEX_ASSERT(pat1a->flags() == 0);
   1547     delete pat1b;
   1548 
   1549     // clone
   1550     RegexPattern *pat1c = pat1->clone();
   1551     REGEX_ASSERT(*pat1c == *pat1);
   1552     REGEX_ASSERT(*pat1c != *pat2);
   1553 
   1554     delete pat1c;
   1555     delete pat1a;
   1556     delete pat1;
   1557     delete pat2;
   1558 
   1559 
   1560     //
   1561     //   Verify that a matcher created from a cloned pattern works.
   1562     //     (Jitterbug 3423)
   1563     //
   1564     {
   1565         UErrorCode     status     = U_ZERO_ERROR;
   1566         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1567         RegexPattern  *pClone     = pSource->clone();
   1568         delete         pSource;
   1569         RegexMatcher  *mFromClone = pClone->matcher(status);
   1570         REGEX_CHECK_STATUS;
   1571         UnicodeString s = "Hello World";
   1572         mFromClone->reset(s);
   1573         REGEX_ASSERT(mFromClone->find() == TRUE);
   1574         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1575         REGEX_ASSERT(mFromClone->find() == TRUE);
   1576         REGEX_ASSERT(mFromClone->group(status) == "World");
   1577         REGEX_ASSERT(mFromClone->find() == FALSE);
   1578         delete mFromClone;
   1579         delete pClone;
   1580     }
   1581 
   1582     //
   1583     //   matches convenience API
   1584     //
   1585     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1586     REGEX_CHECK_STATUS;
   1587     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1588     REGEX_CHECK_STATUS;
   1589     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1590     REGEX_CHECK_STATUS;
   1591     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1592     REGEX_CHECK_STATUS;
   1593     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1594     REGEX_CHECK_STATUS;
   1595     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1596     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1597     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1598 
   1599 
   1600     //
   1601     // Split()
   1602     //
   1603     status = U_ZERO_ERROR;
   1604     pat1 = RegexPattern::compile(" +",  pe, status);
   1605     REGEX_CHECK_STATUS;
   1606     UnicodeString  fields[10];
   1607 
   1608     int32_t n;
   1609     n = pat1->split("Now is the time", fields, 10, status);
   1610     REGEX_CHECK_STATUS;
   1611     REGEX_ASSERT(n==4);
   1612     REGEX_ASSERT(fields[0]=="Now");
   1613     REGEX_ASSERT(fields[1]=="is");
   1614     REGEX_ASSERT(fields[2]=="the");
   1615     REGEX_ASSERT(fields[3]=="time");
   1616     REGEX_ASSERT(fields[4]=="");
   1617 
   1618     n = pat1->split("Now is the time", fields, 2, status);
   1619     REGEX_CHECK_STATUS;
   1620     REGEX_ASSERT(n==2);
   1621     REGEX_ASSERT(fields[0]=="Now");
   1622     REGEX_ASSERT(fields[1]=="is the time");
   1623     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1624 
   1625     fields[1] = "*";
   1626     status = U_ZERO_ERROR;
   1627     n = pat1->split("Now is the time", fields, 1, status);
   1628     REGEX_CHECK_STATUS;
   1629     REGEX_ASSERT(n==1);
   1630     REGEX_ASSERT(fields[0]=="Now is the time");
   1631     REGEX_ASSERT(fields[1]=="*");
   1632     status = U_ZERO_ERROR;
   1633 
   1634     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1635     REGEX_CHECK_STATUS;
   1636     REGEX_ASSERT(n==6);
   1637     REGEX_ASSERT(fields[0]=="");
   1638     REGEX_ASSERT(fields[1]=="Now");
   1639     REGEX_ASSERT(fields[2]=="is");
   1640     REGEX_ASSERT(fields[3]=="the");
   1641     REGEX_ASSERT(fields[4]=="time");
   1642     REGEX_ASSERT(fields[5]=="");
   1643 
   1644     n = pat1->split("     ", fields, 10, status);
   1645     REGEX_CHECK_STATUS;
   1646     REGEX_ASSERT(n==2);
   1647     REGEX_ASSERT(fields[0]=="");
   1648     REGEX_ASSERT(fields[1]=="");
   1649 
   1650     fields[0] = "foo";
   1651     n = pat1->split("", fields, 10, status);
   1652     REGEX_CHECK_STATUS;
   1653     REGEX_ASSERT(n==0);
   1654     REGEX_ASSERT(fields[0]=="foo");
   1655 
   1656     delete pat1;
   1657 
   1658     //  split, with a pattern with (capture)
   1659     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1660     REGEX_CHECK_STATUS;
   1661 
   1662     status = U_ZERO_ERROR;
   1663     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1664     REGEX_CHECK_STATUS;
   1665     REGEX_ASSERT(n==7);
   1666     REGEX_ASSERT(fields[0]=="");
   1667     REGEX_ASSERT(fields[1]=="a");
   1668     REGEX_ASSERT(fields[2]=="Now is ");
   1669     REGEX_ASSERT(fields[3]=="b");
   1670     REGEX_ASSERT(fields[4]=="the time");
   1671     REGEX_ASSERT(fields[5]=="c");
   1672     REGEX_ASSERT(fields[6]=="");
   1673     REGEX_ASSERT(status==U_ZERO_ERROR);
   1674 
   1675     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1676     REGEX_CHECK_STATUS;
   1677     REGEX_ASSERT(n==7);
   1678     REGEX_ASSERT(fields[0]=="  ");
   1679     REGEX_ASSERT(fields[1]=="a");
   1680     REGEX_ASSERT(fields[2]=="Now is ");
   1681     REGEX_ASSERT(fields[3]=="b");
   1682     REGEX_ASSERT(fields[4]=="the time");
   1683     REGEX_ASSERT(fields[5]=="c");
   1684     REGEX_ASSERT(fields[6]=="");
   1685 
   1686     status = U_ZERO_ERROR;
   1687     fields[6] = "foo";
   1688     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1689     REGEX_CHECK_STATUS;
   1690     REGEX_ASSERT(n==6);
   1691     REGEX_ASSERT(fields[0]=="  ");
   1692     REGEX_ASSERT(fields[1]=="a");
   1693     REGEX_ASSERT(fields[2]=="Now is ");
   1694     REGEX_ASSERT(fields[3]=="b");
   1695     REGEX_ASSERT(fields[4]=="the time");
   1696     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1697     REGEX_ASSERT(fields[6]=="foo");
   1698 
   1699     status = U_ZERO_ERROR;
   1700     fields[5] = "foo";
   1701     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1702     REGEX_CHECK_STATUS;
   1703     REGEX_ASSERT(n==5);
   1704     REGEX_ASSERT(fields[0]=="  ");
   1705     REGEX_ASSERT(fields[1]=="a");
   1706     REGEX_ASSERT(fields[2]=="Now is ");
   1707     REGEX_ASSERT(fields[3]=="b");
   1708     REGEX_ASSERT(fields[4]=="the time<c>");
   1709     REGEX_ASSERT(fields[5]=="foo");
   1710 
   1711     status = U_ZERO_ERROR;
   1712     fields[5] = "foo";
   1713     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1714     REGEX_CHECK_STATUS;
   1715     REGEX_ASSERT(n==5);
   1716     REGEX_ASSERT(fields[0]=="  ");
   1717     REGEX_ASSERT(fields[1]=="a");
   1718     REGEX_ASSERT(fields[2]=="Now is ");
   1719     REGEX_ASSERT(fields[3]=="b");
   1720     REGEX_ASSERT(fields[4]=="the time");
   1721     REGEX_ASSERT(fields[5]=="foo");
   1722 
   1723     status = U_ZERO_ERROR;
   1724     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1725     REGEX_CHECK_STATUS;
   1726     REGEX_ASSERT(n==4);
   1727     REGEX_ASSERT(fields[0]=="  ");
   1728     REGEX_ASSERT(fields[1]=="a");
   1729     REGEX_ASSERT(fields[2]=="Now is ");
   1730     REGEX_ASSERT(fields[3]=="the time<c>");
   1731     status = U_ZERO_ERROR;
   1732     delete pat1;
   1733 
   1734     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1735     REGEX_CHECK_STATUS;
   1736     n = pat1->split("1-10,20", fields, 10, status);
   1737     REGEX_CHECK_STATUS;
   1738     REGEX_ASSERT(n==5);
   1739     REGEX_ASSERT(fields[0]=="1");
   1740     REGEX_ASSERT(fields[1]=="-");
   1741     REGEX_ASSERT(fields[2]=="10");
   1742     REGEX_ASSERT(fields[3]==",");
   1743     REGEX_ASSERT(fields[4]=="20");
   1744     delete pat1;
   1745 
   1746     // Test split of string with empty trailing fields
   1747     pat1 = RegexPattern::compile(",", pe, status);
   1748     REGEX_CHECK_STATUS;
   1749     n = pat1->split("a,b,c,", fields, 10, status);
   1750     REGEX_CHECK_STATUS;
   1751     REGEX_ASSERT(n==4);
   1752     REGEX_ASSERT(fields[0]=="a");
   1753     REGEX_ASSERT(fields[1]=="b");
   1754     REGEX_ASSERT(fields[2]=="c");
   1755     REGEX_ASSERT(fields[3]=="");
   1756 
   1757     n = pat1->split("a,,,", fields, 10, status);
   1758     REGEX_CHECK_STATUS;
   1759     REGEX_ASSERT(n==4);
   1760     REGEX_ASSERT(fields[0]=="a");
   1761     REGEX_ASSERT(fields[1]=="");
   1762     REGEX_ASSERT(fields[2]=="");
   1763     REGEX_ASSERT(fields[3]=="");
   1764     delete pat1;
   1765 
   1766     // Split Separator with zero length match.
   1767     pat1 = RegexPattern::compile(":?", pe, status);
   1768     REGEX_CHECK_STATUS;
   1769     n = pat1->split("abc", fields, 10, status);
   1770     REGEX_CHECK_STATUS;
   1771     REGEX_ASSERT(n==5);
   1772     REGEX_ASSERT(fields[0]=="");
   1773     REGEX_ASSERT(fields[1]=="a");
   1774     REGEX_ASSERT(fields[2]=="b");
   1775     REGEX_ASSERT(fields[3]=="c");
   1776     REGEX_ASSERT(fields[4]=="");
   1777 
   1778     delete pat1;
   1779 
   1780     //
   1781     // RegexPattern::pattern()
   1782     //
   1783     pat1 = new RegexPattern();
   1784     REGEX_ASSERT(pat1->pattern() == "");
   1785     delete pat1;
   1786 
   1787     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1788     REGEX_CHECK_STATUS;
   1789     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1790     delete pat1;
   1791 
   1792 
   1793     //
   1794     // classID functions
   1795     //
   1796     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1797     REGEX_CHECK_STATUS;
   1798     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1799     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1800     UnicodeString Hello("Hello, world.");
   1801     RegexMatcher *m = pat1->matcher(Hello, status);
   1802     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1803     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1804     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1805     delete m;
   1806     delete pat1;
   1807 
   1808 }
   1809 
   1810 //---------------------------------------------------------------------------
   1811 //
   1812 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1813 //                       is present and working, but excluding functions
   1814 //                       implementing replace operations.
   1815 //
   1816 //---------------------------------------------------------------------------
   1817 void RegexTest::API_Match_UTF8() {
   1818     UParseError         pe;
   1819     UErrorCode          status=U_ZERO_ERROR;
   1820     int32_t             flags = 0;
   1821 
   1822     //
   1823     // Debug - slide failing test cases early
   1824     //
   1825 #if 0
   1826     {
   1827     }
   1828     return;
   1829 #endif
   1830 
   1831     //
   1832     // Simple pattern compilation
   1833     //
   1834     {
   1835         UText               re = UTEXT_INITIALIZER;
   1836         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1837         REGEX_VERBOSE_TEXT(&re);
   1838         RegexPattern        *pat2;
   1839         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1840         REGEX_CHECK_STATUS;
   1841 
   1842         UText input1 = UTEXT_INITIALIZER;
   1843         UText input2 = UTEXT_INITIALIZER;
   1844         UText empty  = UTEXT_INITIALIZER;
   1845         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1846         REGEX_VERBOSE_TEXT(&input1);
   1847         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1848         REGEX_VERBOSE_TEXT(&input2);
   1849         utext_openUChars(&empty, NULL, 0, &status);
   1850 
   1851         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1852         int32_t input2Len = strlen("not abc");
   1853 
   1854 
   1855         //
   1856         // Matcher creation and reset.
   1857         //
   1858         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1859         REGEX_CHECK_STATUS;
   1860         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1861         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1862         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1863         m1->reset(&input2);
   1864         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1865         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1866         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1867         m1->reset(&input1);
   1868         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1869         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1870         m1->reset(&empty);
   1871         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1872         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1873 
   1874         //
   1875         //  reset(pos, status)
   1876         //
   1877         m1->reset(&input1);
   1878         m1->reset(4, status);
   1879         REGEX_CHECK_STATUS;
   1880         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1881         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1882 
   1883         m1->reset(-1, status);
   1884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1885         status = U_ZERO_ERROR;
   1886 
   1887         m1->reset(0, status);
   1888         REGEX_CHECK_STATUS;
   1889         status = U_ZERO_ERROR;
   1890 
   1891         m1->reset(input1Len-1, status);
   1892         REGEX_CHECK_STATUS;
   1893         status = U_ZERO_ERROR;
   1894 
   1895         m1->reset(input1Len, status);
   1896         REGEX_CHECK_STATUS;
   1897         status = U_ZERO_ERROR;
   1898 
   1899         m1->reset(input1Len+1, status);
   1900         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1901         status = U_ZERO_ERROR;
   1902 
   1903         //
   1904         // match(pos, status)
   1905         //
   1906         m1->reset(&input2);
   1907         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1908         m1->reset();
   1909         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1910         m1->reset();
   1911         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1913         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1914         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1915 
   1916         // Match() at end of string should fail, but should not
   1917         //  be an error.
   1918         status = U_ZERO_ERROR;
   1919         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1920         REGEX_CHECK_STATUS;
   1921 
   1922         // Match beyond end of string should fail with an error.
   1923         status = U_ZERO_ERROR;
   1924         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1926 
   1927         // Successful match at end of string.
   1928         {
   1929             status = U_ZERO_ERROR;
   1930             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1931             REGEX_CHECK_STATUS;
   1932             m.reset(&input1);
   1933             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1934             REGEX_CHECK_STATUS;
   1935             m.reset(&empty);
   1936             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1937             REGEX_CHECK_STATUS;
   1938         }
   1939 
   1940 
   1941         //
   1942         // lookingAt(pos, status)
   1943         //
   1944         status = U_ZERO_ERROR;
   1945         m1->reset(&input2);  // "not abc"
   1946         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1947         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1948         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1949         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1950         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1952         status = U_ZERO_ERROR;
   1953         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1954         REGEX_CHECK_STATUS;
   1955         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1957 
   1958         delete m1;
   1959         delete pat2;
   1960 
   1961         utext_close(&re);
   1962         utext_close(&input1);
   1963         utext_close(&input2);
   1964         utext_close(&empty);
   1965     }
   1966 
   1967 
   1968     //
   1969     // Capture Group.
   1970     //     RegexMatcher::start();
   1971     //     RegexMatcher::end();
   1972     //     RegexMatcher::groupCount();
   1973     //
   1974     {
   1975         int32_t             flags=0;
   1976         UParseError         pe;
   1977         UErrorCode          status=U_ZERO_ERROR;
   1978         UText               re=UTEXT_INITIALIZER;
   1979         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   1980         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   1981 
   1982         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1983         REGEX_CHECK_STATUS;
   1984 
   1985         UText input = UTEXT_INITIALIZER;
   1986         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1987         utext_openUTF8(&input, str_0123456789, -1, &status);
   1988 
   1989         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   1990         REGEX_CHECK_STATUS;
   1991         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   1992         static const int32_t matchStarts[] = {0,  2, 4, 8};
   1993         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   1994         int32_t i;
   1995         for (i=0; i<4; i++) {
   1996             int32_t actualStart = matcher->start(i, status);
   1997             REGEX_CHECK_STATUS;
   1998             if (actualStart != matchStarts[i]) {
   1999                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2000                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2001             }
   2002             int32_t actualEnd = matcher->end(i, status);
   2003             REGEX_CHECK_STATUS;
   2004             if (actualEnd != matchEnds[i]) {
   2005                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2006                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2007             }
   2008         }
   2009 
   2010         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2011         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2012 
   2013         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2014         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2015         matcher->reset();
   2016         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2017 
   2018         matcher->lookingAt(status);
   2019 
   2020         UnicodeString dest;
   2021         UText destText = UTEXT_INITIALIZER;
   2022         utext_openUnicodeString(&destText, &dest, &status);
   2023         UText *result;
   2024         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2025         //	Test shallow-clone API
   2026         int64_t   group_len;
   2027         result = matcher->group((UText *)NULL, group_len, status);
   2028         REGEX_CHECK_STATUS;
   2029         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2030         utext_close(result);
   2031         result = matcher->group(0, &destText, group_len, status);
   2032         REGEX_CHECK_STATUS;
   2033         REGEX_ASSERT(result == &destText);
   2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2035         //  destText is now immutable, reopen it
   2036         utext_close(&destText);
   2037         utext_openUnicodeString(&destText, &dest, &status);
   2038 
   2039         result = matcher->group(0, NULL, status);
   2040         REGEX_CHECK_STATUS;
   2041         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2042         utext_close(result);
   2043         result = matcher->group(0, &destText, status);
   2044         REGEX_CHECK_STATUS;
   2045         REGEX_ASSERT(result == &destText);
   2046         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2047 
   2048         result = matcher->group(1, NULL, status);
   2049         REGEX_CHECK_STATUS;
   2050         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
   2051         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2052         utext_close(result);
   2053         result = matcher->group(1, &destText, status);
   2054         REGEX_CHECK_STATUS;
   2055         REGEX_ASSERT(result == &destText);
   2056         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2057 
   2058         result = matcher->group(2, NULL, status);
   2059         REGEX_CHECK_STATUS;
   2060         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
   2061         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2062         utext_close(result);
   2063         result = matcher->group(2, &destText, status);
   2064         REGEX_CHECK_STATUS;
   2065         REGEX_ASSERT(result == &destText);
   2066         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2067 
   2068         result = matcher->group(3, NULL, status);
   2069         REGEX_CHECK_STATUS;
   2070         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
   2071         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2072         utext_close(result);
   2073         result = matcher->group(3, &destText, status);
   2074         REGEX_CHECK_STATUS;
   2075         REGEX_ASSERT(result == &destText);
   2076         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2077 
   2078         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2079         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2080         matcher->reset();
   2081         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2082 
   2083         delete matcher;
   2084         delete pat;
   2085 
   2086         utext_close(&destText);
   2087         utext_close(&input);
   2088         utext_close(&re);
   2089     }
   2090 
   2091     //
   2092     //  find
   2093     //
   2094     {
   2095         int32_t             flags=0;
   2096         UParseError         pe;
   2097         UErrorCode          status=U_ZERO_ERROR;
   2098         UText               re=UTEXT_INITIALIZER;
   2099         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2100         utext_openUTF8(&re, str_abc, -1, &status);
   2101 
   2102         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2103         REGEX_CHECK_STATUS;
   2104         UText input = UTEXT_INITIALIZER;
   2105         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2106         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2107         //                      012345678901234567
   2108 
   2109         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2110         REGEX_CHECK_STATUS;
   2111         REGEX_ASSERT(matcher->find());
   2112         REGEX_ASSERT(matcher->start(status) == 1);
   2113         REGEX_ASSERT(matcher->find());
   2114         REGEX_ASSERT(matcher->start(status) == 6);
   2115         REGEX_ASSERT(matcher->find());
   2116         REGEX_ASSERT(matcher->start(status) == 12);
   2117         REGEX_ASSERT(matcher->find() == FALSE);
   2118         REGEX_ASSERT(matcher->find() == FALSE);
   2119 
   2120         matcher->reset();
   2121         REGEX_ASSERT(matcher->find());
   2122         REGEX_ASSERT(matcher->start(status) == 1);
   2123 
   2124         REGEX_ASSERT(matcher->find(0, status));
   2125         REGEX_ASSERT(matcher->start(status) == 1);
   2126         REGEX_ASSERT(matcher->find(1, status));
   2127         REGEX_ASSERT(matcher->start(status) == 1);
   2128         REGEX_ASSERT(matcher->find(2, status));
   2129         REGEX_ASSERT(matcher->start(status) == 6);
   2130         REGEX_ASSERT(matcher->find(12, status));
   2131         REGEX_ASSERT(matcher->start(status) == 12);
   2132         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2133         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2134         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2135         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2136 
   2137         status = U_ZERO_ERROR;
   2138         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2139         status = U_ZERO_ERROR;
   2140         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2141 
   2142         REGEX_ASSERT(matcher->groupCount() == 0);
   2143 
   2144         delete matcher;
   2145         delete pat;
   2146 
   2147         utext_close(&input);
   2148         utext_close(&re);
   2149     }
   2150 
   2151 
   2152     //
   2153     //  find, with \G in pattern (true if at the end of a previous match).
   2154     //
   2155     {
   2156         int32_t             flags=0;
   2157         UParseError         pe;
   2158         UErrorCode          status=U_ZERO_ERROR;
   2159         UText               re=UTEXT_INITIALIZER;
   2160         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2161         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2162 
   2163         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2164 
   2165         REGEX_CHECK_STATUS;
   2166         UText input = UTEXT_INITIALIZER;
   2167         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2168         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2169         //                      012345678901234567
   2170 
   2171         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2172         REGEX_CHECK_STATUS;
   2173         REGEX_ASSERT(matcher->find());
   2174         REGEX_ASSERT(matcher->start(status) == 0);
   2175         REGEX_ASSERT(matcher->start(1, status) == -1);
   2176         REGEX_ASSERT(matcher->start(2, status) == 1);
   2177 
   2178         REGEX_ASSERT(matcher->find());
   2179         REGEX_ASSERT(matcher->start(status) == 4);
   2180         REGEX_ASSERT(matcher->start(1, status) == 4);
   2181         REGEX_ASSERT(matcher->start(2, status) == -1);
   2182         REGEX_CHECK_STATUS;
   2183 
   2184         delete matcher;
   2185         delete pat;
   2186 
   2187         utext_close(&input);
   2188         utext_close(&re);
   2189     }
   2190 
   2191     //
   2192     //   find with zero length matches, match position should bump ahead
   2193     //     to prevent loops.
   2194     //
   2195     {
   2196         int32_t                 i;
   2197         UErrorCode          status=U_ZERO_ERROR;
   2198         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2199                                                       //   using an always-true look-ahead.
   2200         REGEX_CHECK_STATUS;
   2201         UText s = UTEXT_INITIALIZER;
   2202         utext_openUTF8(&s, "    ", -1, &status);
   2203         m.reset(&s);
   2204         for (i=0; ; i++) {
   2205             if (m.find() == FALSE) {
   2206                 break;
   2207             }
   2208             REGEX_ASSERT(m.start(status) == i);
   2209             REGEX_ASSERT(m.end(status) == i);
   2210         }
   2211         REGEX_ASSERT(i==5);
   2212 
   2213         // Check that the bump goes over characters outside the BMP OK
   2214         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2215         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2216         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2217         m.reset(&s);
   2218         for (i=0; ; i+=4) {
   2219             if (m.find() == FALSE) {
   2220                 break;
   2221             }
   2222             REGEX_ASSERT(m.start(status) == i);
   2223             REGEX_ASSERT(m.end(status) == i);
   2224         }
   2225         REGEX_ASSERT(i==20);
   2226 
   2227         utext_close(&s);
   2228     }
   2229     {
   2230         // find() loop breaking test.
   2231         //        with pattern of /.?/, should see a series of one char matches, then a single
   2232         //        match of zero length at the end of the input string.
   2233         int32_t                 i;
   2234         UErrorCode          status=U_ZERO_ERROR;
   2235         RegexMatcher        m(".?", 0, status);
   2236         REGEX_CHECK_STATUS;
   2237         UText s = UTEXT_INITIALIZER;
   2238         utext_openUTF8(&s, "    ", -1, &status);
   2239         m.reset(&s);
   2240         for (i=0; ; i++) {
   2241             if (m.find() == FALSE) {
   2242                 break;
   2243             }
   2244             REGEX_ASSERT(m.start(status) == i);
   2245             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2246         }
   2247         REGEX_ASSERT(i==5);
   2248 
   2249         utext_close(&s);
   2250     }
   2251 
   2252 
   2253     //
   2254     // Matchers with no input string behave as if they had an empty input string.
   2255     //
   2256 
   2257     {
   2258         UErrorCode status = U_ZERO_ERROR;
   2259         RegexMatcher  m(".?", 0, status);
   2260         REGEX_CHECK_STATUS;
   2261         REGEX_ASSERT(m.find());
   2262         REGEX_ASSERT(m.start(status) == 0);
   2263         REGEX_ASSERT(m.input() == "");
   2264     }
   2265     {
   2266         UErrorCode status = U_ZERO_ERROR;
   2267         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2268         RegexMatcher  *m = p->matcher(status);
   2269         REGEX_CHECK_STATUS;
   2270 
   2271         REGEX_ASSERT(m->find() == FALSE);
   2272         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2273         delete m;
   2274         delete p;
   2275     }
   2276 
   2277     //
   2278     // Regions
   2279     //
   2280     {
   2281         UErrorCode status = U_ZERO_ERROR;
   2282         UText testPattern = UTEXT_INITIALIZER;
   2283         UText testText    = UTEXT_INITIALIZER;
   2284         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2285         REGEX_VERBOSE_TEXT(&testPattern);
   2286         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2287         REGEX_VERBOSE_TEXT(&testText);
   2288 
   2289         RegexMatcher m(&testPattern, &testText, 0, status);
   2290         REGEX_CHECK_STATUS;
   2291         REGEX_ASSERT(m.regionStart() == 0);
   2292         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2293         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2294         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2295 
   2296         m.region(2,4, status);
   2297         REGEX_CHECK_STATUS;
   2298         REGEX_ASSERT(m.matches(status));
   2299         REGEX_ASSERT(m.start(status)==2);
   2300         REGEX_ASSERT(m.end(status)==4);
   2301         REGEX_CHECK_STATUS;
   2302 
   2303         m.reset();
   2304         REGEX_ASSERT(m.regionStart() == 0);
   2305         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2306 
   2307         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2308         REGEX_VERBOSE_TEXT(&testText);
   2309         m.reset(&testText);
   2310         REGEX_ASSERT(m.regionStart() == 0);
   2311         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2312 
   2313         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2314         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2315         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2316         REGEX_ASSERT(&m == &m.reset());
   2317         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2318 
   2319         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2320         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2321         REGEX_ASSERT(&m == &m.reset());
   2322         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2323 
   2324         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2325         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2326         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2327         REGEX_ASSERT(&m == &m.reset());
   2328         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2329 
   2330         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2331         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2332         REGEX_ASSERT(&m == &m.reset());
   2333         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2334 
   2335         utext_close(&testText);
   2336         utext_close(&testPattern);
   2337     }
   2338 
   2339     //
   2340     // hitEnd() and requireEnd()
   2341     //
   2342     {
   2343         UErrorCode status = U_ZERO_ERROR;
   2344         UText testPattern = UTEXT_INITIALIZER;
   2345         UText testText    = UTEXT_INITIALIZER;
   2346         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2347         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2348         utext_openUTF8(&testPattern, str_, -1, &status);
   2349         utext_openUTF8(&testText, str_aabb, -1, &status);
   2350 
   2351         RegexMatcher m1(&testPattern, &testText,  0, status);
   2352         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2353         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2354         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2355         REGEX_CHECK_STATUS;
   2356 
   2357         status = U_ZERO_ERROR;
   2358         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2359         utext_openUTF8(&testPattern, str_a, -1, &status);
   2360         RegexMatcher m2(&testPattern, &testText, 0, status);
   2361         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2362         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2363         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2364         REGEX_CHECK_STATUS;
   2365 
   2366         status = U_ZERO_ERROR;
   2367         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2368         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2369         RegexMatcher m3(&testPattern, &testText, 0, status);
   2370         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2371         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2372         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2373         REGEX_CHECK_STATUS;
   2374 
   2375         utext_close(&testText);
   2376         utext_close(&testPattern);
   2377     }
   2378 }
   2379 
   2380 
   2381 //---------------------------------------------------------------------------
   2382 //
   2383 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2384 //                         Replace family of functions.
   2385 //
   2386 //---------------------------------------------------------------------------
   2387 void RegexTest::API_Replace_UTF8() {
   2388     //
   2389     //  Replace
   2390     //
            //  Exercises the UText-based overloads of RegexMatcher::replaceFirst(),
            //  replaceAll(), appendReplacement() and appendTail().
            //
            //  Most replaceFirst()/replaceAll() cases below are run twice:
            //    - with a NULL destination: the returned UText is checked and then
            //      closed by the test with utext_close();
            //    - with &destText as the destination: the returned pointer is
            //      asserted to be &destText itself, holding the same expected text.
            //
            //  Test strings are written as arrays of hex byte values, with the
            //  intended text shown in the trailing comment.
            //  NOTE(review): presumably this keeps the data UTF-8 regardless of the
            //  compiler's execution charset (see the ASCII/EBCDIC warning at the top
            //  of this file) -- confirm.
            //
            //  The REGEX_* macros used throughout are defined outside this function.
            //
   2391     int32_t             flags=0;
   2392     UParseError         pe;
   2393     UErrorCode          status=U_ZERO_ERROR;
   2394 
            // First pattern under test: the literal "abc".
   2395     UText               re=UTEXT_INITIALIZER;
   2396     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2397     REGEX_VERBOSE_TEXT(&re);
   2398     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2399     REGEX_CHECK_STATUS;
   2400 
   2401     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2402     //             012345678901234567
   2403     UText dataText = UTEXT_INITIALIZER;
   2404     utext_openUTF8(&dataText, data, -1, &status);
   2405     REGEX_CHECK_STATUS;
   2406     REGEX_VERBOSE_TEXT(&dataText);
            // Both 'matcher' and 'pat' are deleted at the end of this function.
   2407     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2408 
   2409     //
   2410     //  Plain vanilla matches.
   2411     //
            // destText wraps the (initially empty) UnicodeString 'dest' and serves as
            // the caller-supplied destination for the rest of the function.
   2412     UnicodeString  dest;
   2413     UText destText = UTEXT_INITIALIZER;
   2414     utext_openUnicodeString(&destText, &dest, &status);
   2415     UText *result;
   2416 
   2417     UText replText = UTEXT_INITIALIZER;
   2418 
   2419     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2420     utext_openUTF8(&replText, str_yz, -1, &status);
   2421     REGEX_VERBOSE_TEXT(&replText);
            // replaceFirst: only the first "abc" is expected to become "yz".
   2422     result = matcher->replaceFirst(&replText, NULL, status);
   2423     REGEX_CHECK_STATUS;
   2424     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2425     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2426     utext_close(result);
   2427     result = matcher->replaceFirst(&replText, &destText, status);
   2428     REGEX_CHECK_STATUS;
   2429     REGEX_ASSERT(result == &destText);
   2430     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2431 
            // replaceAll: every "abc" is expected to become "yz".
   2432     result = matcher->replaceAll(&replText, NULL, status);
   2433     REGEX_CHECK_STATUS;
   2434     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2435     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2436     utext_close(result);
   2437 
            // Empty destText (replace its entire native range with zero-length
            // content) before it is reused as a destination.  The same idiom
            // recurs before many of the destination-based calls below.
   2438     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2439     result = matcher->replaceAll(&replText, &destText, status);
   2440     REGEX_CHECK_STATUS;
   2441     REGEX_ASSERT(result == &destText);
   2442     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2443 
   2444     //
   2445     //  Plain vanilla non-matches.
   2446     //
            //  The input contains "abx" rather than "abc", so the expected output of
            //  every call in this section is the unchanged input text.
   2447     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2448     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2449     matcher->reset(&dataText);
   2450 
   2451     result = matcher->replaceFirst(&replText, NULL, status);
   2452     REGEX_CHECK_STATUS;
   2453     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2454     utext_close(result);
   2455     result = matcher->replaceFirst(&replText, &destText, status);
   2456     REGEX_CHECK_STATUS;
   2457     REGEX_ASSERT(result == &destText);
   2458     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2459 
   2460     result = matcher->replaceAll(&replText, NULL, status);
   2461     REGEX_CHECK_STATUS;
   2462     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2463     utext_close(result);
   2464     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2465     result = matcher->replaceAll(&replText, &destText, status);
   2466     REGEX_CHECK_STATUS;
   2467     REGEX_ASSERT(result == &destText);
   2468     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2469 
   2470     //
   2471     // Empty source string
   2472     //
            // A NULL pointer with length 0 is opened as the (zero-length) input; the
            // expected output of every call in this section is the empty string.
   2473     utext_openUTF8(&dataText, NULL, 0, &status);
   2474     matcher->reset(&dataText);
   2475 
   2476     result = matcher->replaceFirst(&replText, NULL, status);
   2477     REGEX_CHECK_STATUS;
   2478     REGEX_ASSERT_UTEXT_UTF8("", result);
   2479     utext_close(result);
   2480     result = matcher->replaceFirst(&replText, &destText, status);
   2481     REGEX_CHECK_STATUS;
   2482     REGEX_ASSERT(result == &destText);
   2483     REGEX_ASSERT_UTEXT_UTF8("", result);
   2484 
   2485     result = matcher->replaceAll(&replText, NULL, status);
   2486     REGEX_CHECK_STATUS;
   2487     REGEX_ASSERT_UTEXT_UTF8("", result);
   2488     utext_close(result);
   2489     result = matcher->replaceAll(&replText, &destText, status);
   2490     REGEX_CHECK_STATUS;
   2491     REGEX_ASSERT(result == &destText);
   2492     REGEX_ASSERT_UTEXT_UTF8("", result);
   2493 
   2494     //
   2495     // Empty substitution string
   2496     //
            // Matches of "abc" are expected to be removed from the output.
   2497     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2498     matcher->reset(&dataText);
   2499 
   2500     utext_openUTF8(&replText, NULL, 0, &status);
   2501     result = matcher->replaceFirst(&replText, NULL, status);
   2502     REGEX_CHECK_STATUS;
   2503     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2504     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2505     utext_close(result);
   2506     result = matcher->replaceFirst(&replText, &destText, status);
   2507     REGEX_CHECK_STATUS;
   2508     REGEX_ASSERT(result == &destText);
   2509     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2510 
   2511     result = matcher->replaceAll(&replText, NULL, status);
   2512     REGEX_CHECK_STATUS;
   2513     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2514     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2515     utext_close(result);
   2516     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2517     result = matcher->replaceAll(&replText, &destText, status);
   2518     REGEX_CHECK_STATUS;
   2519     REGEX_ASSERT(result == &destText);
   2520     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2521 
   2522     //
   2523     // match whole string
   2524     //
            // Input is exactly "abc", so the expected output is exactly the
            // replacement text "xyz".
   2525     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2526     utext_openUTF8(&dataText, str_abc, -1, &status);
   2527     matcher->reset(&dataText);
   2528 
   2529     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2530     utext_openUTF8(&replText, str_xyz, -1, &status);
   2531     result = matcher->replaceFirst(&replText, NULL, status);
   2532     REGEX_CHECK_STATUS;
   2533     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2534     utext_close(result);
   2535     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2536     result = matcher->replaceFirst(&replText, &destText, status);
   2537     REGEX_CHECK_STATUS;
   2538     REGEX_ASSERT(result == &destText);
   2539     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2540 
   2541     result = matcher->replaceAll(&replText, NULL, status);
   2542     REGEX_CHECK_STATUS;
   2543     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2544     utext_close(result);
   2545     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2546     result = matcher->replaceAll(&replText, &destText, status);
   2547     REGEX_CHECK_STATUS;
   2548     REGEX_ASSERT(result == &destText);
   2549     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2550 
   2551     //
   2552     // Capture Group, simple case
   2553     //
            // Second pattern under test: "a(..)", which has a single capture group.
            // 'pat2' and 'matcher2' are deleted at the end of this function.
   2554     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2555     utext_openUTF8(&re, str_add, -1, &status);
   2556     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2557     REGEX_CHECK_STATUS;
   2558 
   2559     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2560     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2561     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2562     REGEX_CHECK_STATUS;
   2563 
            // "$1$1": the matched "abc" is expected to become group 1 ("bc") twice.
   2564     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2565     utext_openUTF8(&replText, str_11, -1, &status);
   2566     result = matcher2->replaceFirst(&replText, NULL, status);
   2567     REGEX_CHECK_STATUS;
   2568     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2569     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2570     utext_close(result);
   2571     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2572     result = matcher2->replaceFirst(&replText, &destText, status);
   2573     REGEX_CHECK_STATUS;
   2574     REGEX_ASSERT(result == &destText);
   2575     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2576 
            // Backslash-escaped "\$1" is expected to come out as a literal "$1",
            // while the unescaped "$1" is expected to be replaced by group 1 ("bc").
   2577     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2578     utext_openUTF8(&replText, str_v, -1, &status);
   2579     REGEX_VERBOSE_TEXT(&replText);
   2580     result = matcher2->replaceFirst(&replText, NULL, status);
   2581     REGEX_CHECK_STATUS;
   2582     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2583     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2584     utext_close(result);
   2585     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2586     result = matcher2->replaceFirst(&replText, &destText, status);
   2587     REGEX_CHECK_STATUS;
   2588     REGEX_ASSERT(result == &destText);
   2589     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2590 
            // '$' characters that are not followed by a group number are expected to
            // be copied to the output unchanged.
   2591     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
   2592     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2593     result = matcher2->replaceFirst(&replText, NULL, status);
   2594     REGEX_CHECK_STATUS;
   2595     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2596     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2597     utext_close(result);
   2598     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2599     result = matcher2->replaceFirst(&replText, &destText, status);
   2600     REGEX_CHECK_STATUS;
   2601     REGEX_ASSERT(result == &destText);
   2602     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2603 
            // '$' followed by a supplementary-plane digit.  The four 'x' placeholder
            // bytes at indexes 22..25 are overwritten with F0 9D 9F 8F (U+1D7CF in
            // UTF-8, per the commented-out declaration below).  The expected output
            // has "bc" (group 1) at that position.
   2604     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2605     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2606     //                                 012345678901234567890123456
   2607     supplDigitChars[22] = 0xF0;
   2608     supplDigitChars[23] = 0x9D;
   2609     supplDigitChars[24] = 0x9F;
   2610     supplDigitChars[25] = 0x8F;
   2611     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2612 
   2613     result = matcher2->replaceFirst(&replText, NULL, status);
   2614     REGEX_CHECK_STATUS;
   2615     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2616     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2617     utext_close(result);
   2618     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2619     result = matcher2->replaceFirst(&replText, &destText, status);
   2620     REGEX_CHECK_STATUS;
   2621     REGEX_ASSERT(result == &destText);
   2622     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
            // The pattern has only one capture group, so a reference to group 5 is
            // expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
            // NOTE(review): 'status' is not reset explicitly after these calls;
            // presumably REGEX_ASSERT_FAIL restores it -- confirm against the macro.
   2623     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2624     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2625     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2626 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2627     utext_close(result);
   2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2629     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2630     REGEX_ASSERT(result == &destText);
   2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2632 
   2633     //
   2634     // Replacement String with \u hex escapes
   2635     //
            // These two blocks go back to the first matcher (pattern "abc").
   2636     {
            // The escape \u0043 in the replacement is expected to yield 'C'.
   2637       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2638       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2639         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2640         utext_openUTF8(&replText, str_u0043, -1, &status);
   2641         matcher->reset(&dataText);
   2642 
   2643         result = matcher->replaceAll(&replText, NULL, status);
   2644         REGEX_CHECK_STATUS;
   2645         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2646         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2647         utext_close(result);
   2648         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2649         result = matcher->replaceAll(&replText, &destText, status);
   2650         REGEX_CHECK_STATUS;
   2651         REGEX_ASSERT(result == &destText);
   2652         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2653     }
   2654     {
            // Same idea with the 8-digit escape \U00010000.  This str_abc shadows
            // the function-level array of the same name within this block.
   2655       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2656         utext_openUTF8(&dataText, str_abc, -1, &status);
   2657         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2658         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2659         matcher->reset(&dataText);
   2660 
            // The four 'x' placeholders (indexes 2..5) are overwritten with the
            // bytes F0 90 80 80 to form the expected UTF-8 output.
   2661         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2662         //                          0123456789
   2663         expected[2] = 0xF0;
   2664         expected[3] = 0x90;
   2665         expected[4] = 0x80;
   2666         expected[5] = 0x80;
   2667 
   2668         result = matcher->replaceAll(&replText, NULL, status);
   2669         REGEX_CHECK_STATUS;
   2670         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2671         utext_close(result);
   2672         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2673         result = matcher->replaceAll(&replText, &destText, status);
   2674         REGEX_CHECK_STATUS;
   2675         REGEX_ASSERT(result == &destText);
   2676         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2677     }
   2678     // TODO:  need more thorough testing of capture substitutions.
   2679 
   2680     // Bug 4057
   2681     //
            // appendReplacement() is expected to copy input starting from the
            // beginning of the string (or from the end of the previous append),
            // regardless of how many find() calls were made or where matching was
            // started.  Each of the three scenarios below calls find() twice and
            // then expects the same output: everything before the second match,
            // followed by the replacement "ooh".
   2682     {
   2683         status = U_ZERO_ERROR;
   2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2687         utext_openUTF8(&re, str_ssee, -1, &status);
   2688         utext_openUTF8(&dataText, str_blah, -1, &status);
   2689         utext_openUTF8(&replText, str_ooh, -1, &status);
   2690 
   2691         RegexMatcher m(&re, 0, status);
   2692         REGEX_CHECK_STATUS;
   2693 
            // Note: this UnicodeString 'result' shadows the function-level
            // 'UText *result' for the remainder of this block.
   2694         UnicodeString result;
   2695         UText resultText = UTEXT_INITIALIZER;
   2696         utext_openUnicodeString(&resultText, &result, &status);
   2697 
   2698         // Multiple finds do NOT bump up the previous appendReplacement position.
   2699         m.reset(&dataText);
   2700         m.find();
   2701         m.find();
   2702         m.appendReplacement(&resultText, &replText, status);
   2703         REGEX_CHECK_STATUS;
   2704         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2705         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2706 
   2707         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2708         status = U_ZERO_ERROR;
   2709         result.truncate(0);
   2710         utext_openUnicodeString(&resultText, &result, &status);
   2711         m.reset(10, status);
   2712         m.find();
   2713         m.find();
   2714         m.appendReplacement(&resultText, &replText, status);
   2715         REGEX_CHECK_STATUS;
   2716         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2717         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2718 
   2719         // find() at interior of string, appendReplacement still starts at beginning.
   2720         status = U_ZERO_ERROR;
   2721         result.truncate(0);
   2722         utext_openUnicodeString(&resultText, &result, &status);
   2723         m.reset();
   2724         m.find(10, status);
   2725         m.find();
   2726         m.appendReplacement(&resultText, &replText, status);
   2727         REGEX_CHECK_STATUS;
   2728         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2729         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2730 
            // appendTail() is expected to add the input that follows the last
            // match (" fin") to what appendReplacement() produced above.
   2731         m.appendTail(&resultText, status);
   2732         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2733         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2734 
   2735         utext_close(&resultText);
   2736     }
   2737 
            // Cleanup: delete the heap-allocated matchers and patterns, then close
            // the stack-allocated UTexts that were (re)opened throughout the test.
   2738     delete matcher2;
   2739     delete pat2;
   2740     delete matcher;
   2741     delete pat;
   2742 
   2743     utext_close(&dataText);
   2744     utext_close(&replText);
   2745     utext_close(&destText);
   2746     utext_close(&re);
   2747 }
   2748 
   2749 
   2750 //---------------------------------------------------------------------------
   2751 //
   2752 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2753 //                        present and nominally working.
   2754 //
   2755 //---------------------------------------------------------------------------
   2756 void RegexTest::API_Pattern_UTF8() {
   2757     RegexPattern        pata;    // Test default constructor to not crash.
   2758     RegexPattern        patb;
   2759 
   2760     REGEX_ASSERT(pata == patb);
   2761     REGEX_ASSERT(pata == pata);
   2762 
   2763     UText         re1 = UTEXT_INITIALIZER;
   2764     UText         re2 = UTEXT_INITIALIZER;
   2765     UErrorCode    status = U_ZERO_ERROR;
   2766     UParseError   pe;
   2767 
   2768     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2769     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2770     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2771     utext_openUTF8(&re2, str_def, -1, &status);
   2772 
   2773     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2774     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2775     REGEX_CHECK_STATUS;
   2776     REGEX_ASSERT(*pat1 == *pat1);
   2777     REGEX_ASSERT(*pat1 != pata);
   2778 
   2779     // Assign
   2780     patb = *pat1;
   2781     REGEX_ASSERT(patb == *pat1);
   2782 
   2783     // Copy Construct
   2784     RegexPattern patc(*pat1);
   2785     REGEX_ASSERT(patc == *pat1);
   2786     REGEX_ASSERT(patb == patc);
   2787     REGEX_ASSERT(pat1 != pat2);
   2788     patb = *pat2;
   2789     REGEX_ASSERT(patb != patc);
   2790     REGEX_ASSERT(patb == *pat2);
   2791 
   2792     // Compile with no flags.
   2793     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2794     REGEX_ASSERT(*pat1a == *pat1);
   2795 
   2796     REGEX_ASSERT(pat1a->flags() == 0);
   2797 
   2798     // Compile with different flags should be not equal
   2799     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2800     REGEX_CHECK_STATUS;
   2801 
   2802     REGEX_ASSERT(*pat1b != *pat1a);
   2803     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2804     REGEX_ASSERT(pat1a->flags() == 0);
   2805     delete pat1b;
   2806 
   2807     // clone
   2808     RegexPattern *pat1c = pat1->clone();
   2809     REGEX_ASSERT(*pat1c == *pat1);
   2810     REGEX_ASSERT(*pat1c != *pat2);
   2811 
   2812     delete pat1c;
   2813     delete pat1a;
   2814     delete pat1;
   2815     delete pat2;
   2816 
   2817     utext_close(&re1);
   2818     utext_close(&re2);
   2819 
   2820 
   2821     //
   2822     //   Verify that a matcher created from a cloned pattern works.
   2823     //     (Jitterbug 3423)
   2824     //
   2825     {
   2826         UErrorCode     status     = U_ZERO_ERROR;
   2827         UText          pattern    = UTEXT_INITIALIZER;
   2828         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2829         utext_openUTF8(&pattern, str_pL, -1, &status);
   2830 
   2831         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2832         RegexPattern  *pClone     = pSource->clone();
   2833         delete         pSource;
   2834         RegexMatcher  *mFromClone = pClone->matcher(status);
   2835         REGEX_CHECK_STATUS;
   2836 
   2837         UText          input      = UTEXT_INITIALIZER;
   2838         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2839         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2840         mFromClone->reset(&input);
   2841         REGEX_ASSERT(mFromClone->find() == TRUE);
   2842         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2843         REGEX_ASSERT(mFromClone->find() == TRUE);
   2844         REGEX_ASSERT(mFromClone->group(status) == "World");
   2845         REGEX_ASSERT(mFromClone->find() == FALSE);
   2846         delete mFromClone;
   2847         delete pClone;
   2848 
   2849         utext_close(&input);
   2850         utext_close(&pattern);
   2851     }
   2852 
   2853     //
   2854     //   matches convenience API
   2855     //
   2856     {
   2857         UErrorCode status  = U_ZERO_ERROR;
   2858         UText      pattern = UTEXT_INITIALIZER;
   2859         UText      input   = UTEXT_INITIALIZER;
   2860 
   2861         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2862         utext_openUTF8(&input, str_randominput, -1, &status);
   2863 
   2864         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2865         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2866         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2867         REGEX_CHECK_STATUS;
   2868 
   2869         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2870         utext_openUTF8(&pattern, str_abc, -1, &status);
   2871         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2872         REGEX_CHECK_STATUS;
   2873 
   2874         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2875         utext_openUTF8(&pattern, str_nput, -1, &status);
   2876         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2877         REGEX_CHECK_STATUS;
   2878 
   2879         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2880         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2881         REGEX_CHECK_STATUS;
   2882 
   2883         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2884         utext_openUTF8(&pattern, str_u, -1, &status);
   2885         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2886         REGEX_CHECK_STATUS;
   2887 
   2888         utext_openUTF8(&input, str_abc, -1, &status);
   2889         utext_openUTF8(&pattern, str_abc, -1, &status);
   2890         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2891         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2892         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2893 
   2894         utext_close(&input);
   2895         utext_close(&pattern);
   2896     }
   2897 
   2898 
   2899     //
   2900     // Split()
   2901     //
   2902     status = U_ZERO_ERROR;
   2903     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2904     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2905     pat1 = RegexPattern::compile(&re1, pe, status);
   2906     REGEX_CHECK_STATUS;
   2907     UnicodeString  fields[10];
   2908 
   2909     int32_t n;
   2910     n = pat1->split("Now is the time", fields, 10, status);
   2911     REGEX_CHECK_STATUS;
   2912     REGEX_ASSERT(n==4);
   2913     REGEX_ASSERT(fields[0]=="Now");
   2914     REGEX_ASSERT(fields[1]=="is");
   2915     REGEX_ASSERT(fields[2]=="the");
   2916     REGEX_ASSERT(fields[3]=="time");
   2917     REGEX_ASSERT(fields[4]=="");
   2918 
   2919     n = pat1->split("Now is the time", fields, 2, status);
   2920     REGEX_CHECK_STATUS;
   2921     REGEX_ASSERT(n==2);
   2922     REGEX_ASSERT(fields[0]=="Now");
   2923     REGEX_ASSERT(fields[1]=="is the time");
   2924     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2925 
   2926     fields[1] = "*";
   2927     status = U_ZERO_ERROR;
   2928     n = pat1->split("Now is the time", fields, 1, status);
   2929     REGEX_CHECK_STATUS;
   2930     REGEX_ASSERT(n==1);
   2931     REGEX_ASSERT(fields[0]=="Now is the time");
   2932     REGEX_ASSERT(fields[1]=="*");
   2933     status = U_ZERO_ERROR;
   2934 
   2935     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2936     REGEX_CHECK_STATUS;
   2937     REGEX_ASSERT(n==6);
   2938     REGEX_ASSERT(fields[0]=="");
   2939     REGEX_ASSERT(fields[1]=="Now");
   2940     REGEX_ASSERT(fields[2]=="is");
   2941     REGEX_ASSERT(fields[3]=="the");
   2942     REGEX_ASSERT(fields[4]=="time");
   2943     REGEX_ASSERT(fields[5]=="");
   2944     REGEX_ASSERT(fields[6]=="");
   2945 
   2946     fields[2] = "*";
   2947     n = pat1->split("     ", fields, 10, status);
   2948     REGEX_CHECK_STATUS;
   2949     REGEX_ASSERT(n==2);
   2950     REGEX_ASSERT(fields[0]=="");
   2951     REGEX_ASSERT(fields[1]=="");
   2952     REGEX_ASSERT(fields[2]=="*");
   2953 
   2954     fields[0] = "foo";
   2955     n = pat1->split("", fields, 10, status);
   2956     REGEX_CHECK_STATUS;
   2957     REGEX_ASSERT(n==0);
   2958     REGEX_ASSERT(fields[0]=="foo");
   2959 
   2960     delete pat1;
   2961 
   2962     //  split, with a pattern with (capture)
   2963     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   2964     pat1 = RegexPattern::compile(&re1,  pe, status);
   2965     REGEX_CHECK_STATUS;
   2966 
   2967     status = U_ZERO_ERROR;
   2968     fields[6] = fields[7] = "*";
   2969     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2970     REGEX_CHECK_STATUS;
   2971     REGEX_ASSERT(n==7);
   2972     REGEX_ASSERT(fields[0]=="");
   2973     REGEX_ASSERT(fields[1]=="a");
   2974     REGEX_ASSERT(fields[2]=="Now is ");
   2975     REGEX_ASSERT(fields[3]=="b");
   2976     REGEX_ASSERT(fields[4]=="the time");
   2977     REGEX_ASSERT(fields[5]=="c");
   2978     REGEX_ASSERT(fields[6]=="");
   2979     REGEX_ASSERT(fields[7]=="*");
   2980     REGEX_ASSERT(status==U_ZERO_ERROR);
   2981 
   2982     fields[6] = fields[7] = "*";
   2983     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2984     REGEX_CHECK_STATUS;
   2985     REGEX_ASSERT(n==7);
   2986     REGEX_ASSERT(fields[0]=="  ");
   2987     REGEX_ASSERT(fields[1]=="a");
   2988     REGEX_ASSERT(fields[2]=="Now is ");
   2989     REGEX_ASSERT(fields[3]=="b");
   2990     REGEX_ASSERT(fields[4]=="the time");
   2991     REGEX_ASSERT(fields[5]=="c");
   2992     REGEX_ASSERT(fields[6]=="");
   2993     REGEX_ASSERT(fields[7]=="*");
   2994 
   2995     status = U_ZERO_ERROR;
   2996     fields[6] = "foo";
   2997     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   2998     REGEX_CHECK_STATUS;
   2999     REGEX_ASSERT(n==6);
   3000     REGEX_ASSERT(fields[0]=="  ");
   3001     REGEX_ASSERT(fields[1]=="a");
   3002     REGEX_ASSERT(fields[2]=="Now is ");
   3003     REGEX_ASSERT(fields[3]=="b");
   3004     REGEX_ASSERT(fields[4]=="the time");
   3005     REGEX_ASSERT(fields[5]==" ");
   3006     REGEX_ASSERT(fields[6]=="foo");
   3007 
   3008     status = U_ZERO_ERROR;
   3009     fields[5] = "foo";
   3010     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3011     REGEX_CHECK_STATUS;
   3012     REGEX_ASSERT(n==5);
   3013     REGEX_ASSERT(fields[0]=="  ");
   3014     REGEX_ASSERT(fields[1]=="a");
   3015     REGEX_ASSERT(fields[2]=="Now is ");
   3016     REGEX_ASSERT(fields[3]=="b");
   3017     REGEX_ASSERT(fields[4]=="the time<c>");
   3018     REGEX_ASSERT(fields[5]=="foo");
   3019 
   3020     status = U_ZERO_ERROR;
   3021     fields[5] = "foo";
   3022     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3023     REGEX_CHECK_STATUS;
   3024     REGEX_ASSERT(n==5);
   3025     REGEX_ASSERT(fields[0]=="  ");
   3026     REGEX_ASSERT(fields[1]=="a");
   3027     REGEX_ASSERT(fields[2]=="Now is ");
   3028     REGEX_ASSERT(fields[3]=="b");
   3029     REGEX_ASSERT(fields[4]=="the time");
   3030     REGEX_ASSERT(fields[5]=="foo");
   3031 
   3032     status = U_ZERO_ERROR;
   3033     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3034     REGEX_CHECK_STATUS;
   3035     REGEX_ASSERT(n==4);
   3036     REGEX_ASSERT(fields[0]=="  ");
   3037     REGEX_ASSERT(fields[1]=="a");
   3038     REGEX_ASSERT(fields[2]=="Now is ");
   3039     REGEX_ASSERT(fields[3]=="the time<c>");
   3040     status = U_ZERO_ERROR;
   3041     delete pat1;
   3042 
   3043     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3044     pat1 = RegexPattern::compile(&re1, pe, status);
   3045     REGEX_CHECK_STATUS;
   3046     n = pat1->split("1-10,20", fields, 10, status);
   3047     REGEX_CHECK_STATUS;
   3048     REGEX_ASSERT(n==5);
   3049     REGEX_ASSERT(fields[0]=="1");
   3050     REGEX_ASSERT(fields[1]=="-");
   3051     REGEX_ASSERT(fields[2]=="10");
   3052     REGEX_ASSERT(fields[3]==",");
   3053     REGEX_ASSERT(fields[4]=="20");
   3054     delete pat1;
   3055 
   3056 
   3057     //
   3058     // RegexPattern::pattern() and patternText()
   3059     //
   3060     pat1 = new RegexPattern();
   3061     REGEX_ASSERT(pat1->pattern() == "");
   3062     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3063     delete pat1;
   3064     const char *helloWorldInvariant = "(Hello, world)*";
   3065     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3066     pat1 = RegexPattern::compile(&re1, pe, status);
   3067     REGEX_CHECK_STATUS;
   3068     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
   3069     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3070     delete pat1;
   3071 
   3072     utext_close(&re1);
   3073 }
   3074 
   3075 
   3076 //---------------------------------------------------------------------------
   3077 //
   3078 //      Extended       A more thorough check for features of regex patterns
   3079 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
   3081 //                     A description of the test data format is included in that file.
   3082 //
   3083 //---------------------------------------------------------------------------
   3084 
   3085 const char *
   3086 RegexTest::getPath(char buffer[2048], const char *filename) {
   3087     UErrorCode status=U_ZERO_ERROR;
   3088     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3089     if (U_FAILURE(status)) {
   3090         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3091         return NULL;
   3092     }
   3093 
   3094     strcpy(buffer, testDataDirectory);
   3095     strcat(buffer, filename);
   3096     return buffer;
   3097 }
   3098 
   3099 void RegexTest::Extended() {
   3100     char tdd[2048];
   3101     const char *srcPath;
   3102     UErrorCode  status  = U_ZERO_ERROR;
   3103     int32_t     lineNum = 0;
   3104 
   3105     //
   3106     //  Open and read the test data file.
   3107     //
   3108     srcPath=getPath(tdd, "regextst.txt");
   3109     if(srcPath==NULL) {
   3110         return; /* something went wrong, error already output */
   3111     }
   3112 
   3113     int32_t    len;
   3114     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3115     if (U_FAILURE(status)) {
   3116         return; /* something went wrong, error already output */
   3117     }
   3118 
   3119     //
   3120     //  Put the test data into a UnicodeString
   3121     //
   3122     UnicodeString testString(FALSE, testData, len);
   3123 
   3124     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3125     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3126     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3127 
   3128     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3129     UnicodeString   testPattern;   // The pattern for test from the test file.
   3130     UnicodeString   testFlags;     // the flags   for a test.
   3131     UnicodeString   matchString;   // The marked up string to be used as input
   3132 
   3133     if (U_FAILURE(status)){
   3134         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3135         delete [] testData;
   3136         return;
   3137     }
   3138 
   3139     //
   3140     //  Loop over the test data file, once per line.
   3141     //
   3142     while (lineMat.find()) {
   3143         lineNum++;
   3144         if (U_FAILURE(status)) {
   3145           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3146         }
   3147 
   3148         status = U_ZERO_ERROR;
   3149         UnicodeString testLine = lineMat.group(1, status);
   3150         if (testLine.length() == 0) {
   3151             continue;
   3152         }
   3153 
   3154         //
   3155         // Parse the test line.  Skip blank and comment only lines.
   3156         // Separate out the three main fields - pattern, flags, target.
   3157         //
   3158 
   3159         commentMat.reset(testLine);
   3160         if (commentMat.lookingAt(status)) {
   3161             // This line is a comment, or blank.
   3162             continue;
   3163         }
   3164 
   3165         //
   3166         //  Pull out the pattern field, remove it from the test file line.
   3167         //
   3168         quotedStuffMat.reset(testLine);
   3169         if (quotedStuffMat.lookingAt(status)) {
   3170             testPattern = quotedStuffMat.group(2, status);
   3171             testLine.remove(0, quotedStuffMat.end(0, status));
   3172         } else {
   3173             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3174             continue;
   3175         }
   3176 
   3177 
   3178         //
   3179         //  Pull out the flags from the test file line.
   3180         //
   3181         flagsMat.reset(testLine);
   3182         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3183         testFlags = flagsMat.group(1, status);
   3184         if (flagsMat.group(2, status).length() > 0) {
   3185             errln("Bad Match flag at line %d. Scanning %c\n",
   3186                 lineNum, flagsMat.group(2, status).charAt(0));
   3187             continue;
   3188         }
   3189         testLine.remove(0, flagsMat.end(0, status));
   3190 
   3191         //
   3192         //  Pull out the match string, as a whole.
   3193         //    We'll process the <tags> later.
   3194         //
   3195         quotedStuffMat.reset(testLine);
   3196         if (quotedStuffMat.lookingAt(status)) {
   3197             matchString = quotedStuffMat.group(2, status);
   3198             testLine.remove(0, quotedStuffMat.end(0, status));
   3199         } else {
   3200             errln("Bad match string at test file line %d", lineNum);
   3201             continue;
   3202         }
   3203 
   3204         //
   3205         //  The only thing left from the input line should be an optional trailing comment.
   3206         //
   3207         commentMat.reset(testLine);
   3208         if (commentMat.lookingAt(status) == FALSE) {
   3209             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3210             continue;
   3211         }
   3212 
   3213         //
   3214         //  Run the test
   3215         //
   3216         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3217     }
   3218 
   3219     delete [] testData;
   3220 
   3221 }
   3222 
   3223 
   3224 
   3225 //---------------------------------------------------------------------------
   3226 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
   3228 //
   3229 //         Function to run a single test from the Extended (data driven) tests.
   3230 //         See file test/testdata/regextst.txt for a description of the
   3231 //         pattern and inputString fields, and the allowed flags.
   3232 //         lineNumber is the source line in regextst.txt of the test.
   3233 //
   3234 //---------------------------------------------------------------------------
   3235 
   3236 
   3237 //  Set a value into a UVector at position specified by a decimal number in
   3238 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3239 //   which follows.
   3240 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3241     UErrorCode  status=U_ZERO_ERROR;
   3242     int32_t  idx = 0;
   3243     for (int32_t i=0; i<index.length(); i++) {
   3244         int32_t d=u_charDigitValue(index.charAt(i));
   3245         if (d<0) {return;}
   3246         idx = idx*10 + d;
   3247     }
   3248     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3249     vec.setElementAt(val, idx);
   3250 }
   3251 
   3252 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3253     UErrorCode  status=U_ZERO_ERROR;
   3254     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3255     vec.setElementAt(val, idx);
   3256 }
   3257 
   3258 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3259 {
   3260     UBool couldFind = TRUE;
   3261     UTEXT_SETNATIVEINDEX(utext, 0);
   3262     int32_t i = 0;
   3263     while (i < unistrOffset) {
   3264         UChar32 c = UTEXT_NEXT32(utext);
   3265         if (c != U_SENTINEL) {
   3266             i += U16_LENGTH(c);
   3267         } else {
   3268             couldFind = FALSE;
   3269             break;
   3270         }
   3271     }
   3272     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3273     return couldFind;
   3274 }
   3275 
   3276 
   3277 void RegexTest::regex_find(const UnicodeString &pattern,
   3278                            const UnicodeString &flags,
   3279                            const UnicodeString &inputString,
   3280                            const char *srcPath,
   3281                            int32_t line) {
   3282     UnicodeString       unEscapedInput;
   3283     UnicodeString       deTaggedInput;
   3284 
   3285     int32_t             patternUTF8Length,      inputUTF8Length;
   3286     char                *patternChars  = NULL, *inputChars = NULL;
   3287     UText               patternText    = UTEXT_INITIALIZER;
   3288     UText               inputText      = UTEXT_INITIALIZER;
   3289     UConverter          *UTF8Converter = NULL;
   3290 
   3291     UErrorCode          status         = U_ZERO_ERROR;
   3292     UParseError         pe;
   3293     RegexPattern        *parsePat      = NULL;
   3294     RegexMatcher        *parseMatcher  = NULL;
   3295     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3296     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3297     UVector             groupStarts(status);
   3298     UVector             groupEnds(status);
   3299     UVector             groupStartsUTF8(status);
   3300     UVector             groupEndsUTF8(status);
   3301     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3302     UBool               failed         = FALSE;
   3303     int32_t             numFinds;
   3304     int32_t             i;
   3305     UBool               useMatchesFunc   = FALSE;
   3306     UBool               useLookingAtFunc = FALSE;
   3307     int32_t             regionStart      = -1;
   3308     int32_t             regionEnd        = -1;
   3309     int32_t             regionStartUTF8  = -1;
   3310     int32_t             regionEndUTF8    = -1;
   3311 
   3312 
   3313     //
   3314     //  Compile the caller's pattern
   3315     //
   3316     uint32_t bflags = 0;
   3317     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3318         bflags |= UREGEX_CASE_INSENSITIVE;
   3319     }
   3320     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3321         bflags |= UREGEX_COMMENTS;
   3322     }
   3323     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3324         bflags |= UREGEX_DOTALL;
   3325     }
   3326     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3327         bflags |= UREGEX_MULTILINE;
   3328     }
   3329 
   3330     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3331         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3332     }
   3333     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3334         bflags |= UREGEX_UNIX_LINES;
   3335     }
   3336     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3337         bflags |= UREGEX_LITERAL;
   3338     }
   3339 
   3340 
   3341     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3342     if (status != U_ZERO_ERROR) {
   3343         #if UCONFIG_NO_BREAK_ITERATION==1
   3344         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3345         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3346         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3347             goto cleanupAndReturn;
   3348         }
   3349         #endif
   3350         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3351             // Expected pattern compilation error.
   3352             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3353                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3354             }
   3355             goto cleanupAndReturn;
   3356         } else {
   3357             // Unexpected pattern compilation error.
   3358             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3359             goto cleanupAndReturn;
   3360         }
   3361     }
   3362 
   3363     UTF8Converter = ucnv_open("UTF8", &status);
   3364     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3365 
   3366     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3367     status = U_ZERO_ERROR; // buffer overflow
   3368     patternChars = new char[patternUTF8Length+1];
   3369     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3370     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3371 
   3372     if (status == U_ZERO_ERROR) {
   3373         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3374 
   3375         if (status != U_ZERO_ERROR) {
   3376 #if UCONFIG_NO_BREAK_ITERATION==1
   3377             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3378             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3379             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3380                 goto cleanupAndReturn;
   3381             }
   3382 #endif
   3383             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3384                 // Expected pattern compilation error.
   3385                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3386                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3387                 }
   3388                 goto cleanupAndReturn;
   3389             } else {
   3390                 // Unexpected pattern compilation error.
   3391                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3392                 goto cleanupAndReturn;
   3393             }
   3394         }
   3395     }
   3396 
   3397     if (UTF8Pattern == NULL) {
   3398         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3399         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3400         status = U_ZERO_ERROR;
   3401     }
   3402 
   3403     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3404         RegexPatternDump(callerPattern);
   3405     }
   3406 
   3407     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3408         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3409         goto cleanupAndReturn;
   3410     }
   3411 
   3412 
   3413     //
   3414     // Number of times find() should be called on the test string, default to 1
   3415     //
   3416     numFinds = 1;
   3417     for (i=2; i<=9; i++) {
   3418         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3419             if (numFinds != 1) {
   3420                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3421                 goto cleanupAndReturn;
   3422             }
   3423             numFinds = i;
   3424         }
   3425     }
   3426 
   3427     // 'M' flag.  Use matches() instead of find()
   3428     if (flags.indexOf((UChar)0x4d) >= 0) {
   3429         useMatchesFunc = TRUE;
   3430     }
   3431     if (flags.indexOf((UChar)0x4c) >= 0) {
   3432         useLookingAtFunc = TRUE;
   3433     }
   3434 
   3435     //
   3436     //  Find the tags in the input data, remove them, and record the group boundary
   3437     //    positions.
   3438     //
   3439     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3440     REGEX_CHECK_STATUS_L(line);
   3441 
   3442     unEscapedInput = inputString.unescape();
   3443     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3444     REGEX_CHECK_STATUS_L(line);
   3445     while(parseMatcher->find()) {
   3446         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3447         REGEX_CHECK_STATUS;
   3448         UnicodeString groupNum = parseMatcher->group(2, status);
   3449         if (groupNum == "r") {
   3450             // <r> or </r>, a region specification within the string
   3451             if (parseMatcher->group(1, status) == "/") {
   3452                 regionEnd = deTaggedInput.length();
   3453             } else {
   3454                 regionStart = deTaggedInput.length();
   3455             }
   3456         } else {
   3457             // <digits> or </digits>, a group match boundary tag.
   3458             if (parseMatcher->group(1, status) == "/") {
   3459                 set(groupEnds, deTaggedInput.length(), groupNum);
   3460             } else {
   3461                 set(groupStarts, deTaggedInput.length(), groupNum);
   3462             }
   3463         }
   3464     }
   3465     parseMatcher->appendTail(deTaggedInput);
   3466     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3467     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3468       errln("mismatched <r> tags");
   3469       failed = TRUE;
   3470       goto cleanupAndReturn;
   3471     }
   3472 
   3473     //
   3474     //  Configure the matcher according to the flags specified with this test.
   3475     //
   3476     matcher = callerPattern->matcher(deTaggedInput, status);
   3477     REGEX_CHECK_STATUS_L(line);
   3478     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3479         matcher->setTrace(TRUE);
   3480     }
   3481 
   3482     if (UTF8Pattern != NULL) {
   3483         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3484         status = U_ZERO_ERROR; // buffer overflow
   3485         inputChars = new char[inputUTF8Length+1];
   3486         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3487         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3488 
   3489         if (status == U_ZERO_ERROR) {
   3490             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3491             REGEX_CHECK_STATUS_L(line);
   3492         }
   3493 
   3494         if (UTF8Matcher == NULL) {
   3495             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3496           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3497             status = U_ZERO_ERROR;
   3498         }
   3499     }
   3500 
   3501     //
   3502     //  Generate native indices for UTF8 versions of region and capture group info
   3503     //
   3504     if (UTF8Matcher != NULL) {
   3505         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3506         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3507 
   3508         //  Fill out the native index UVector info.
   3509         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3510         for (i=0; i<groupStarts.size(); i++) {
   3511             int32_t  start = groupStarts.elementAti(i);
   3512             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3513             if (start >= 0) {
   3514                 int32_t  startUTF8;
   3515                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3516                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3517                     failed = TRUE;
   3518                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3519                 }
   3520                 setInt(groupStartsUTF8, startUTF8, i);
   3521             }
   3522 
   3523             int32_t  end = groupEnds.elementAti(i);
   3524             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3525             if (end >= 0) {
   3526                 int32_t  endUTF8;
   3527                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3528                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3529                     failed = TRUE;
   3530                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3531                 }
   3532                 setInt(groupEndsUTF8, endUTF8, i);
   3533             }
   3534         }
   3535     }
   3536 
   3537     if (regionStart>=0) {
   3538        matcher->region(regionStart, regionEnd, status);
   3539        REGEX_CHECK_STATUS_L(line);
   3540        if (UTF8Matcher != NULL) {
   3541            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3542            REGEX_CHECK_STATUS_L(line);
   3543        }
   3544     }
   3545     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3546         matcher->useAnchoringBounds(FALSE);
   3547         if (UTF8Matcher != NULL) {
   3548             UTF8Matcher->useAnchoringBounds(FALSE);
   3549         }
   3550     }
   3551     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3552         matcher->useTransparentBounds(TRUE);
   3553         if (UTF8Matcher != NULL) {
   3554             UTF8Matcher->useTransparentBounds(TRUE);
   3555         }
   3556     }
   3557 
   3558 
   3559 
   3560     //
   3561     // Do a find on the de-tagged input using the caller's pattern
   3562     //     TODO: error on count>1 and not find().
   3563     //           error on both matches() and lookingAt().
   3564     //
   3565     for (i=0; i<numFinds; i++) {
   3566         if (useMatchesFunc) {
   3567             isMatch = matcher->matches(status);
   3568             if (UTF8Matcher != NULL) {
   3569                isUTF8Match = UTF8Matcher->matches(status);
   3570             }
   3571         } else  if (useLookingAtFunc) {
   3572             isMatch = matcher->lookingAt(status);
   3573             if (UTF8Matcher != NULL) {
   3574                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3575             }
   3576         } else {
   3577             isMatch = matcher->find();
   3578             if (UTF8Matcher != NULL) {
   3579                 isUTF8Match = UTF8Matcher->find();
   3580             }
   3581         }
   3582     }
   3583     matcher->setTrace(FALSE);
   3584     if (U_FAILURE(status)) {
   3585         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3586     }
   3587 
   3588     //
   3589     // Match up the groups from the find() with the groups from the tags
   3590     //
   3591 
   3592     // number of tags should match number of groups from find operation.
   3593     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3594     //   G option in test means that capture group data is not available in the
   3595     //     expected results, so the check needs to be suppressed.
   3596     if (isMatch == FALSE && groupStarts.size() != 0) {
   3597         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3598         failed = TRUE;
   3599         goto cleanupAndReturn;
   3600     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3601         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3602         failed = TRUE;
   3603         goto cleanupAndReturn;
   3604     }
   3605 
   3606     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3607         // Only check for match / no match.  Don't check capture groups.
   3608         if (isMatch && groupStarts.size() == 0) {
   3609             errln("Error at line %d:  No match expected, but one found.", line);
   3610             failed = TRUE;
   3611         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3612             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3613             failed = TRUE;
   3614         }
   3615         goto cleanupAndReturn;
   3616     }
   3617 
   3618     REGEX_CHECK_STATUS_L(line);
   3619     for (i=0; i<=matcher->groupCount(); i++) {
   3620         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3621         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3622         if (matcher->start(i, status) != expectedStart) {
   3623             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3624                 line, i, expectedStart, matcher->start(i, status));
   3625             failed = TRUE;
   3626             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3627         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3628             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3629                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3630             failed = TRUE;
   3631             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3632         }
   3633 
   3634         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3635         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3636         if (matcher->end(i, status) != expectedEnd) {
   3637             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3638                 line, i, expectedEnd, matcher->end(i, status));
   3639             failed = TRUE;
   3640             // Error on end position;  keep going; real error is probably yet to come as group
   3641             //   end positions work from end of the input data towards the front.
   3642         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3643             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3644                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3645             failed = TRUE;
   3646             // Error on end position;  keep going; real error is probably yet to come as group
   3647             //   end positions work from end of the input data towards the front.
   3648         }
   3649     }
   3650     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3651         errln("Error at line %d: Expected %d capture groups, found %d.",
   3652             line, groupStarts.size()-1, matcher->groupCount());
   3653         failed = TRUE;
   3654         }
   3655     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3656         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3657               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3658         failed = TRUE;
   3659     }
   3660 
   3661     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3662         matcher->requireEnd() == TRUE) {
   3663         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3664         failed = TRUE;
   3665     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3666         UTF8Matcher->requireEnd() == TRUE) {
   3667         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3668         failed = TRUE;
   3669     }
   3670 
   3671     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3672         matcher->requireEnd() == FALSE) {
   3673         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3674         failed = TRUE;
   3675     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3676         UTF8Matcher->requireEnd() == FALSE) {
   3677         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3678         failed = TRUE;
   3679     }
   3680 
   3681     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3682         matcher->hitEnd() == TRUE) {
   3683         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3684         failed = TRUE;
   3685     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3686                UTF8Matcher->hitEnd() == TRUE) {
   3687         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3688         failed = TRUE;
   3689     }
   3690 
   3691     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3692         matcher->hitEnd() == FALSE) {
   3693         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3694         failed = TRUE;
   3695     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3696                UTF8Matcher->hitEnd() == FALSE) {
   3697         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3698         failed = TRUE;
   3699     }
   3700 
   3701 
   3702 cleanupAndReturn:
   3703     if (failed) {
   3704         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3705             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3706         // callerPattern->dump();
   3707     }
   3708     delete parseMatcher;
   3709     delete parsePat;
   3710     delete UTF8Matcher;
   3711     delete UTF8Pattern;
   3712     delete matcher;
   3713     delete callerPattern;
   3714 
   3715     utext_close(&inputText);
   3716     delete[] inputChars;
   3717     utext_close(&patternText);
   3718     delete[] patternChars;
   3719     ucnv_close(UTF8Converter);
   3720 }
   3721 
   3722 
   3723 
   3724 
   3725 //---------------------------------------------------------------------------
   3726 //
   3727 //      Errors     Check for error handling in patterns.
   3728 //
   3729 //---------------------------------------------------------------------------
   3730 void RegexTest::Errors() {
   3731     // \escape sequences that aren't implemented yet.
   3732     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3733 
   3734     // Missing close parentheses
   3735     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3736     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3737     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3738 
   3739     // Extra close paren
   3740     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3741     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3742     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3743 
   3744     // Look-ahead, Look-behind
   3745     //  TODO:  add tests for unbounded length look-behinds.
   3746     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3747 
   3748     // Attempt to use non-default flags
   3749     {
   3750         UParseError   pe;
   3751         UErrorCode    status = U_ZERO_ERROR;
   3752         int32_t       flags  = UREGEX_CANON_EQ |
   3753                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3754                                UREGEX_MULTILINE;
   3755         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3756         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3757         delete pat1;
   3758     }
   3759 
   3760 
   3761     // Quantifiers are allowed only after something that can be quantified.
   3762     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3763     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3764     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3765 
   3766     // Mal-formed {min,max} quantifiers
   3767     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3768     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3769     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3770     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3771     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3772     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3773     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3774     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3775     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3776 
   3777     // Ticket 5389
   3778     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3779 
   3780     // Invalid Back Reference \0
   3781     //    For ICU 3.8 and earlier
   3782     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3783     //
   3784     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3785 
   3786 }
   3787 
   3788 
   3789 //-------------------------------------------------------------------------------
   3790 //
   3791 //  Read a text data file, convert it to UChars, and return the data
   3792 //    in one big UChar * buffer, which the caller must delete.
   3793 //
   3794 //--------------------------------------------------------------------------------
   3795 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3796                                      const char *defEncoding, UErrorCode &status) {
   3797     UChar       *retPtr  = NULL;
   3798     char        *fileBuf = NULL;
   3799     UConverter* conv     = NULL;
   3800     FILE        *f       = NULL;
   3801 
   3802     ulen = 0;
   3803     if (U_FAILURE(status)) {
   3804         return retPtr;
   3805     }
   3806 
   3807     //
   3808     //  Open the file.
   3809     //
   3810     f = fopen(fileName, "rb");
   3811     if (f == 0) {
   3812         dataerrln("Error opening test data file %s\n", fileName);
   3813         status = U_FILE_ACCESS_ERROR;
   3814         return NULL;
   3815     }
   3816     //
   3817     //  Read it in
   3818     //
   3819     int32_t            fileSize;
   3820     int32_t            amt_read;
   3821 
   3822     fseek( f, 0, SEEK_END);
   3823     fileSize = ftell(f);
   3824     fileBuf = new char[fileSize];
   3825     fseek(f, 0, SEEK_SET);
   3826     amt_read = fread(fileBuf, 1, fileSize, f);
   3827     if (amt_read != fileSize || fileSize <= 0) {
   3828         errln("Error reading test data file.");
   3829         goto cleanUpAndReturn;
   3830     }
   3831 
   3832     //
   3833     // Look for a Unicode Signature (BOM) on the data just read
   3834     //
   3835     int32_t        signatureLength;
   3836     const char *   fileBufC;
   3837     const char*    encoding;
   3838 
   3839     fileBufC = fileBuf;
   3840     encoding = ucnv_detectUnicodeSignature(
   3841         fileBuf, fileSize, &signatureLength, &status);
   3842     if(encoding!=NULL ){
   3843         fileBufC  += signatureLength;
   3844         fileSize  -= signatureLength;
   3845     } else {
   3846         encoding = defEncoding;
   3847         if (strcmp(encoding, "utf-8") == 0) {
   3848             errln("file %s is missing its BOM", fileName);
   3849         }
   3850     }
   3851 
   3852     //
   3853     // Open a converter to take the rule file to UTF-16
   3854     //
   3855     conv = ucnv_open(encoding, &status);
   3856     if (U_FAILURE(status)) {
   3857         goto cleanUpAndReturn;
   3858     }
   3859 
   3860     //
   3861     // Convert the rules to UChar.
   3862     //  Preflight first to determine required buffer size.
   3863     //
   3864     ulen = ucnv_toUChars(conv,
   3865         NULL,           //  dest,
   3866         0,              //  destCapacity,
   3867         fileBufC,
   3868         fileSize,
   3869         &status);
   3870     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3871         // Buffer Overflow is expected from the preflight operation.
   3872         status = U_ZERO_ERROR;
   3873 
   3874         retPtr = new UChar[ulen+1];
   3875         ucnv_toUChars(conv,
   3876             retPtr,       //  dest,
   3877             ulen+1,
   3878             fileBufC,
   3879             fileSize,
   3880             &status);
   3881     }
   3882 
   3883 cleanUpAndReturn:
   3884     fclose(f);
   3885     delete[] fileBuf;
   3886     ucnv_close(conv);
   3887     if (U_FAILURE(status)) {
   3888         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3889         delete []retPtr;
   3890         retPtr = 0;
   3891         ulen   = 0;
   3892     };
   3893     return retPtr;
   3894 }
   3895 
   3896 
   3897 //-------------------------------------------------------------------------------
   3898 //
   3899 //   PerlTests  - Run Perl's regular expression tests
   3900 //                The input file for this test is re_tests, the standard regular
   3901 //                expression test data distributed with the Perl source code.
   3902 //
   3903 //                Here is Perl's description of the test data file:
   3904 //
   3905 //        # The tests are in a separate file 't/op/re_tests'.
   3906 //        # Each line in that file is a separate test.
   3907 //        # There are five columns, separated by tabs.
   3908 //        #
   3909 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3910 //        # Modifiers can be put after the closing C<'>.
   3911 //        #
   3912 //        # Column 2 contains the string to be matched.
   3913 //        #
   3914 //        # Column 3 contains the expected result:
   3915 //        #     y   expect a match
   3916 //        #     n   expect no match
   3917 //        #     c   expect an error
   3918 //        # B   test exposes a known bug in Perl, should be skipped
   3919 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3920 //        #
   3921 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3922 //        #
   3923 //        # Column 4 contains a string, usually C<$&>.
   3924 //        #
   3925 //        # Column 5 contains the expected result of double-quote
   3926 //        # interpolating that string after the match, or start of error message.
   3927 //        #
   3928 //        # Column 6, if present, contains a reason why the test is skipped.
   3929 //        # This is printed with "skipped", for harness to pick up.
   3930 //        #
   3931 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3932 //        #
   3933 //        # If you want to add a regular expression test that can't be expressed
   3934 //        # in this format, don't add it here: put it in op/pat.t instead.
   3935 //
   3936 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   3937 //        The test exposes some known incompatibility between ICU and Perl regexps.
   3938 //        (The i is in addition to whatever was there before.)
   3939 //
   3940 //-------------------------------------------------------------------------------
   3941 void RegexTest::PerlTests() {
   3942     char tdd[2048];
   3943     const char *srcPath;
   3944     UErrorCode  status = U_ZERO_ERROR;
   3945     UParseError pe;
   3946 
   3947     //
   3948     //  Open and read the test data file.
   3949     //
   3950     srcPath=getPath(tdd, "re_tests.txt");
   3951     if(srcPath==NULL) {
   3952         return; /* something went wrong, error already output */
   3953     }
   3954 
   3955     int32_t    len;
   3956     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3957     if (U_FAILURE(status)) {
   3958         return; /* something went wrong, error already output */
   3959     }
   3960 
   3961     //
   3962     //  Put the test data into a UnicodeString
   3963     //
   3964     UnicodeString testDataString(FALSE, testData, len);
   3965 
   3966     //
   3967     //  Regex to break the input file into lines, and strip the new lines.
   3968     //     One line per match, capture group one is the desired data.
   3969     //
   3970     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3971     if (U_FAILURE(status)) {
   3972         dataerrln("RegexPattern::compile() error");
   3973         return;
   3974     }
   3975     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3976 
   3977     //
   3978     //  Regex to split a test file line into fields.
   3979     //    There are six fields, separated by tabs.
   3980     //
   3981     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3982 
   3983     //
   3984     //  Regex to identify test patterns with flag settings, and to separate them.
   3985     //    Test patterns with flags look like 'pattern'i
   3986     //    Test patterns without flags are not quoted:   pattern
   3987     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3988     //
   3989     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3990     RegexMatcher* flagMat = flagPat->matcher(status);
   3991 
   3992     //
   3993     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3994     //   in the test data.  Not being perl, this must be done explicitly.  Here
   3995     //   are string constants and REs for these constructs.
   3996     //
   3997     UnicodeString nulnulSrc("${nulnul}");
   3998     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   3999     nulnul = nulnul.unescape();
   4000 
   4001     UnicodeString ffffSrc("${ffff}");
   4002     UnicodeString ffff("\\uffff", -1, US_INV);
   4003     ffff = ffff.unescape();
   4004 
   4005     //  regexp for $-[0], $+[2], etc.
   4006     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4007     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4008 
   4009     //  regexp for $0, $1, $2, etc.
   4010     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4011     RegexMatcher *cgMat = cgPat->matcher(status);
   4012 
   4013 
   4014     //
   4015     // Main Loop for the Perl Tests, runs once per line from the
   4016     //   test data file.
   4017     //
   4018     int32_t  lineNum = 0;
   4019     int32_t  skippedUnimplementedCount = 0;
   4020     while (lineMat->find()) {
   4021         lineNum++;
   4022 
   4023         //
   4024         //  Get a line, break it into its fields, do the Perl
   4025         //    variable substitutions.
   4026         //
   4027         UnicodeString line = lineMat->group(1, status);
   4028         UnicodeString fields[7];
   4029         fieldPat->split(line, fields, 7, status);
   4030 
   4031         flagMat->reset(fields[0]);
   4032         flagMat->matches(status);
   4033         UnicodeString pattern  = flagMat->group(2, status);
   4034         pattern.findAndReplace("${bang}", "!");
   4035         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4036         pattern.findAndReplace(ffffSrc, ffff);
   4037 
   4038         //
   4039         //  Identify patterns that include match flag settings,
   4040         //    split off the flags, remove the extra quotes.
   4041         //
   4042         UnicodeString flagStr = flagMat->group(3, status);
   4043         if (U_FAILURE(status)) {
   4044             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4045             return;
   4046         }
   4047         int32_t flags = 0;
   4048         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4049         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4050         const UChar UChar_m = 0x6d;
   4051         const UChar UChar_x = 0x78;
   4052         const UChar UChar_y = 0x79;
   4053         if (flagStr.indexOf(UChar_i) != -1) {
   4054             flags |= UREGEX_CASE_INSENSITIVE;
   4055         }
   4056         if (flagStr.indexOf(UChar_m) != -1) {
   4057             flags |= UREGEX_MULTILINE;
   4058         }
   4059         if (flagStr.indexOf(UChar_x) != -1) {
   4060             flags |= UREGEX_COMMENTS;
   4061         }
   4062 
   4063         //
   4064         // Compile the test pattern.
   4065         //
   4066         status = U_ZERO_ERROR;
   4067         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4068         if (status == U_REGEX_UNIMPLEMENTED) {
   4069             //
   4070             // Test of a feature that is planned for ICU, but not yet implemented.
   4071             //   skip the test.
   4072             skippedUnimplementedCount++;
   4073             delete testPat;
   4074             status = U_ZERO_ERROR;
   4075             continue;
   4076         }
   4077 
   4078         if (U_FAILURE(status)) {
   4079             // Some tests are supposed to generate errors.
   4080             //   Only report an error for tests that are supposed to succeed.
   4081             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4082                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4083             {
   4084                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4085             }
   4086             status = U_ZERO_ERROR;
   4087             delete testPat;
   4088             continue;
   4089         }
   4090 
   4091         if (fields[2].indexOf(UChar_i) >= 0) {
   4092             // ICU should skip this test.
   4093             delete testPat;
   4094             continue;
   4095         }
   4096 
   4097         if (fields[2].indexOf(UChar_c) >= 0) {
   4098             // This pattern should have caused a compilation error, but didn't/
   4099             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4100             delete testPat;
   4101             continue;
   4102         }
   4103 
   4104         //
   4105         // replace the Perl variables that appear in some of the
   4106         //   match data strings.
   4107         //
   4108         UnicodeString matchString = fields[1];
   4109         matchString.findAndReplace(nulnulSrc, nulnul);
   4110         matchString.findAndReplace(ffffSrc,   ffff);
   4111 
   4112         // Replace any \n in the match string with an actual new-line char.
   4113         //  Don't do full unescape, as this unescapes more than Perl does, which
   4114         //  causes other spurious failures in the tests.
   4115         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4116 
   4117 
   4118 
   4119         //
   4120         // Run the test, check for expected match/don't match result.
   4121         //
   4122         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4123         UBool found = testMat->find();
   4124         UBool expected = FALSE;
   4125         if (fields[2].indexOf(UChar_y) >=0) {
   4126             expected = TRUE;
   4127         }
   4128         if (expected != found) {
   4129             errln("line %d: Expected %smatch, got %smatch",
   4130                 lineNum, expected?"":"no ", found?"":"no " );
   4131             continue;
   4132         }
   4133 
   4134         // Don't try to check expected results if there is no match.
   4135         //   (Some have stuff in the expected fields)
   4136         if (!found) {
   4137             delete testMat;
   4138             delete testPat;
   4139             continue;
   4140         }
   4141 
   4142         //
   4143         // Interpret the Perl expression from the fourth field of the data file,
   4144         // building up an ICU string from the results of the ICU match.
   4145         //   The Perl expression will contain references to the results of
   4146         //     a regex match, including the matched string, capture group strings,
   4147         //     group starting and ending indicies, etc.
   4148         //
   4149         UnicodeString resultString;
   4150         UnicodeString perlExpr = fields[3];
   4151 #if SUPPORT_MUTATING_INPUT_STRING
   4152         groupsMat->reset(perlExpr);
   4153         cgMat->reset(perlExpr);
   4154 #endif
   4155 
   4156         while (perlExpr.length() > 0) {
   4157 #if !SUPPORT_MUTATING_INPUT_STRING
   4158             //  Perferred usage.  Reset after any modification to input string.
   4159             groupsMat->reset(perlExpr);
   4160             cgMat->reset(perlExpr);
   4161 #endif
   4162 
   4163             if (perlExpr.startsWith("$&")) {
   4164                 resultString.append(testMat->group(status));
   4165                 perlExpr.remove(0, 2);
   4166             }
   4167 
   4168             else if (groupsMat->lookingAt(status)) {
   4169                 // $-[0]   $+[2]  etc.
   4170                 UnicodeString digitString = groupsMat->group(2, status);
   4171                 int32_t t = 0;
   4172                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4173                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4174                 int32_t matchPosition;
   4175                 if (plusOrMinus.compare("+") == 0) {
   4176                     matchPosition = testMat->end(groupNum, status);
   4177                 } else {
   4178                     matchPosition = testMat->start(groupNum, status);
   4179                 }
   4180                 if (matchPosition != -1) {
   4181                     ICU_Utility::appendNumber(resultString, matchPosition);
   4182                 }
   4183                 perlExpr.remove(0, groupsMat->end(status));
   4184             }
   4185 
   4186             else if (cgMat->lookingAt(status)) {
   4187                 // $1, $2, $3, etc.
   4188                 UnicodeString digitString = cgMat->group(1, status);
   4189                 int32_t t = 0;
   4190                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4191                 if (U_SUCCESS(status)) {
   4192                     resultString.append(testMat->group(groupNum, status));
   4193                     status = U_ZERO_ERROR;
   4194                 }
   4195                 perlExpr.remove(0, cgMat->end(status));
   4196             }
   4197 
   4198             else if (perlExpr.startsWith("@-")) {
   4199                 int32_t i;
   4200                 for (i=0; i<=testMat->groupCount(); i++) {
   4201                     if (i>0) {
   4202                         resultString.append(" ");
   4203                     }
   4204                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4205                 }
   4206                 perlExpr.remove(0, 2);
   4207             }
   4208 
   4209             else if (perlExpr.startsWith("@+")) {
   4210                 int32_t i;
   4211                 for (i=0; i<=testMat->groupCount(); i++) {
   4212                     if (i>0) {
   4213                         resultString.append(" ");
   4214                     }
   4215                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4216                 }
   4217                 perlExpr.remove(0, 2);
   4218             }
   4219 
   4220             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4221                                                      //           or as an escaped sequence (e.g. \n)
   4222                 if (perlExpr.length() > 1) {
   4223                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4224                 }
   4225                 UChar c = perlExpr.charAt(0);
   4226                 switch (c) {
   4227                 case 'n':   c = '\n'; break;
   4228                 // add any other escape sequences that show up in the test expected results.
   4229                 }
   4230                 resultString.append(c);
   4231                 perlExpr.remove(0, 1);
   4232             }
   4233 
   4234             else  {
   4235                 // Any characters from the perl expression that we don't explicitly
   4236                 //  recognize before here are assumed to be literals and copied
   4237                 //  as-is to the expected results.
   4238                 resultString.append(perlExpr.charAt(0));
   4239                 perlExpr.remove(0, 1);
   4240             }
   4241 
   4242             if (U_FAILURE(status)) {
   4243                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4244                 break;
   4245             }
   4246         }
   4247 
   4248         //
   4249         // Expected Results Compare
   4250         //
   4251         UnicodeString expectedS(fields[4]);
   4252         expectedS.findAndReplace(nulnulSrc, nulnul);
   4253         expectedS.findAndReplace(ffffSrc,   ffff);
   4254         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4255 
   4256 
   4257         if (expectedS.compare(resultString) != 0) {
   4258             err("Line %d: Incorrect perl expression results.", lineNum);
   4259             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4260         }
   4261 
   4262         delete testMat;
   4263         delete testPat;
   4264     }
   4265 
   4266     //
   4267     // All done.  Clean up allocated stuff.
   4268     //
   4269     delete cgMat;
   4270     delete cgPat;
   4271 
   4272     delete groupsMat;
   4273     delete groupsPat;
   4274 
   4275     delete flagMat;
   4276     delete flagPat;
   4277 
   4278     delete lineMat;
   4279     delete linePat;
   4280 
   4281     delete fieldPat;
   4282     delete [] testData;
   4283 
   4284 
   4285     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4286 
   4287 }
   4288 
   4289 
   4290 //-------------------------------------------------------------------------------
   4291 //
   4292 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4293 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4294 //                  The input file for this test is re_tests, the standard regular
   4295 //                  expression test data distributed with the Perl source code.
   4296 //                  See PerlTests() for more information.
   4297 //
   4298 //-------------------------------------------------------------------------------
   4299 void RegexTest::PerlTestsUTF8() {
   4300     char tdd[2048];
   4301     const char *srcPath;
   4302     UErrorCode  status = U_ZERO_ERROR;
   4303     UParseError pe;
   4304     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4305     UText       patternText = UTEXT_INITIALIZER;
   4306     char       *patternChars = NULL;
   4307     int32_t     patternLength;
   4308     int32_t     patternCapacity = 0;
   4309     UText       inputText = UTEXT_INITIALIZER;
   4310     char       *inputChars = NULL;
   4311     int32_t     inputLength;
   4312     int32_t     inputCapacity = 0;
   4313 
   4314     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4315 
   4316     //
   4317     //  Open and read the test data file.
   4318     //
   4319     srcPath=getPath(tdd, "re_tests.txt");
   4320     if(srcPath==NULL) {
   4321         return; /* something went wrong, error already output */
   4322     }
   4323 
   4324     int32_t    len;
   4325     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4326     if (U_FAILURE(status)) {
   4327         return; /* something went wrong, error already output */
   4328     }
   4329 
   4330     //
   4331     //  Put the test data into a UnicodeString
   4332     //
   4333     UnicodeString testDataString(FALSE, testData, len);
   4334 
   4335     //
   4336     //  Regex to break the input file into lines, and strip the new lines.
   4337     //     One line per match, capture group one is the desired data.
   4338     //
   4339     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4340     if (U_FAILURE(status)) {
   4341         dataerrln("RegexPattern::compile() error");
   4342         return;
   4343     }
   4344     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4345 
   4346     //
   4347     //  Regex to split a test file line into fields.
   4348     //    There are six fields, separated by tabs.
   4349     //
   4350     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4351 
   4352     //
   4353     //  Regex to identify test patterns with flag settings, and to separate them.
   4354     //    Test patterns with flags look like 'pattern'i
   4355     //    Test patterns without flags are not quoted:   pattern
   4356     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4357     //
   4358     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4359     RegexMatcher* flagMat = flagPat->matcher(status);
   4360 
   4361     //
   4362     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4363     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4364     //   are string constants and REs for these constructs.
   4365     //
   4366     UnicodeString nulnulSrc("${nulnul}");
   4367     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4368     nulnul = nulnul.unescape();
   4369 
   4370     UnicodeString ffffSrc("${ffff}");
   4371     UnicodeString ffff("\\uffff", -1, US_INV);
   4372     ffff = ffff.unescape();
   4373 
   4374     //  regexp for $-[0], $+[2], etc.
   4375     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4376     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4377 
   4378     //  regexp for $0, $1, $2, etc.
   4379     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4380     RegexMatcher *cgMat = cgPat->matcher(status);
   4381 
   4382 
   4383     //
   4384     // Main Loop for the Perl Tests, runs once per line from the
   4385     //   test data file.
   4386     //
   4387     int32_t  lineNum = 0;
   4388     int32_t  skippedUnimplementedCount = 0;
   4389     while (lineMat->find()) {
   4390         lineNum++;
   4391 
   4392         //
   4393         //  Get a line, break it into its fields, do the Perl
   4394         //    variable substitutions.
   4395         //
   4396         UnicodeString line = lineMat->group(1, status);
   4397         UnicodeString fields[7];
   4398         fieldPat->split(line, fields, 7, status);
   4399 
   4400         flagMat->reset(fields[0]);
   4401         flagMat->matches(status);
   4402         UnicodeString pattern  = flagMat->group(2, status);
   4403         pattern.findAndReplace("${bang}", "!");
   4404         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4405         pattern.findAndReplace(ffffSrc, ffff);
   4406 
   4407         //
   4408         //  Identify patterns that include match flag settings,
   4409         //    split off the flags, remove the extra quotes.
   4410         //
   4411         UnicodeString flagStr = flagMat->group(3, status);
   4412         if (U_FAILURE(status)) {
   4413             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4414             return;
   4415         }
   4416         int32_t flags = 0;
   4417         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4418         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4419         const UChar UChar_m = 0x6d;
   4420         const UChar UChar_x = 0x78;
   4421         const UChar UChar_y = 0x79;
   4422         if (flagStr.indexOf(UChar_i) != -1) {
   4423             flags |= UREGEX_CASE_INSENSITIVE;
   4424         }
   4425         if (flagStr.indexOf(UChar_m) != -1) {
   4426             flags |= UREGEX_MULTILINE;
   4427         }
   4428         if (flagStr.indexOf(UChar_x) != -1) {
   4429             flags |= UREGEX_COMMENTS;
   4430         }
   4431 
   4432         //
   4433         // Put the pattern in a UTF-8 UText
   4434         //
   4435         status = U_ZERO_ERROR;
   4436         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4437         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4438             status = U_ZERO_ERROR;
   4439             delete[] patternChars;
   4440             patternCapacity = patternLength + 1;
   4441             patternChars = new char[patternCapacity];
   4442             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4443         }
   4444         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4445 
   4446         //
   4447         // Compile the test pattern.
   4448         //
   4449         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4450         if (status == U_REGEX_UNIMPLEMENTED) {
   4451             //
   4452             // Test of a feature that is planned for ICU, but not yet implemented.
   4453             //   skip the test.
   4454             skippedUnimplementedCount++;
   4455             delete testPat;
   4456             status = U_ZERO_ERROR;
   4457             continue;
   4458         }
   4459 
   4460         if (U_FAILURE(status)) {
   4461             // Some tests are supposed to generate errors.
   4462             //   Only report an error for tests that are supposed to succeed.
   4463             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4464                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4465             {
   4466                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4467             }
   4468             status = U_ZERO_ERROR;
   4469             delete testPat;
   4470             continue;
   4471         }
   4472 
   4473         if (fields[2].indexOf(UChar_i) >= 0) {
   4474             // ICU should skip this test.
   4475             delete testPat;
   4476             continue;
   4477         }
   4478 
   4479         if (fields[2].indexOf(UChar_c) >= 0) {
   4480             // This pattern should have caused a compilation error, but didn't/
   4481             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4482             delete testPat;
   4483             continue;
   4484         }
   4485 
   4486 
   4487         //
   4488         // replace the Perl variables that appear in some of the
   4489         //   match data strings.
   4490         //
   4491         UnicodeString matchString = fields[1];
   4492         matchString.findAndReplace(nulnulSrc, nulnul);
   4493         matchString.findAndReplace(ffffSrc,   ffff);
   4494 
   4495         // Replace any \n in the match string with an actual new-line char.
   4496         //  Don't do full unescape, as this unescapes more than Perl does, which
   4497         //  causes other spurious failures in the tests.
   4498         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4499 
   4500         //
   4501         // Put the input in a UTF-8 UText
   4502         //
   4503         status = U_ZERO_ERROR;
   4504         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4505         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4506             status = U_ZERO_ERROR;
   4507             delete[] inputChars;
   4508             inputCapacity = inputLength + 1;
   4509             inputChars = new char[inputCapacity];
   4510             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4511         }
   4512         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4513 
   4514         //
   4515         // Run the test, check for expected match/don't match result.
   4516         //
   4517         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4518         UBool found = testMat->find();
   4519         UBool expected = FALSE;
   4520         if (fields[2].indexOf(UChar_y) >=0) {
   4521             expected = TRUE;
   4522         }
   4523         if (expected != found) {
   4524             errln("line %d: Expected %smatch, got %smatch",
   4525                 lineNum, expected?"":"no ", found?"":"no " );
   4526             continue;
   4527         }
   4528 
   4529         // Don't try to check expected results if there is no match.
   4530         //   (Some have stuff in the expected fields)
   4531         if (!found) {
   4532             delete testMat;
   4533             delete testPat;
   4534             continue;
   4535         }
   4536 
   4537         //
   4538         // Interpret the Perl expression from the fourth field of the data file,
   4539         // building up an ICU string from the results of the ICU match.
   4540         //   The Perl expression will contain references to the results of
   4541         //     a regex match, including the matched string, capture group strings,
   4542         //     group starting and ending indicies, etc.
   4543         //
   4544         UnicodeString resultString;
   4545         UnicodeString perlExpr = fields[3];
   4546 
   4547         while (perlExpr.length() > 0) {
   4548             groupsMat->reset(perlExpr);
   4549             cgMat->reset(perlExpr);
   4550 
   4551             if (perlExpr.startsWith("$&")) {
   4552                 resultString.append(testMat->group(status));
   4553                 perlExpr.remove(0, 2);
   4554             }
   4555 
   4556             else if (groupsMat->lookingAt(status)) {
   4557                 // $-[0]   $+[2]  etc.
   4558                 UnicodeString digitString = groupsMat->group(2, status);
   4559                 int32_t t = 0;
   4560                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4561                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4562                 int32_t matchPosition;
   4563                 if (plusOrMinus.compare("+") == 0) {
   4564                     matchPosition = testMat->end(groupNum, status);
   4565                 } else {
   4566                     matchPosition = testMat->start(groupNum, status);
   4567                 }
   4568                 if (matchPosition != -1) {
   4569                     ICU_Utility::appendNumber(resultString, matchPosition);
   4570                 }
   4571                 perlExpr.remove(0, groupsMat->end(status));
   4572             }
   4573 
   4574             else if (cgMat->lookingAt(status)) {
   4575                 // $1, $2, $3, etc.
   4576                 UnicodeString digitString = cgMat->group(1, status);
   4577                 int32_t t = 0;
   4578                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4579                 if (U_SUCCESS(status)) {
   4580                     resultString.append(testMat->group(groupNum, status));
   4581                     status = U_ZERO_ERROR;
   4582                 }
   4583                 perlExpr.remove(0, cgMat->end(status));
   4584             }
   4585 
   4586             else if (perlExpr.startsWith("@-")) {
   4587                 int32_t i;
   4588                 for (i=0; i<=testMat->groupCount(); i++) {
   4589                     if (i>0) {
   4590                         resultString.append(" ");
   4591                     }
   4592                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4593                 }
   4594                 perlExpr.remove(0, 2);
   4595             }
   4596 
   4597             else if (perlExpr.startsWith("@+")) {
   4598                 int32_t i;
   4599                 for (i=0; i<=testMat->groupCount(); i++) {
   4600                     if (i>0) {
   4601                         resultString.append(" ");
   4602                     }
   4603                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4604                 }
   4605                 perlExpr.remove(0, 2);
   4606             }
   4607 
   4608             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4609                                                      //           or as an escaped sequence (e.g. \n)
   4610                 if (perlExpr.length() > 1) {
   4611                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4612                 }
   4613                 UChar c = perlExpr.charAt(0);
   4614                 switch (c) {
   4615                 case 'n':   c = '\n'; break;
   4616                 // add any other escape sequences that show up in the test expected results.
   4617                 }
   4618                 resultString.append(c);
   4619                 perlExpr.remove(0, 1);
   4620             }
   4621 
   4622             else  {
   4623                 // Any characters from the perl expression that we don't explicitly
   4624                 //  recognize before here are assumed to be literals and copied
   4625                 //  as-is to the expected results.
   4626                 resultString.append(perlExpr.charAt(0));
   4627                 perlExpr.remove(0, 1);
   4628             }
   4629 
   4630             if (U_FAILURE(status)) {
   4631                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4632                 break;
   4633             }
   4634         }
   4635 
   4636         //
   4637         // Expected Results Compare
   4638         //
   4639         UnicodeString expectedS(fields[4]);
   4640         expectedS.findAndReplace(nulnulSrc, nulnul);
   4641         expectedS.findAndReplace(ffffSrc,   ffff);
   4642         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4643 
   4644 
   4645         if (expectedS.compare(resultString) != 0) {
   4646             err("Line %d: Incorrect perl expression results.", lineNum);
   4647             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4648         }
   4649 
   4650         delete testMat;
   4651         delete testPat;
   4652     }
   4653 
   4654     //
   4655     // All done.  Clean up allocated stuff.
   4656     //
   4657     delete cgMat;
   4658     delete cgPat;
   4659 
   4660     delete groupsMat;
   4661     delete groupsPat;
   4662 
   4663     delete flagMat;
   4664     delete flagPat;
   4665 
   4666     delete lineMat;
   4667     delete linePat;
   4668 
   4669     delete fieldPat;
   4670     delete [] testData;
   4671 
   4672     utext_close(&patternText);
   4673     utext_close(&inputText);
   4674 
   4675     delete [] patternChars;
   4676     delete [] inputChars;
   4677 
   4678 
   4679     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4680 
   4681 }
   4682 
   4683 
   4684 //--------------------------------------------------------------
   4685 //
   4686 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4687 //             Use this pattern,
   4688 //                 "(a?){1,8000000}"
   4689 //             Note: the upper bound was originally unbounded, but that form now has loop-breaking enabled.
   4690 //                   This test is likely to be fragile, as further optimizations stop
   4691 //                   more cases of pointless looping in the match engine.
   4692 //
   4693 //---------------------------------------------------------------
   4694 void RegexTest::Bug6149() {
   4695     UnicodeString pattern("(a?){1,8000000}");
   4696     UnicodeString s("xyz");
   4697     uint32_t flags = 0;
   4698     UErrorCode status = U_ZERO_ERROR;
   4699 
   4700     RegexMatcher  matcher(pattern, s, flags, status);
   4701     UBool result = false;
   4702     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4703     REGEX_ASSERT(result == FALSE);
   4704  }
   4705 
   4706 
   4707 //
   4708 //   Callbacks()    Test the callback function.
   4709 //                  When set, callbacks occur periodically during matching operations,
   4710 //                  giving the application code the ability to abort the operation
   4711 //                  before its normal completion.
   4712 //
   4713 
   4714 struct callBackContext {
   4715     RegexTest        *test;
   4716     int32_t          maxCalls;
   4717     int32_t          numCalls;
   4718     int32_t          lastSteps;
   4719     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4720 };
   4721 
   4722 U_CDECL_BEGIN
   4723 static UBool U_CALLCONV
   4724 testCallBackFn(const void *context, int32_t steps) {
   4725     callBackContext  *info = (callBackContext *)context;
   4726     if (info->lastSteps+1 != steps) {
   4727         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4728     }
   4729     info->lastSteps = steps;
   4730     info->numCalls++;
   4731     return (info->numCalls < info->maxCalls);
   4732 }
   4733 U_CDECL_END
   4734 
   4735 void RegexTest::Callbacks() {
   4736    {
   4737         // Getter returns NULLs if no callback has been set
   4738 
   4739         //   The variables that the getter will fill in.
   4740         //   Init to non-null values so that the action of the getter can be seen.
   4741         const void          *returnedContext = &returnedContext;
   4742         URegexMatchCallback *returnedFn = &testCallBackFn;
   4743 
   4744         UErrorCode status = U_ZERO_ERROR;
   4745         RegexMatcher matcher("x", 0, status);
   4746         REGEX_CHECK_STATUS;
   4747         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4748         REGEX_CHECK_STATUS;
   4749         REGEX_ASSERT(returnedFn == NULL);
   4750         REGEX_ASSERT(returnedContext == NULL);
   4751     }
   4752 
   4753    {
   4754         // Set and Get work
   4755         callBackContext cbInfo = {this, 0, 0, 0};
   4756         const void          *returnedContext;
   4757         URegexMatchCallback *returnedFn;
   4758         UErrorCode status = U_ZERO_ERROR;
   4759         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4760         REGEX_CHECK_STATUS;
   4761         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4762         REGEX_CHECK_STATUS;
   4763         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4764         REGEX_CHECK_STATUS;
   4765         REGEX_ASSERT(returnedFn == testCallBackFn);
   4766         REGEX_ASSERT(returnedContext == &cbInfo);
   4767 
   4768         // A short-running match shouldn't invoke the callback
   4769         status = U_ZERO_ERROR;
   4770         cbInfo.reset(1);
   4771         UnicodeString s = "xxx";
   4772         matcher.reset(s);
   4773         REGEX_ASSERT(matcher.matches(status));
   4774         REGEX_CHECK_STATUS;
   4775         REGEX_ASSERT(cbInfo.numCalls == 0);
   4776 
   4777         // A medium-length match that runs long enough to invoke the
   4778         //   callback, but not so long that the callback aborts it.
   4779         status = U_ZERO_ERROR;
   4780         cbInfo.reset(4);
   4781         s = "aaaaaaaaaaaaaaaaaaab";
   4782         matcher.reset(s);
   4783         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4784         REGEX_CHECK_STATUS;
   4785         REGEX_ASSERT(cbInfo.numCalls > 0);
   4786 
   4787         // A longer running match that the callback function will abort.
   4788         status = U_ZERO_ERROR;
   4789         cbInfo.reset(4);
   4790         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4791         matcher.reset(s);
   4792         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4793         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4794         REGEX_ASSERT(cbInfo.numCalls == 4);
   4795     }
   4796 
   4797 
   4798 }
   4799 
   4800 
   4801 //
   4802 //   FindProgressCallbacks()    Test the find "progress" callback function.
   4803 //                  When set, the find progress callback will be invoked during a find operation
   4804 //                  after each return from a match attempt, giving the application the opportunity
   4805 //                  to terminate a long-running find operation before its normal completion.
   4806 //
   4807 
   4808 struct progressCallBackContext {
   4809     RegexTest        *test;
   4810     int64_t          lastIndex;
   4811     int32_t          maxCalls;
   4812     int32_t          numCalls;
   4813     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4814 };
   4815 
   4816 U_CDECL_BEGIN
   4817 static UBool U_CALLCONV
   4818 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4819     progressCallBackContext  *info = (progressCallBackContext *)context;
   4820     info->numCalls++;
   4821     info->lastIndex = matchIndex;
   4822 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4823     return (info->numCalls < info->maxCalls);
   4824 }
   4825 U_CDECL_END
   4826 
   4827 void RegexTest::FindProgressCallbacks() {
   4828    {
   4829         // Getter returns NULLs if no callback has been set
   4830 
   4831         //   The variables that the getter will fill in.
   4832         //   Init to non-null values so that the action of the getter can be seen.
   4833         const void                  *returnedContext = &returnedContext;
   4834         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4835 
   4836         UErrorCode status = U_ZERO_ERROR;
   4837         RegexMatcher matcher("x", 0, status);
   4838         REGEX_CHECK_STATUS;
   4839         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4840         REGEX_CHECK_STATUS;
   4841         REGEX_ASSERT(returnedFn == NULL);
   4842         REGEX_ASSERT(returnedContext == NULL);
   4843     }
   4844 
   4845    {
   4846         // Set and Get work
   4847         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4848         const void                  *returnedContext;
   4849         URegexFindProgressCallback  *returnedFn;
   4850         UErrorCode status = U_ZERO_ERROR;
   4851         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4852         REGEX_CHECK_STATUS;
   4853         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4854         REGEX_CHECK_STATUS;
   4855         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4856         REGEX_CHECK_STATUS;
   4857         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4858         REGEX_ASSERT(returnedContext == &cbInfo);
   4859 
   4860         // A short-running match should NOT invoke the callback.
   4861         status = U_ZERO_ERROR;
   4862         cbInfo.reset(100);
   4863         UnicodeString s = "abxxx";
   4864         matcher.reset(s);
   4865 #if 0
   4866         matcher.setTrace(TRUE);
   4867 #endif
   4868         REGEX_ASSERT(matcher.find(0, status));
   4869         REGEX_CHECK_STATUS;
   4870         REGEX_ASSERT(cbInfo.numCalls == 0);
   4871 
   4872         // A medium running match that causes matcher.find() to invoke our callback for each index.
   4873         status = U_ZERO_ERROR;
   4874         s = "aaaaaaaaaaaaaaaaaaab";
   4875         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4876         matcher.reset(s);
   4877         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4878         REGEX_CHECK_STATUS;
   4879         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4880 
   4881         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4882         status = U_ZERO_ERROR;
   4883         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4884         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4885         matcher.reset(s1);
   4886         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4887         REGEX_CHECK_STATUS;
   4888         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4889 
   4890 #if 0
   4891         // Now a match that will succeed, but after an interruption
   4892         status = U_ZERO_ERROR;
   4893         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4894         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4895         matcher.reset(s2);
   4896         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4897         REGEX_CHECK_STATUS;
   4898         // Now retry the match from where left off
   4899         cbInfo.maxCalls = 100; //  No callback limit
   4900         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   4901         REGEX_CHECK_STATUS;
   4902 #endif
   4903     }
   4904 
   4905 
   4906 }
   4907 
   4908 
   4909 //---------------------------------------------------------------------------
   4910 //
   4911 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4912 //                             UTexts. The pure-C implementation of UText
   4913 //                             has no mutable backing stores, but we can
   4914 //                             use UnicodeString here to test the functionality.
   4915 //
   4916 //---------------------------------------------------------------------------
   4917 void RegexTest::PreAllocatedUTextCAPI () {
   4918     UErrorCode           status = U_ZERO_ERROR;
   4919     URegularExpression  *re;
   4920     UText                patternText = UTEXT_INITIALIZER;
   4921     UnicodeString        buffer;
   4922     UText                bufferText = UTEXT_INITIALIZER;
   4923 
   4924     utext_openUnicodeString(&bufferText, &buffer, &status);
   4925 
   4926     /*
   4927      *  getText() and getUText()
   4928      */
   4929     {
   4930         UText  text1 = UTEXT_INITIALIZER;
   4931         UText  text2 = UTEXT_INITIALIZER;
   4932         UChar  text2Chars[20];
   4933         UText  *resultText;
   4934 
   4935         status = U_ZERO_ERROR;
   4936         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   4937         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   4938         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4939         utext_openUChars(&text2, text2Chars, -1, &status);
   4940 
   4941         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   4942         re = uregex_openUText(&patternText, 0, NULL, &status);
   4943 
   4944         /* First set a UText */
   4945         uregex_setUText(re, &text1, &status);
   4946         resultText = uregex_getUText(re, &bufferText, &status);
   4947         REGEX_CHECK_STATUS;
   4948         REGEX_ASSERT(resultText == &bufferText);
   4949         utext_setNativeIndex(resultText, 0);
   4950         utext_setNativeIndex(&text1, 0);
   4951         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4952 
   4953         resultText = uregex_getUText(re, &bufferText, &status);
   4954         REGEX_CHECK_STATUS;
   4955         REGEX_ASSERT(resultText == &bufferText);
   4956         utext_setNativeIndex(resultText, 0);
   4957         utext_setNativeIndex(&text1, 0);
   4958         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4959 
   4960         /* Then set a UChar * */
   4961         uregex_setText(re, text2Chars, 7, &status);
   4962         resultText = uregex_getUText(re, &bufferText, &status);
   4963         REGEX_CHECK_STATUS;
   4964         REGEX_ASSERT(resultText == &bufferText);
   4965         utext_setNativeIndex(resultText, 0);
   4966         utext_setNativeIndex(&text2, 0);
   4967         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   4968 
   4969         uregex_close(re);
   4970         utext_close(&text1);
   4971         utext_close(&text2);
   4972     }
   4973 
   4974     /*
   4975      *  group()
   4976      */
   4977     {
   4978         UChar    text1[80];
   4979         UText   *actual;
   4980         UBool    result;
   4981         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
   4982 
   4983         status = U_ZERO_ERROR;
   4984         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   4985         REGEX_CHECK_STATUS;
   4986 
   4987         uregex_setText(re, text1, -1, &status);
   4988         result = uregex_find(re, 0, &status);
   4989         REGEX_ASSERT(result==TRUE);
   4990 
   4991         /*  Capture Group 0, the full match.  Should succeed.  */
   4992         status = U_ZERO_ERROR;
   4993         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
   4994         REGEX_CHECK_STATUS;
   4995         REGEX_ASSERT(actual == &bufferText);
   4996         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
   4997 
   4998         /*  Capture group #1.  Should succeed. */
   4999         status = U_ZERO_ERROR;
   5000         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
   5001         REGEX_CHECK_STATUS;
   5002         REGEX_ASSERT(actual == &bufferText);
   5003         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
   5004 
   5005         /*  Capture group out of range.  Error. */
   5006         status = U_ZERO_ERROR;
   5007         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
   5008         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5009         REGEX_ASSERT(actual == &bufferText);
   5010 
   5011         uregex_close(re);
   5012 
   5013     }
   5014 
   5015     /*
   5016      *  replaceFirst()
   5017      */
   5018     {
   5019         UChar    text1[80];
   5020         UChar    text2[80];
   5021         UText    replText = UTEXT_INITIALIZER;
   5022         UText   *result;
   5023 
   5024         status = U_ZERO_ERROR;
   5025         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5026         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5027         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5028 
   5029         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5030         REGEX_CHECK_STATUS;
   5031 
   5032         /*  Normal case, with match */
   5033         uregex_setText(re, text1, -1, &status);
   5034         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5035         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5036         REGEX_CHECK_STATUS;
   5037         REGEX_ASSERT(result == &bufferText);
   5038         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5039 
   5040         /* No match.  Text should copy to output with no changes.  */
   5041         uregex_setText(re, text2, -1, &status);
   5042         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5043         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5044         REGEX_CHECK_STATUS;
   5045         REGEX_ASSERT(result == &bufferText);
   5046         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5047 
   5048         /* Unicode escapes */
   5049         uregex_setText(re, text1, -1, &status);
   5050         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
   5051         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5052         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5053         REGEX_CHECK_STATUS;
   5054         REGEX_ASSERT(result == &bufferText);
   5055         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5056 
   5057         uregex_close(re);
   5058         utext_close(&replText);
   5059     }
   5060 
   5061 
   5062     /*
   5063      *  replaceAll()
   5064      */
   5065     {
   5066         UChar    text1[80];
   5067         UChar    text2[80];
   5068         UText    replText = UTEXT_INITIALIZER;
   5069         UText   *result;
   5070 
   5071         status = U_ZERO_ERROR;
   5072         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5073         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5074         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5075 
   5076         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5077         REGEX_CHECK_STATUS;
   5078 
   5079         /*  Normal case, with match */
   5080         uregex_setText(re, text1, -1, &status);
   5081         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5082         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5083         REGEX_CHECK_STATUS;
   5084         REGEX_ASSERT(result == &bufferText);
   5085         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5086 
   5087         /* No match.  Text should copy to output with no changes.  */
   5088         uregex_setText(re, text2, -1, &status);
   5089         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5090         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5091         REGEX_CHECK_STATUS;
   5092         REGEX_ASSERT(result == &bufferText);
   5093         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5094 
   5095         uregex_close(re);
   5096         utext_close(&replText);
   5097     }
   5098 
   5099 
   5100     /*
   5101      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5102      *   so we don't need to test it here.
   5103      */
   5104 
   5105     utext_close(&bufferText);
   5106     utext_close(&patternText);
   5107 }
   5108 
   5109 //--------------------------------------------------------------
   5110 //
   5111 //  Bug7651   Regex pattern that exceeds the default operator stack depth in the
   5112 //            matcher.  Each pattern below must compile and match at offset 0.
   5113 //---------------------------------------------------------------
   5114 void RegexTest::Bug7651() {
   5115     UErrorCode status = U_ZERO_ERROR;
   5116     UParseError pe;
   5117     UnicodeString s("#ff @abcd This is test");
   5118     {   //  First form of the pattern.
   5119         UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5120         RegexPattern  *REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5121         REGEX_CHECK_STATUS;
   5122         RegexMatcher  *REMatcher = REPattern->matcher(s, status);
   5123         REGEX_CHECK_STATUS;
   5124         REGEX_ASSERT(REMatcher->find());
   5125         REGEX_ASSERT(REMatcher->start(status) == 0);
   5126         delete REPattern;
   5127         delete REMatcher;
   5128         status = U_ZERO_ERROR;
   5129     }
   5130 
   5131     {   //  This form should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5132         //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5133         UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5134         RegexPattern  *REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5135         REGEX_CHECK_STATUS;
   5136         RegexMatcher  *REMatcher = REPattern->matcher(s, status);
   5137         REGEX_CHECK_STATUS;
   5138         REGEX_ASSERT(REMatcher->find());
   5139         REGEX_ASSERT(REMatcher->start(status) == 0);
   5140         delete REPattern;
   5141         delete REMatcher;
   5142         status = U_ZERO_ERROR;
   5143     }
   5144 }
   5145 
   5146 void RegexTest::Bug7740() {
   5147     // Bug 7740: group() must cope with a failure status that is already set on entry.
   5148     UErrorCode status = U_ZERO_ERROR;
   5149     UnicodeString pattern = "(a)";
   5150     UnicodeString text = "abcdef";
   5151     RegexMatcher m(pattern, text, 0, status);   // automatic storage: released on every exit path
   5152     REGEX_CHECK_STATUS;
   5153     REGEX_ASSERT(m.lookingAt(status));
   5154     REGEX_CHECK_STATUS;
   5155     status = U_ILLEGAL_ARGUMENT_ERROR;          // pre-set failure; it must come back unchanged
   5156     UnicodeString s = m.group(1, status);       // Bug 7740: segfault here.
   5157     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5158     REGEX_ASSERT(s == "");
   5159 }
   5160 
   5161 // Bug 8479:  was crashing with a Bogus UnicodeString as input.
   5162 //            matches() on such input is expected to set U_ILLEGAL_ARGUMENT_ERROR.
   5163 void RegexTest::Bug8479() {
   5164     UErrorCode status = U_ZERO_ERROR;
   5165     // Declared before the matcher so that it is destroyed after it.
   5166     UnicodeString str;
   5167     str.setToBogus();
   5168 
   5169     // Automatic storage: the matcher is released even when construction reports a failure.
   5170     RegexMatcher matcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5171     REGEX_CHECK_STATUS;
   5172     if (U_SUCCESS(status)) {
   5173         matcher.reset(str);
   5174         status = U_ZERO_ERROR;
   5175         matcher.matches(status);
   5176         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5177     }
   5178 }
   5179 
   5180 
   5181 // Bug 7029:  split() of "abc.def" on the pattern "." is expected to report 8 fields.
   5182 void RegexTest::Bug7029() {
   5183     UErrorCode status = U_ZERO_ERROR;
   5184     UnicodeString text = "abc.def";
   5185     UnicodeString splits[10];
   5186     // Automatic storage, declared after text and splits: the matcher is released on
   5187     // every exit path and is destroyed before the strings handed to split().
   5188     RegexMatcher matcher(".", 0, status);
   5189     REGEX_CHECK_STATUS;
   5190     int32_t numFields = matcher.split(text, splits, 10, status);
   5191     REGEX_CHECK_STATUS;
   5192     REGEX_ASSERT(numFields == 8);
   5193 }
   5194 
   5195 // Bug 9283
   5196 //   This test checks for the existence of any supplementary characters that case-fold
   5197 //   to a BMP character.
   5198 //
   5199 //   At the time of this writing there are none. If any should appear in a subsequent release
   5200 //   of Unicode, the code in regular expression compilation that determines the longest
   5201 //   possible match for a literal string will need to be enhanced.
   5202 //
   5203 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5204 //   for details on what to do in case of a failure of this test.
   5205 //
   5206 void RegexTest::Bug9283() {
   5207     UErrorCode status = U_ZERO_ERROR;
   5208     // Members of [:CWCF:] restricted to the range U+10000..U+10FFFF.
   5209     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5210     REGEX_CHECK_STATUS;
   5211 
   5212     // Visit the set by position; the walk stops at the first -1 returned by charAt().
   5213     int32_t index = 0;
   5214     UChar32 c;
   5215     while ((c = supplementalsWithCaseFolding.charAt(index++)) != -1) {
   5216         UnicodeString cf(c);
   5217         cf.foldCase();
   5218         REGEX_ASSERT(cf.length() >= 2);   // a length of 1 would be a fold to a single BMP unit
   5219     }
   5220 }
   5221 
   5222 
   5223 void RegexTest::CheckInvBufSize() {
   5224   // Log the usage while inv_next is still below INV_BUFSIZ; otherwise report an error.
   5225   if (inv_next < INV_BUFSIZ) {
   5226     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5227     return;
   5228   }
   5229   errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", __FILE__, INV_BUFSIZ, inv_next);
   5230 }
   5231 
   5232 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5233 
   5234