Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2014, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 /*
     14      NOTE!!
     15 
     16      PLEASE be careful about ASCII assumptions in this test.
     17      This test is one of the worst repeat offenders.
     18      If you have questions, contact someone on the ICU PMC
     19      who has access to an EBCDIC system.
     20 
     21  */
     22 
     23 #include "intltest.h"
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     25 
     26 #include "unicode/regex.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/ucnv.h"
     29 #include "unicode/uniset.h"
     30 #include "unicode/uregex.h"
     31 #include "unicode/ustring.h"
     32 #include "regextst.h"
     33 #include "uvector.h"
     34 #include "util.h"
     35 #include <stdlib.h>
     36 #include <string.h>
     37 #include <stdio.h>
     38 #include "cstring.h"
     39 #include "uinvchar.h"
     40 
     41 #define SUPPORT_MUTATING_INPUT_STRING   0
     42 
     43 //---------------------------------------------------------------------------
     44 //
     45 //  Test class boilerplate
     46 //
     47 //---------------------------------------------------------------------------
     48 RegexTest::RegexTest()
     49 {
     50 }
     51 
     52 
     53 RegexTest::~RegexTest()
     54 {
     55 }
     56 
     57 
     58 
     59 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     60 {
     61     if (exec) logln("TestSuite RegexTest: ");
     62     switch (index) {
     63 
     64         case 0: name = "Basic";
     65             if (exec) Basic();
     66             break;
     67         case 1: name = "API_Match";
     68             if (exec) API_Match();
     69             break;
     70         case 2: name = "API_Replace";
     71             if (exec) API_Replace();
     72             break;
     73         case 3: name = "API_Pattern";
     74             if (exec) API_Pattern();
     75             break;
     76         case 4:
     77 #if !UCONFIG_NO_FILE_IO
     78             name = "Extended";
     79             if (exec) Extended();
     80 #else
     81             name = "skip";
     82 #endif
     83             break;
     84         case 5: name = "Errors";
     85             if (exec) Errors();
     86             break;
     87         case 6: name = "PerlTests";
     88             if (exec) PerlTests();
     89             break;
     90         case 7: name = "Callbacks";
     91             if (exec) Callbacks();
     92             break;
     93         case 8: name = "FindProgressCallbacks";
     94             if (exec) FindProgressCallbacks();
     95             break;
     96         case 9: name = "Bug 6149";
     97              if (exec) Bug6149();
     98              break;
     99         case 10: name = "UTextBasic";
    100           if (exec) UTextBasic();
    101           break;
    102         case 11: name = "API_Match_UTF8";
    103           if (exec) API_Match_UTF8();
    104           break;
    105         case 12: name = "API_Replace_UTF8";
    106           if (exec) API_Replace_UTF8();
    107           break;
    108         case 13: name = "API_Pattern_UTF8";
    109           if (exec) API_Pattern_UTF8();
    110           break;
    111         case 14: name = "PerlTestsUTF8";
    112           if (exec) PerlTestsUTF8();
    113           break;
    114         case 15: name = "PreAllocatedUTextCAPI";
    115           if (exec) PreAllocatedUTextCAPI();
    116           break;
    117         case 16: name = "Bug 7651";
    118              if (exec) Bug7651();
    119              break;
    120         case 17: name = "Bug 7740";
    121             if (exec) Bug7740();
    122             break;
    123         case 18: name = "Bug 8479";
    124             if (exec) Bug8479();
    125             break;
    126         case 19: name = "Bug 7029";
    127             if (exec) Bug7029();
    128             break;
    129         case 20: name = "CheckInvBufSize";
    130             if (exec) CheckInvBufSize();
    131             break;
    132         case 21: name = "Bug 9283";
    133             if (exec) Bug9283();
    134             break;
    135         case 22: name = "Bug10459";
    136             if (exec) Bug10459();
    137             break;
    138 
    139         default: name = "";
    140             break; //needed to end loop
    141     }
    142 }
    143 
    144 
    145 
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 *
 * @param ut      UText handed on to utext_openUTF8
 * @param inv     NUL-terminated invariant-character text
 * @param length  length of inv, or -1 to have it measured with strlen()
 * @param status  ICU error code handed on to utext_openUTF8; on non-ASCII
 *                platforms it is set to U_MEMORY_ALLOCATION_ERROR when the
 *                static conversion buffer has no room left
 * @return whatever utext_openUTF8 returns, or NULL when the conversion
 *         buffer has no room left
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    152 
    153 //---------------------------------------------------------------------------
    154 //
    155 //   Error Checking / Reporting macros used in all of the tests.
    156 //
    157 //---------------------------------------------------------------------------
    158 
    159 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    160   int64_t oldIndex = utext_getNativeIndex(text);
    161   utext_setNativeIndex(text, 0);
    162   char *bufPtr = buf;
    163   UChar32 c = utext_next32From(text, 0);
    164   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    165     if (0x000020<=c && c<0x00007e) {
    166       *bufPtr = c;
    167     } else {
    168 #if 0
    169       sprintf(bufPtr,"U+%04X", c);
    170       bufPtr+= strlen(bufPtr)-1;
    171 #else
    172       *bufPtr = '%';
    173 #endif
    174     }
    175     bufPtr++;
    176     c = UTEXT_NEXT32(text);
    177   }
    178   *bufPtr = 0;
    179 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    180   char *ebuf = (char*)malloc(bufLen);
    181   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    182   uprv_strncpy(buf, ebuf, bufLen);
    183   free((void*)ebuf);
    184 #endif
    185   utext_setNativeIndex(text, oldIndex);
    186 }
    187 
    188 
    189 static char ASSERT_BUF[1024];
    190 
    191 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    192   if(message.length()==0) {
    193     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    194   } else {
    195     UnicodeString buf;
    196     IntlTest::prettify(message,buf);
    197     if(buf.length()==0) {
    198       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    199     } else {
    200       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    201       if(ASSERT_BUF[0]==0) {
    202         ASSERT_BUF[0]=0;
    203         for(int32_t i=0;i<buf.length();i++) {
    204           UChar ch = buf[i];
    205           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    206         }
    207       }
    208     }
    209   }
    210   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    211   return ASSERT_BUF;
    212 }
    213 
// Number of elements in a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Logs a printable rendering (at most 199 chars) of a UText, tagged with file, line and the argument's spelling.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable `status` holds a failure: report it with dataerrln
// and return from the enclosing function (which must therefore return void).
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error when expr evaluates to FALSE.  Does not return; the test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local `status` (which shadows any outer one) and
// reports an error unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number and does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also prints a caller-supplied line number and returns
// from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports an error unless ustr == inv; the message shows ustr via extractToAssertBuf().  Does not return.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
    234 
    235 
    236 static UBool testUTextEqual(UText *uta, UText *utb) {
    237     UChar32 ca = 0;
    238     UChar32 cb = 0;
    239     utext_setNativeIndex(uta, 0);
    240     utext_setNativeIndex(utb, 0);
    241     do {
    242         ca = utext_next32(uta);
    243         cb = utext_next32(utb);
    244         if (ca != cb) {
    245             break;
    246         }
    247     } while (ca != U_SENTINEL);
    248     return ca == cb;
    249 }
    250 
    251 
    252 /**
    253  * @param expected expected text in UTF-8 (not platform) codepage
    254  */
    255 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    256     UErrorCode status = U_ZERO_ERROR;
    257     UText expectedText = UTEXT_INITIALIZER;
    258     utext_openUTF8(&expectedText, expected, -1, &status);
    259     if(U_FAILURE(status)) {
    260       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    261       return;
    262     }
    263     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    264       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    265       return;
    266     }
    267     utext_setNativeIndex(actual, 0);
    268     if (!testUTextEqual(&expectedText, actual)) {
    269         char buf[201 /*21*/];
    270         char expectedBuf[201];
    271         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    272         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    273         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    274     }
    275     utext_close(&expectedText);
    276 }
    277 /**
    278  * @param expected invariant (platform local text) input
    279  */
    280 
    281 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    282     UErrorCode status = U_ZERO_ERROR;
    283     UText expectedText = UTEXT_INITIALIZER;
    284     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    285     if(U_FAILURE(status)) {
    286       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    287       return;
    288     }
    289     utext_setNativeIndex(actual, 0);
    290     if (!testUTextEqual(&expectedText, actual)) {
    291         char buf[201 /*21*/];
    292         char expectedBuf[201];
    293         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    294         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    295         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    296     }
    297     utext_close(&expectedText);
    298 }
    299 
/**
 * Checks that the UText `actual` holds the text `expected`, reporting the
 * calling file and line on mismatch.  `expected` must be UTF-8.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Same check, but `expected` is given in invariant characters
 * (platform-local text).
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    308 
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  It is only compiled in when the platform
 * charset family is not ASCII (i.e. on EBCDIC systems).
 */

#define INV_BUFSIZ 2048 /* increase this if too small */

/* Running total of the lengths passed to regextst_openUTF8FromInvariant().
 * On non-ASCII platforms it is also the offset of the next free byte in inv_buf. */
static int64_t inv_next=0;

#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
static char inv_buf[INV_BUFSIZ];
#endif
    322 
    323 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    324   if(length==-1) length=strlen(inv);
    325 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    326   inv_next+=length;
    327   return utext_openUTF8(ut, inv, length, status);
    328 #else
    329   if(inv_next+length+1>INV_BUFSIZ) {
    330     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    331             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    332     *status = U_MEMORY_ALLOCATION_ERROR;
    333     return NULL;
    334   }
    335 
    336   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    337   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    338   inv_next+=length;
    339 
    340 #if 0
    341   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    342 #endif
    343 
    344   return utext_openUTF8(ut, (const char*)buf, length, status);
    345 #endif
    346 }
    347 
    348 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once with doRegexLMTest() and
//          once with doRegexLMTestUTF8().
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    364 
    365 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    366     const UnicodeString pattern(pat, -1, US_INV);
    367     const UnicodeString inputText(text, -1, US_INV);
    368     UErrorCode          status  = U_ZERO_ERROR;
    369     UParseError         pe;
    370     RegexPattern        *REPattern = NULL;
    371     RegexMatcher        *REMatcher = NULL;
    372     UBool               retVal     = TRUE;
    373 
    374     UnicodeString patString(pat, -1, US_INV);
    375     REPattern = RegexPattern::compile(patString, 0, pe, status);
    376     if (U_FAILURE(status)) {
    377         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    378             line, u_errorName(status));
    379         return FALSE;
    380     }
    381     if (line==376) { REPattern->dumpPattern();}
    382 
    383     UnicodeString inputString(inputText);
    384     UnicodeString unEscapedInput = inputString.unescape();
    385     REMatcher = REPattern->matcher(unEscapedInput, status);
    386     if (U_FAILURE(status)) {
    387         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    388             line, u_errorName(status));
    389         return FALSE;
    390     }
    391 
    392     UBool actualmatch;
    393     actualmatch = REMatcher->lookingAt(status);
    394     if (U_FAILURE(status)) {
    395         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    396             line, u_errorName(status));
    397         retVal =  FALSE;
    398     }
    399     if (actualmatch != looking) {
    400         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    401         retVal = FALSE;
    402     }
    403 
    404     status = U_ZERO_ERROR;
    405     actualmatch = REMatcher->matches(status);
    406     if (U_FAILURE(status)) {
    407         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    408             line, u_errorName(status));
    409         retVal = FALSE;
    410     }
    411     if (actualmatch != match) {
    412         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    413         retVal = FALSE;
    414     }
    415 
    416     if (retVal == FALSE) {
    417         REPattern->dumpPattern();
    418     }
    419 
    420     delete REPattern;
    421     delete REMatcher;
    422     return retVal;
    423 }
    424 
    425 
    426 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    427     UText               pattern    = UTEXT_INITIALIZER;
    428     int32_t             inputUTF8Length;
    429     char                *textChars = NULL;
    430     UText               inputText  = UTEXT_INITIALIZER;
    431     UErrorCode          status     = U_ZERO_ERROR;
    432     UParseError         pe;
    433     RegexPattern        *REPattern = NULL;
    434     RegexMatcher        *REMatcher = NULL;
    435     UBool               retVal     = TRUE;
    436 
    437     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    438     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    439     if (U_FAILURE(status)) {
    440         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    441             line, u_errorName(status));
    442         return FALSE;
    443     }
    444 
    445     UnicodeString inputString(text, -1, US_INV);
    446     UnicodeString unEscapedInput = inputString.unescape();
    447     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    448     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    449 
    450     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    451     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    452         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    453         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    454         return TRUE; // not a failure of the Regex engine
    455     }
    456     status = U_ZERO_ERROR; // buffer overflow
    457     textChars = new char[inputUTF8Length+1];
    458     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    459     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    460 
    461     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    462     if (U_FAILURE(status)) {
    463         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    464             line, u_errorName(status));
    465         return FALSE;
    466     }
    467 
    468     UBool actualmatch;
    469     actualmatch = REMatcher->lookingAt(status);
    470     if (U_FAILURE(status)) {
    471         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    472             line, u_errorName(status));
    473         retVal =  FALSE;
    474     }
    475     if (actualmatch != looking) {
    476         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    477         retVal = FALSE;
    478     }
    479 
    480     status = U_ZERO_ERROR;
    481     actualmatch = REMatcher->matches(status);
    482     if (U_FAILURE(status)) {
    483         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    484             line, u_errorName(status));
    485         retVal = FALSE;
    486     }
    487     if (actualmatch != match) {
    488         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    489         retVal = FALSE;
    490     }
    491 
    492     if (retVal == FALSE) {
    493         REPattern->dumpPattern();
    494     }
    495 
    496     delete REPattern;
    497     delete REMatcher;
    498     utext_close(&inputText);
    499     utext_close(&pattern);
    500     delete[] textChars;
    501     return retVal;
    502 }
    503 
    504 
    505 
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                    regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note: the macro expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    516 
    517 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    518                           UErrorCode expectedStatus, int32_t line) {
    519     UnicodeString       pattern(pat);
    520 
    521     UErrorCode          status         = U_ZERO_ERROR;
    522     UParseError         pe;
    523     RegexPattern        *callerPattern = NULL;
    524 
    525     //
    526     //  Compile the caller's pattern
    527     //
    528     UnicodeString patString(pat);
    529     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    530     if (status != expectedStatus) {
    531         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    532     } else {
    533         if (status != U_ZERO_ERROR) {
    534             if (pe.line != errLine || pe.offset != errCol) {
    535                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    536                     line, errLine, errCol, pe.line, pe.offset);
    537             }
    538         }
    539     }
    540 
    541     delete callerPattern;
    542 
    543     //
    544     //  Compile again, using a UTF-8-based UText
    545     //
    546     UText patternText = UTEXT_INITIALIZER;
    547     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    548     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    549     if (status != expectedStatus) {
    550         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    551     } else {
    552         if (status != U_ZERO_ERROR) {
    553             if (pe.line != errLine || pe.offset != errCol) {
    554                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    555                     line, errLine, errCol, pe.line, pe.offset);
    556             }
    557         }
    558     }
    559 
    560     delete callerPattern;
    561     utext_close(&patternText);
    562 }
    563 
    564 
    565 
    566 //---------------------------------------------------------------------------
    567 //
    568 //      Basic      Check for basic functionality of regex pattern matching.
    569 //                 Avoid the use of REGEX_FIND test macro, which has
    570 //                 substantial dependencies on basic Regex functionality.
    571 //
    572 //---------------------------------------------------------------------------
// Basic pattern-matching checks.  Every REGEX_TESTLM line lists, in order:
// the pattern, the input text (which is unescaped before matching), the
// expected lookingAt() result and the expected matches() result.
void RegexTest::Basic() {


//
// Debug - move failing test cases here so that they run first.
//         Disabled in normal builds.
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);   // NOTE(review): exact duplicate of the previous case
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
    719 
    720 
    721 //---------------------------------------------------------------------------
    722 //
    723 //    UTextBasic   Check for quirks that are specific to the UText
    724 //                 implementation.
    725 //
    726 //---------------------------------------------------------------------------
    727 void RegexTest::UTextBasic() {
            // Builds a RegexMatcher whose pattern and input are both UText wrappers
            // over the same UTF-8 bytes ("abc"), then checks that inputText() still
            // reads back as "abc" after each reset().
    728     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    729     UErrorCode status = U_ZERO_ERROR;
            // Pattern: a stack-allocated UText opened over the UTF-8 bytes.
    730     UText pattern = UTEXT_INITIALIZER;
    731     utext_openUTF8(&pattern, str_abc, -1, &status);
    732     RegexMatcher matcher(&pattern, 0, status);
    733     REGEX_CHECK_STATUS;
    734
            // Input: a second UText opened over the same bytes.
    735     UText input = UTEXT_INITIALIZER;
    736     utext_openUTF8(&input, str_abc, -1, &status);
    737     REGEX_CHECK_STATUS;
    738     matcher.reset(&input);
    739     REGEX_CHECK_STATUS;
    740     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    741
            // Reset the matcher using the very UText it reports as its current input;
            // the input text must be unchanged afterwards.
    742     matcher.reset(matcher.inputText());
    743     REGEX_CHECK_STATUS;
    744     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    745
            // NOTE(review): if REGEX_CHECK_STATUS returns from the function on failure
            // (macro defined elsewhere - confirm), these closes are skipped on that path.
    746     utext_close(&pattern);
    747     utext_close(&input);
    748 }
    749 
    750 
    751 //---------------------------------------------------------------------------
    752 //
    753 //      API_Match   Test that the API for class RegexMatcher
    754 //                  is present and nominally working, but excluding functions
    755 //                  implementing replace operations.
    756 //
    757 //---------------------------------------------------------------------------
    758 void RegexTest::API_Match() {
            // Walks the non-replace RegexMatcher API in a series of scoped sections:
            // matcher creation/reset, positional matches()/lookingAt(), capture groups,
            // find(), regions, hitEnd()/requireEnd(), time limits and stack limits.
            // Most sections below declare their own flags/pe/status, which shadow
            // the three function-level variables declared here.
    759     UParseError         pe;
    760     UErrorCode          status=U_ZERO_ERROR;
    761     int32_t             flags = 0;
    762
    763     //
    764     // Debug aid - move a failing test case into the (normally disabled) block below to run it first
    765     //
    766 #if 0
    767     {
    768     }
    769     return;
    770 #endif
    771
    772     //
    773     // Simple pattern compilation
    774     //
    775     {
    776         UnicodeString       re("abc");
    777         RegexPattern        *pat2;
    778         pat2 = RegexPattern::compile(re, flags, pe, status);
    779         REGEX_CHECK_STATUS;
    780
    781         UnicodeString inStr1 = "abcdef this is a test";
    782         UnicodeString instr2 = "not abc";
    783         UnicodeString empty  = "";
    784
    785
    786         //
    787         // Matcher creation and reset.
    788         //
    789         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    790         REGEX_CHECK_STATUS;
    791         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    792         REGEX_ASSERT(m1->input() == inStr1);
    793         m1->reset(instr2);
    794         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    795         REGEX_ASSERT(m1->input() == instr2);
    796         m1->reset(inStr1);
    797         REGEX_ASSERT(m1->input() == inStr1);
    798         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    799         m1->reset(empty);
    800         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    801         REGEX_ASSERT(m1->input() == empty);
    802         REGEX_ASSERT(&m1->pattern() == pat2);
    803
    804         //
    805         //  reset(pos, status)
    806         //
    807         m1->reset(inStr1);
    808         m1->reset(4, status);
    809         REGEX_CHECK_STATUS;
    810         REGEX_ASSERT(m1->input() == inStr1);
    811         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    812
                // Positions -1 and len+1 are expected to be rejected; 0, len-1 and len are accepted.
    813         m1->reset(-1, status);
    814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    815         status = U_ZERO_ERROR;
    816
    817         m1->reset(0, status);
    818         REGEX_CHECK_STATUS;
    819         status = U_ZERO_ERROR;
    820
    821         int32_t len = m1->input().length();
    822         m1->reset(len-1, status);
    823         REGEX_CHECK_STATUS;
    824         status = U_ZERO_ERROR;
    825
    826         m1->reset(len, status);
    827         REGEX_CHECK_STATUS;
    828         status = U_ZERO_ERROR;
    829
    830         m1->reset(len+1, status);
    831         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    832         status = U_ZERO_ERROR;
    833
    834         //
    835         // matches(pos, status)
    836         //
                // instr2 is "not abc": the text "abc" occupies index 4 through the end.
    837         m1->reset(instr2);
    838         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    839         m1->reset();
    840         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    841         m1->reset();
    842         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    843         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    844         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    845         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    846
    847         // Match() at end of string should fail, but should not
    848         //  be an error.
    849         status = U_ZERO_ERROR;
    850         len = m1->input().length();
    851         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    852         REGEX_CHECK_STATUS;
    853
    854         // Match beyond end of string should fail with an error.
    855         status = U_ZERO_ERROR;
    856         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    858
    859         // Successful match at end of string.
    860         {
    861             status = U_ZERO_ERROR;
    862             RegexMatcher m("A?", 0, status);  // will match zero length string.
    863             REGEX_CHECK_STATUS;
    864             m.reset(inStr1);
    865             len = inStr1.length();
    866             REGEX_ASSERT(m.matches(len, status) == TRUE);
    867             REGEX_CHECK_STATUS;
    868             m.reset(empty);
    869             REGEX_ASSERT(m.matches(0, status) == TRUE);
    870             REGEX_CHECK_STATUS;
    871         }
    872
    873
    874         //
    875         // lookingAt(pos, status)
    876         //
    877         status = U_ZERO_ERROR;
    878         m1->reset(instr2);  // "not abc"
    879         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    880         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    881         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    882         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    883         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    885         status = U_ZERO_ERROR;
    886         len = m1->input().length();
    887         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    888         REGEX_CHECK_STATUS;
    889         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    890         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    891
    892         delete m1;
    893         delete pat2;
    894     }
    895
    896
    897     //
    898     // Capture Group.
    899     //     RegexMatcher::start();
    900     //     RegexMatcher::end();
    901     //     RegexMatcher::groupCount();
    902     //
    903     {
    904         int32_t             flags=0;
    905         UParseError         pe;
    906         UErrorCode          status=U_ZERO_ERROR;
    907
    908         UnicodeString       re("01(23(45)67)(.*)");
    909         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    910         REGEX_CHECK_STATUS;
    911         UnicodeString data = "0123456789";
    912
    913         RegexMatcher *matcher = pat->matcher(data, status);
    914         REGEX_CHECK_STATUS;
    915         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected start/end indexes for groups 0..3; the array index is the group number.
    916         static const int32_t matchStarts[] = {0,  2, 4, 8};
    917         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    918         int32_t i;
    919         for (i=0; i<4; i++) {
    920             int32_t actualStart = matcher->start(i, status);
    921             REGEX_CHECK_STATUS;
    922             if (actualStart != matchStarts[i]) {
    923                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    924                     __LINE__, i, matchStarts[i], actualStart);
    925             }
    926             int32_t actualEnd = matcher->end(i, status);
    927             REGEX_CHECK_STATUS;
    928             if (actualEnd != matchEnds[i]) {
    929                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    930                     __LINE__, i, matchEnds[i], actualEnd);
    931             }
    932         }
    933
                // The no-argument start()/end() must agree with group 0.
    934         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    935         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    936
    937         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    938         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
                // After reset() there is no current match, so start() is expected to report an invalid state.
    939         matcher->reset();
    940         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    941
    942         matcher->lookingAt(status);
    943         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    944         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    945         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    946         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    947         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    948         REGEX_CHECK_STATUS;
    949         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    950         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    951         matcher->reset();
    952         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    953
    954         delete matcher;
    955         delete pat;
    956
    957     }
    958
    959     //
    960     //  find
    961     //
    962     {
    963         int32_t             flags=0;
    964         UParseError         pe;
    965         UErrorCode          status=U_ZERO_ERROR;
    966
    967         UnicodeString       re("abc");
    968         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    969         REGEX_CHECK_STATUS;
    970         UnicodeString data = ".abc..abc...abc..";
    971         //                    012345678901234567
    972
    973         RegexMatcher *matcher = pat->matcher(data, status);
    974         REGEX_CHECK_STATUS;
                // Successive find() calls should walk the three occurrences at 1, 6 and 12, then fail.
    975         REGEX_ASSERT(matcher->find());
    976         REGEX_ASSERT(matcher->start(status) == 1);
    977         REGEX_ASSERT(matcher->find());
    978         REGEX_ASSERT(matcher->start(status) == 6);
    979         REGEX_ASSERT(matcher->find());
    980         REGEX_ASSERT(matcher->start(status) == 12);
    981         REGEX_ASSERT(matcher->find() == FALSE);
    982         REGEX_ASSERT(matcher->find() == FALSE);
    983
    984         matcher->reset();
    985         REGEX_ASSERT(matcher->find());
    986         REGEX_ASSERT(matcher->start(status) == 1);
    987
                // find(pos, status): search begins at the given index.
    988         REGEX_ASSERT(matcher->find(0, status));
    989         REGEX_ASSERT(matcher->start(status) == 1);
    990         REGEX_ASSERT(matcher->find(1, status));
    991         REGEX_ASSERT(matcher->start(status) == 1);
    992         REGEX_ASSERT(matcher->find(2, status));
    993         REGEX_ASSERT(matcher->start(status) == 6);
    994         REGEX_ASSERT(matcher->find(12, status));
    995         REGEX_ASSERT(matcher->start(status) == 12);
    996         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    997         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    998         REGEX_ASSERT(matcher->find(17, status) == FALSE);
    999         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1000
                // data has 17 characters, so 17 is the last accepted start index; -1 and 18 are out of bounds.
   1001         status = U_ZERO_ERROR;
   1002         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1003         status = U_ZERO_ERROR;
   1004         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1005
   1006         REGEX_ASSERT(matcher->groupCount() == 0);
   1007
   1008         delete matcher;
   1009         delete pat;
   1010     }
   1011
   1012
   1013     //
   1014     //  find, with \G in pattern (true if at the end of a previous match).
   1015     //
   1016     {
   1017         int32_t             flags=0;
   1018         UParseError         pe;
   1019         UErrorCode          status=U_ZERO_ERROR;
   1020
   1021         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1022         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1023         REGEX_CHECK_STATUS;
   1024         UnicodeString data = ".abcabc.abc..";
   1025         //                    012345678901234567
   1026
   1027         RegexMatcher *matcher = pat->matcher(data, status);
   1028         REGEX_CHECK_STATUS;
                // First find: group 2 (plain "abc") is expected to participate, group 1 (\G form) is not (-1).
   1029         REGEX_ASSERT(matcher->find());
   1030         REGEX_ASSERT(matcher->start(status) == 0);
   1031         REGEX_ASSERT(matcher->start(1, status) == -1);
   1032         REGEX_ASSERT(matcher->start(2, status) == 1);
   1033
                // Second find: starts where the first match ended, so the \G alternative (group 1) is expected.
   1034         REGEX_ASSERT(matcher->find());
   1035         REGEX_ASSERT(matcher->start(status) == 4);
   1036         REGEX_ASSERT(matcher->start(1, status) == 4);
   1037         REGEX_ASSERT(matcher->start(2, status) == -1);
   1038         REGEX_CHECK_STATUS;
   1039
   1040         delete matcher;
   1041         delete pat;
   1042     }
   1043
   1044     //
   1045     //   find with zero length matches, match position should bump ahead
   1046     //     to prevent loops.
   1047     //
   1048     {
   1049         int32_t                 i;
   1050         UErrorCode          status=U_ZERO_ERROR;
   1051         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1052                                                       //   using an always-true look-ahead.
   1053         REGEX_CHECK_STATUS;
   1054         UnicodeString s("    ");
   1055         m.reset(s);
                // Expect one empty match at each of the positions 0..4 of the four-space string.
   1056         for (i=0; ; i++) {
   1057             if (m.find() == FALSE) {
   1058                 break;
   1059             }
   1060             REGEX_ASSERT(m.start(status) == i);
   1061             REGEX_ASSERT(m.end(status) == i);
   1062         }
   1063         REGEX_ASSERT(i==5);
   1064
   1065         // Check that the bump goes over surrogate pairs OK
   1066         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1067         s = s.unescape();
   1068         m.reset(s);
                // i advances by 2 per iteration: each match is expected at an even index (0,2,4,6,8).
   1069         for (i=0; ; i+=2) {
   1070             if (m.find() == FALSE) {
   1071                 break;
   1072             }
   1073             REGEX_ASSERT(m.start(status) == i);
   1074             REGEX_ASSERT(m.end(status) == i);
   1075         }
   1076         REGEX_ASSERT(i==10);
   1077     }
   1078     {
   1079         // find() loop breaking test.
   1080         //        with pattern of /.?/, should see a series of one char matches, then a single
   1081         //        match of zero length at the end of the input string.
   1082         int32_t                 i;
   1083         UErrorCode          status=U_ZERO_ERROR;
   1084         RegexMatcher        m(".?", 0, status);
   1085         REGEX_CHECK_STATUS;
   1086         UnicodeString s("    ");
   1087         m.reset(s);
   1088         for (i=0; ; i++) {
   1089             if (m.find() == FALSE) {
   1090                 break;
   1091             }
   1092             REGEX_ASSERT(m.start(status) == i);
   1093             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // one-char matches for i<4, empty match at i==4
   1094         }
   1095         REGEX_ASSERT(i==5);
   1096     }
   1097
   1098
   1099     //
   1100     // Matchers with no input string behave as if they had an empty input string.
   1101     //
   1102
   1103     {
   1104         UErrorCode status = U_ZERO_ERROR;
   1105         RegexMatcher  m(".?", 0, status);
   1106         REGEX_CHECK_STATUS;
   1107         REGEX_ASSERT(m.find());
   1108         REGEX_ASSERT(m.start(status) == 0);
   1109         REGEX_ASSERT(m.input() == "");
   1110     }
   1111     {
   1112         UErrorCode status = U_ZERO_ERROR;
                // NOTE(review): p is dereferenced on the next line before status is checked;
                // presumably compile() can return NULL on failure - confirm.
   1113         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1114         RegexMatcher  *m = p->matcher(status);
   1115         REGEX_CHECK_STATUS;
   1116
   1117         REGEX_ASSERT(m->find() == FALSE);
   1118         REGEX_ASSERT(m->input() == "");
   1119         delete m;
   1120         delete p;
   1121     }
   1122
   1123     //
   1124     // Regions
   1125     //
   1126     {
   1127         UErrorCode status = U_ZERO_ERROR;
   1128         UnicodeString testString("This is test data");
   1129         RegexMatcher m(".*", testString,  0, status);
   1130         REGEX_CHECK_STATUS;
                // Expected defaults: region covers the whole input, opaque (non-transparent) and anchoring bounds.
   1131         REGEX_ASSERT(m.regionStart() == 0);
   1132         REGEX_ASSERT(m.regionEnd() == testString.length());
   1133         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1134         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1135
   1136         m.region(2,4, status);
   1137         REGEX_CHECK_STATUS;
   1138         REGEX_ASSERT(m.matches(status));
   1139         REGEX_ASSERT(m.start(status)==2);
   1140         REGEX_ASSERT(m.end(status)==4);
   1141         REGEX_CHECK_STATUS;
   1142
                // reset() is expected to restore the region to the full input.
   1143         m.reset();
   1144         REGEX_ASSERT(m.regionStart() == 0);
   1145         REGEX_ASSERT(m.regionEnd() == testString.length());
   1146
   1147         UnicodeString shorterString("short");
   1148         m.reset(shorterString);
   1149         REGEX_ASSERT(m.regionStart() == 0);
   1150         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1151
                // The use*Bounds() setters and reset() return the matcher itself, and the
                // bounds settings are expected to survive a reset().
   1152         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1153         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1154         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1155         REGEX_ASSERT(&m == &m.reset());
   1156         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1157
   1158         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1159         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1160         REGEX_ASSERT(&m == &m.reset());
   1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1162
   1163         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1164         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1165         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1166         REGEX_ASSERT(&m == &m.reset());
   1167         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1168
   1169         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1170         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1171         REGEX_ASSERT(&m == &m.reset());
   1172         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1173
   1174     }
   1175
   1176     //
   1177     // hitEnd() and requireEnd()
   1178     //
   1179     {
   1180         UErrorCode status = U_ZERO_ERROR;
   1181         UnicodeString testString("aabb");
   1182         RegexMatcher m1(".*", testString,  0, status);
   1183         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1184         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1185         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1186         REGEX_CHECK_STATUS;
   1187
   1188         status = U_ZERO_ERROR;
   1189         RegexMatcher m2("a*", testString, 0, status);
   1190         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1191         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1192         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1193         REGEX_CHECK_STATUS;
   1194
   1195         status = U_ZERO_ERROR;
   1196         RegexMatcher m3(".*$", testString, 0, status);
   1197         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1198         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1199         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1200         REGEX_CHECK_STATUS;
   1201     }
   1202
   1203
   1204     //
   1205     // Compilation error on reset with UChar *
   1206     //   These were a hazard that people were stumbling over with runtime errors.
   1207     //   Changed them to compiler errors by adding private methods that more closely
   1208     //   matched the incorrect use of the functions.
   1209     //
   1210 #if 0
   1211     {
   1212         UErrorCode status = U_ZERO_ERROR;
   1213         UChar ucharString[20];
   1214         RegexMatcher m(".", 0, status);
   1215         m.reset(ucharString);  // should not compile.
   1216
   1217         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1218         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1219
   1220         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1221     }
   1222 #endif
   1223
   1224     //
   1225     //  Time Outs.
   1226     //       Note:  These tests will need to be changed when the regexp engine is
   1227     //              able to detect and cut short the exponential time behavior on
   1228     //              this type of match.
   1229     //
   1230     {
   1231         UErrorCode status = U_ZERO_ERROR;
   1232         //    Enough 'a's in the string to cause the match to time out.
   1233         //       (Each additional 'a' doubles the time)
   1234         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1235         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1236         REGEX_CHECK_STATUS;
   1237         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1238         matcher.setTimeLimit(100, status);
   1239         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1240         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1241         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1242     }
   1243     {
   1244         UErrorCode status = U_ZERO_ERROR;
   1245         //   Few enough 'a's to slip in under the time limit.
   1246         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1247         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1248         REGEX_CHECK_STATUS;
   1249         matcher.setTimeLimit(100, status);
   1250         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1251         REGEX_CHECK_STATUS;
   1252     }
   1253
   1254     //
   1255     //  Stack Limits
   1256     //
   1257     {
   1258         UErrorCode status = U_ZERO_ERROR;
   1259         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1260
   1261         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1262         //   of the '+', and makes the stack frames larger.
   1263         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1264
   1265         // With the default stack, this match should fail to run
   1266         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1267         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1268
   1269         // With unlimited stack, it should run
   1270         status = U_ZERO_ERROR;
   1271         matcher.setStackLimit(0, status);
   1272         REGEX_CHECK_STATUS;
   1273         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1274         REGEX_CHECK_STATUS;
   1275         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1276
   1277         // With a limited stack, the match should fail
   1278         status = U_ZERO_ERROR;
   1279         matcher.setStackLimit(10000, status);
   1280         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1281         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1282         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1283     }
   1284
   1285         // A pattern that doesn't save state should work with
   1286         //   a minimal sized stack
   1287     {
   1288         UErrorCode status = U_ZERO_ERROR;
   1289         UnicodeString testString = "abc";
   1290         RegexMatcher matcher("abc", testString, 0, status);
   1291         REGEX_CHECK_STATUS;
   1292         matcher.setStackLimit(30, status);
   1293         REGEX_CHECK_STATUS;
   1294         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1295         REGEX_CHECK_STATUS;
   1296         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1297
   1298         // Negative stack sizes should fail
                //   and must leave the previously set limit (1000) in place.
   1299         status = U_ZERO_ERROR;
   1300         matcher.setStackLimit(1000, status);
   1301         REGEX_CHECK_STATUS;
   1302         matcher.setStackLimit(-1, status);
   1303         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1304         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1305     }
   1306
   1307
   1308 }
   1309 
   1310 
   1311 
   1312 
   1313 
   1314 
   1315 //---------------------------------------------------------------------------
   1316 //
   1317 //      API_Replace        API test for class RegexMatcher, testing the
   1318 //                         Replace family of functions.
   1319 //
   1320 //---------------------------------------------------------------------------
   1321 void RegexTest::API_Replace() {
            // Exercises replaceFirst()/replaceAll() on a shared "abc" matcher, capture-group
            // substitution on a second matcher, escapes in replacement text, and the
            // appendReplacement()/appendTail() pair (Bug 4057).
            // NOTE(review): the four deletes at the end are skipped if REGEX_CHECK_STATUS
            // returns early on failure (macro defined elsewhere - confirm).
   1322     //
   1323     //  Replace
   1324     //
   1325     int32_t             flags=0;
   1326     UParseError         pe;
   1327     UErrorCode          status=U_ZERO_ERROR;
   1328
   1329     UnicodeString       re("abc");
   1330     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1331     REGEX_CHECK_STATUS;
   1332     UnicodeString data = ".abc..abc...abc..";
   1333     //                    012345678901234567
            // NOTE(review): status is not checked between creating the matcher and first using it below.
   1334     RegexMatcher *matcher = pat->matcher(data, status);
   1335
   1336     //
   1337     //  Plain vanilla matches.
   1338     //
   1339     UnicodeString  dest;
   1340     dest = matcher->replaceFirst("yz", status);
   1341     REGEX_CHECK_STATUS;
   1342     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1343
   1344     dest = matcher->replaceAll("yz", status);
   1345     REGEX_CHECK_STATUS;
   1346     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1347
   1348     //
   1349     //  Plain vanilla non-matches.
   1350     //
            // With no match, the result is expected to equal the input unchanged.
   1351     UnicodeString d2 = ".abx..abx...abx..";
   1352     matcher->reset(d2);
   1353     dest = matcher->replaceFirst("yz", status);
   1354     REGEX_CHECK_STATUS;
   1355     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1356
   1357     dest = matcher->replaceAll("yz", status);
   1358     REGEX_CHECK_STATUS;
   1359     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1360
   1361     //
   1362     // Empty source string
   1363     //
   1364     UnicodeString d3 = "";
   1365     matcher->reset(d3);
   1366     dest = matcher->replaceFirst("yz", status);
   1367     REGEX_CHECK_STATUS;
   1368     REGEX_ASSERT(dest == "");
   1369
   1370     dest = matcher->replaceAll("yz", status);
   1371     REGEX_CHECK_STATUS;
   1372     REGEX_ASSERT(dest == "");
   1373
   1374     //
   1375     // Empty substitution string
   1376     //
   1377     matcher->reset(data);              // ".abc..abc...abc.."
   1378     dest = matcher->replaceFirst("", status);
   1379     REGEX_CHECK_STATUS;
   1380     REGEX_ASSERT(dest == "...abc...abc..");
   1381
   1382     dest = matcher->replaceAll("", status);
   1383     REGEX_CHECK_STATUS;
   1384     REGEX_ASSERT(dest == "........");
   1385
   1386     //
   1387     // match whole string
   1388     //
   1389     UnicodeString d4 = "abc";
   1390     matcher->reset(d4);
   1391     dest = matcher->replaceFirst("xyz", status);
   1392     REGEX_CHECK_STATUS;
   1393     REGEX_ASSERT(dest == "xyz");
   1394
   1395     dest = matcher->replaceAll("xyz", status);
   1396     REGEX_CHECK_STATUS;
   1397     REGEX_ASSERT(dest == "xyz");
   1398
   1399     //
   1400     // Capture Group, simple case
   1401     //
   1402     UnicodeString       re2("a(..)");
   1403     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1404     REGEX_CHECK_STATUS;
   1405     UnicodeString d5 = "abcdefg";
   1406     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1407     REGEX_CHECK_STATUS;
            // "a(..)" matches "abc" with group 1 == "bc"; "$1" in the replacement refers to that group.
   1408     dest = matcher2->replaceFirst("$1$1", status);
   1409     REGEX_CHECK_STATUS;
   1410     REGEX_ASSERT(dest == "bcbcdefg");
   1411
            // A backslash-escaped "$" is expected to come through as a literal dollar sign.
   1412     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1413     REGEX_CHECK_STATUS;
   1414     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1415
            // A "$" that is not followed by a group number is expected to be copied literally.
   1416     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1417     REGEX_CHECK_STATUS;
   1418     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1419
            // After unescape() the "$" is followed by U+1D7CF; the expected result below
            // shows it being treated as a reference to group 1.
   1420     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1421     replacement = replacement.unescape();
   1422     dest = matcher2->replaceFirst(replacement, status);
   1423     REGEX_CHECK_STATUS;
   1424     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1425
            // The pattern has only one group, so "$5" must be rejected.
   1426     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1427
   1428
   1429     //
   1430     // Replacement String with \u hex escapes
   1431     //
   1432     {
   1433         UnicodeString  src = "abc 1 abc 2 abc 3";
   1434         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1435         matcher->reset(src);
   1436         UnicodeString  result = matcher->replaceAll(substitute, status);
   1437         REGEX_CHECK_STATUS;
   1438         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1439     }
   1440     {
   1441         UnicodeString  src = "abc !";
   1442         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1443         matcher->reset(src);
   1444         UnicodeString  result = matcher->replaceAll(substitute, status);
   1445         REGEX_CHECK_STATUS;
                // Build the expected text with the code point U+10000 appended directly.
   1446         UnicodeString expected = UnicodeString("--");
   1447         expected.append((UChar32)0x10000);
   1448         expected.append("-- !");
   1449         REGEX_ASSERT(result == expected);
   1450     }
   1451     // TODO:  need more thorough testing of capture substitutions.
   1452
   1453     // Bug 4057
   1454     //
   1455     {
   1456         status = U_ZERO_ERROR;
   1457         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1458         RegexMatcher m("ss(.*?)ee", 0, status);
   1459         REGEX_CHECK_STATUS;
   1460         UnicodeString result;
   1461
   1462         // Multiple finds do NOT bump up the previous appendReplacement position.
   1463         m.reset(s);
   1464         m.find();
   1465         m.find();
   1466         m.appendReplacement(result, "ooh", status);
   1467         REGEX_CHECK_STATUS;
   1468         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1469
   1470         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1471         status = U_ZERO_ERROR;
   1472         result.truncate(0);
   1473         m.reset(10, status);
   1474         m.find();
   1475         m.find();
   1476         m.appendReplacement(result, "ooh", status);
   1477         REGEX_CHECK_STATUS;
   1478         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1479
   1480         // find() at interior of string, appendReplacement still starts at beginning.
   1481         status = U_ZERO_ERROR;
   1482         result.truncate(0);
   1483         m.reset();
   1484         m.find(10, status);
   1485         m.find();
   1486         m.appendReplacement(result, "ooh", status);
   1487         REGEX_CHECK_STATUS;
   1488         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1489
                // appendTail() adds the input text that follows the last match.
   1490         m.appendTail(result);
   1491         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1492
   1493     }
   1494
   1495     delete matcher2;
   1496     delete pat2;
   1497     delete matcher;
   1498     delete pat;
   1499 }
   1500 
   1501 
   1502 //---------------------------------------------------------------------------
   1503 //
   1504 //      API_Pattern       Test that the API for class RegexPattern is
   1505 //                        present and nominally working.
   1506 //
   1507 //---------------------------------------------------------------------------
   1508 void RegexTest::API_Pattern() {
   1509     RegexPattern        pata;    // Test default constructor to not crash.
   1510     RegexPattern        patb;
   1511 
   1512     REGEX_ASSERT(pata == patb);
   1513     REGEX_ASSERT(pata == pata);
   1514 
   1515     UnicodeString re1("abc[a-l][m-z]");
   1516     UnicodeString re2("def");
   1517     UErrorCode    status = U_ZERO_ERROR;
   1518     UParseError   pe;
   1519 
   1520     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1521     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1522     REGEX_CHECK_STATUS;
   1523     REGEX_ASSERT(*pat1 == *pat1);
   1524     REGEX_ASSERT(*pat1 != pata);
   1525 
   1526     // Assign
   1527     patb = *pat1;
   1528     REGEX_ASSERT(patb == *pat1);
   1529 
   1530     // Copy Construct
   1531     RegexPattern patc(*pat1);
   1532     REGEX_ASSERT(patc == *pat1);
   1533     REGEX_ASSERT(patb == patc);
   1534     REGEX_ASSERT(pat1 != pat2);
   1535     patb = *pat2;
   1536     REGEX_ASSERT(patb != patc);
   1537     REGEX_ASSERT(patb == *pat2);
   1538 
   1539     // Compile with no flags.
   1540     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1541     REGEX_ASSERT(*pat1a == *pat1);
   1542 
   1543     REGEX_ASSERT(pat1a->flags() == 0);
   1544 
   1545     // Compile with different flags should be not equal
   1546     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1547     REGEX_CHECK_STATUS;
   1548 
   1549     REGEX_ASSERT(*pat1b != *pat1a);
   1550     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1551     REGEX_ASSERT(pat1a->flags() == 0);
   1552     delete pat1b;
   1553 
   1554     // clone
   1555     RegexPattern *pat1c = pat1->clone();
   1556     REGEX_ASSERT(*pat1c == *pat1);
   1557     REGEX_ASSERT(*pat1c != *pat2);
   1558 
   1559     delete pat1c;
   1560     delete pat1a;
   1561     delete pat1;
   1562     delete pat2;
   1563 
   1564 
   1565     //
   1566     //   Verify that a matcher created from a cloned pattern works.
   1567     //     (Jitterbug 3423)
   1568     //
   1569     {
   1570         UErrorCode     status     = U_ZERO_ERROR;
   1571         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1572         RegexPattern  *pClone     = pSource->clone();
   1573         delete         pSource;
   1574         RegexMatcher  *mFromClone = pClone->matcher(status);
   1575         REGEX_CHECK_STATUS;
   1576         UnicodeString s = "Hello World";
   1577         mFromClone->reset(s);
   1578         REGEX_ASSERT(mFromClone->find() == TRUE);
   1579         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1580         REGEX_ASSERT(mFromClone->find() == TRUE);
   1581         REGEX_ASSERT(mFromClone->group(status) == "World");
   1582         REGEX_ASSERT(mFromClone->find() == FALSE);
   1583         delete mFromClone;
   1584         delete pClone;
   1585     }
   1586 
   1587     //
   1588     //   matches convenience API
   1589     //
   1590     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1591     REGEX_CHECK_STATUS;
   1592     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1593     REGEX_CHECK_STATUS;
   1594     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1595     REGEX_CHECK_STATUS;
   1596     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1597     REGEX_CHECK_STATUS;
   1598     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1599     REGEX_CHECK_STATUS;
   1600     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1601     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1602     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1603 
   1604 
   1605     //
   1606     // Split()
   1607     //
   1608     status = U_ZERO_ERROR;
   1609     pat1 = RegexPattern::compile(" +",  pe, status);
   1610     REGEX_CHECK_STATUS;
   1611     UnicodeString  fields[10];
   1612 
   1613     int32_t n;
   1614     n = pat1->split("Now is the time", fields, 10, status);
   1615     REGEX_CHECK_STATUS;
   1616     REGEX_ASSERT(n==4);
   1617     REGEX_ASSERT(fields[0]=="Now");
   1618     REGEX_ASSERT(fields[1]=="is");
   1619     REGEX_ASSERT(fields[2]=="the");
   1620     REGEX_ASSERT(fields[3]=="time");
   1621     REGEX_ASSERT(fields[4]=="");
   1622 
   1623     n = pat1->split("Now is the time", fields, 2, status);
   1624     REGEX_CHECK_STATUS;
   1625     REGEX_ASSERT(n==2);
   1626     REGEX_ASSERT(fields[0]=="Now");
   1627     REGEX_ASSERT(fields[1]=="is the time");
   1628     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1629 
   1630     fields[1] = "*";
   1631     status = U_ZERO_ERROR;
   1632     n = pat1->split("Now is the time", fields, 1, status);
   1633     REGEX_CHECK_STATUS;
   1634     REGEX_ASSERT(n==1);
   1635     REGEX_ASSERT(fields[0]=="Now is the time");
   1636     REGEX_ASSERT(fields[1]=="*");
   1637     status = U_ZERO_ERROR;
   1638 
   1639     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1640     REGEX_CHECK_STATUS;
   1641     REGEX_ASSERT(n==6);
   1642     REGEX_ASSERT(fields[0]=="");
   1643     REGEX_ASSERT(fields[1]=="Now");
   1644     REGEX_ASSERT(fields[2]=="is");
   1645     REGEX_ASSERT(fields[3]=="the");
   1646     REGEX_ASSERT(fields[4]=="time");
   1647     REGEX_ASSERT(fields[5]=="");
   1648 
   1649     n = pat1->split("     ", fields, 10, status);
   1650     REGEX_CHECK_STATUS;
   1651     REGEX_ASSERT(n==2);
   1652     REGEX_ASSERT(fields[0]=="");
   1653     REGEX_ASSERT(fields[1]=="");
   1654 
   1655     fields[0] = "foo";
   1656     n = pat1->split("", fields, 10, status);
   1657     REGEX_CHECK_STATUS;
   1658     REGEX_ASSERT(n==0);
   1659     REGEX_ASSERT(fields[0]=="foo");
   1660 
   1661     delete pat1;
   1662 
   1663     //  split, with a pattern with (capture)
   1664     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1665     REGEX_CHECK_STATUS;
   1666 
   1667     status = U_ZERO_ERROR;
   1668     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1669     REGEX_CHECK_STATUS;
   1670     REGEX_ASSERT(n==7);
   1671     REGEX_ASSERT(fields[0]=="");
   1672     REGEX_ASSERT(fields[1]=="a");
   1673     REGEX_ASSERT(fields[2]=="Now is ");
   1674     REGEX_ASSERT(fields[3]=="b");
   1675     REGEX_ASSERT(fields[4]=="the time");
   1676     REGEX_ASSERT(fields[5]=="c");
   1677     REGEX_ASSERT(fields[6]=="");
   1678     REGEX_ASSERT(status==U_ZERO_ERROR);
   1679 
   1680     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1681     REGEX_CHECK_STATUS;
   1682     REGEX_ASSERT(n==7);
   1683     REGEX_ASSERT(fields[0]=="  ");
   1684     REGEX_ASSERT(fields[1]=="a");
   1685     REGEX_ASSERT(fields[2]=="Now is ");
   1686     REGEX_ASSERT(fields[3]=="b");
   1687     REGEX_ASSERT(fields[4]=="the time");
   1688     REGEX_ASSERT(fields[5]=="c");
   1689     REGEX_ASSERT(fields[6]=="");
   1690 
   1691     status = U_ZERO_ERROR;
   1692     fields[6] = "foo";
   1693     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1694     REGEX_CHECK_STATUS;
   1695     REGEX_ASSERT(n==6);
   1696     REGEX_ASSERT(fields[0]=="  ");
   1697     REGEX_ASSERT(fields[1]=="a");
   1698     REGEX_ASSERT(fields[2]=="Now is ");
   1699     REGEX_ASSERT(fields[3]=="b");
   1700     REGEX_ASSERT(fields[4]=="the time");
   1701     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1702     REGEX_ASSERT(fields[6]=="foo");
   1703 
   1704     status = U_ZERO_ERROR;
   1705     fields[5] = "foo";
   1706     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1707     REGEX_CHECK_STATUS;
   1708     REGEX_ASSERT(n==5);
   1709     REGEX_ASSERT(fields[0]=="  ");
   1710     REGEX_ASSERT(fields[1]=="a");
   1711     REGEX_ASSERT(fields[2]=="Now is ");
   1712     REGEX_ASSERT(fields[3]=="b");
   1713     REGEX_ASSERT(fields[4]=="the time<c>");
   1714     REGEX_ASSERT(fields[5]=="foo");
   1715 
   1716     status = U_ZERO_ERROR;
   1717     fields[5] = "foo";
   1718     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1719     REGEX_CHECK_STATUS;
   1720     REGEX_ASSERT(n==5);
   1721     REGEX_ASSERT(fields[0]=="  ");
   1722     REGEX_ASSERT(fields[1]=="a");
   1723     REGEX_ASSERT(fields[2]=="Now is ");
   1724     REGEX_ASSERT(fields[3]=="b");
   1725     REGEX_ASSERT(fields[4]=="the time");
   1726     REGEX_ASSERT(fields[5]=="foo");
   1727 
   1728     status = U_ZERO_ERROR;
   1729     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1730     REGEX_CHECK_STATUS;
   1731     REGEX_ASSERT(n==4);
   1732     REGEX_ASSERT(fields[0]=="  ");
   1733     REGEX_ASSERT(fields[1]=="a");
   1734     REGEX_ASSERT(fields[2]=="Now is ");
   1735     REGEX_ASSERT(fields[3]=="the time<c>");
   1736     status = U_ZERO_ERROR;
   1737     delete pat1;
   1738 
   1739     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1740     REGEX_CHECK_STATUS;
   1741     n = pat1->split("1-10,20", fields, 10, status);
   1742     REGEX_CHECK_STATUS;
   1743     REGEX_ASSERT(n==5);
   1744     REGEX_ASSERT(fields[0]=="1");
   1745     REGEX_ASSERT(fields[1]=="-");
   1746     REGEX_ASSERT(fields[2]=="10");
   1747     REGEX_ASSERT(fields[3]==",");
   1748     REGEX_ASSERT(fields[4]=="20");
   1749     delete pat1;
   1750 
   1751     // Test split of string with empty trailing fields
   1752     pat1 = RegexPattern::compile(",", pe, status);
   1753     REGEX_CHECK_STATUS;
   1754     n = pat1->split("a,b,c,", fields, 10, status);
   1755     REGEX_CHECK_STATUS;
   1756     REGEX_ASSERT(n==4);
   1757     REGEX_ASSERT(fields[0]=="a");
   1758     REGEX_ASSERT(fields[1]=="b");
   1759     REGEX_ASSERT(fields[2]=="c");
   1760     REGEX_ASSERT(fields[3]=="");
   1761 
   1762     n = pat1->split("a,,,", fields, 10, status);
   1763     REGEX_CHECK_STATUS;
   1764     REGEX_ASSERT(n==4);
   1765     REGEX_ASSERT(fields[0]=="a");
   1766     REGEX_ASSERT(fields[1]=="");
   1767     REGEX_ASSERT(fields[2]=="");
   1768     REGEX_ASSERT(fields[3]=="");
   1769     delete pat1;
   1770 
   1771     // Split Separator with zero length match.
   1772     pat1 = RegexPattern::compile(":?", pe, status);
   1773     REGEX_CHECK_STATUS;
   1774     n = pat1->split("abc", fields, 10, status);
   1775     REGEX_CHECK_STATUS;
   1776     REGEX_ASSERT(n==5);
   1777     REGEX_ASSERT(fields[0]=="");
   1778     REGEX_ASSERT(fields[1]=="a");
   1779     REGEX_ASSERT(fields[2]=="b");
   1780     REGEX_ASSERT(fields[3]=="c");
   1781     REGEX_ASSERT(fields[4]=="");
   1782 
   1783     delete pat1;
   1784 
   1785     //
   1786     // RegexPattern::pattern()
   1787     //
   1788     pat1 = new RegexPattern();
   1789     REGEX_ASSERT(pat1->pattern() == "");
   1790     delete pat1;
   1791 
   1792     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1793     REGEX_CHECK_STATUS;
   1794     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1795     delete pat1;
   1796 
   1797 
   1798     //
   1799     // classID functions
   1800     //
   1801     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1802     REGEX_CHECK_STATUS;
   1803     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1804     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1805     UnicodeString Hello("Hello, world.");
   1806     RegexMatcher *m = pat1->matcher(Hello, status);
   1807     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1808     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1809     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1810     delete m;
   1811     delete pat1;
   1812 
   1813 }
   1814 
   1815 //---------------------------------------------------------------------------
   1816 //
   1817 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1818 //                       is present and working, but excluding functions
   1819 //                       implementing replace operations.
   1820 //
   1821 //---------------------------------------------------------------------------
   1822 void RegexTest::API_Match_UTF8() {
   1823     UParseError         pe;
   1824     UErrorCode          status=U_ZERO_ERROR;
   1825     int32_t             flags = 0;
   1826 
   1827     //
   1828     // Debug - slide failing test cases early
   1829     //
   1830 #if 0
   1831     {
   1832     }
   1833     return;
   1834 #endif
   1835 
   1836     //
   1837     // Simple pattern compilation
   1838     //
   1839     {
   1840         UText               re = UTEXT_INITIALIZER;
   1841         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1842         REGEX_VERBOSE_TEXT(&re);
   1843         RegexPattern        *pat2;
   1844         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1845         REGEX_CHECK_STATUS;
   1846 
   1847         UText input1 = UTEXT_INITIALIZER;
   1848         UText input2 = UTEXT_INITIALIZER;
   1849         UText empty  = UTEXT_INITIALIZER;
   1850         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1851         REGEX_VERBOSE_TEXT(&input1);
   1852         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1853         REGEX_VERBOSE_TEXT(&input2);
   1854         utext_openUChars(&empty, NULL, 0, &status);
   1855 
   1856         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1857         int32_t input2Len = strlen("not abc");
   1858 
   1859 
   1860         //
   1861         // Matcher creation and reset.
   1862         //
   1863         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1864         REGEX_CHECK_STATUS;
   1865         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1866         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1867         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1868         m1->reset(&input2);
   1869         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1870         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1871         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1872         m1->reset(&input1);
   1873         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1874         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1875         m1->reset(&empty);
   1876         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1877         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1878 
   1879         //
   1880         //  reset(pos, status)
   1881         //
   1882         m1->reset(&input1);
   1883         m1->reset(4, status);
   1884         REGEX_CHECK_STATUS;
   1885         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1886         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1887 
   1888         m1->reset(-1, status);
   1889         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1890         status = U_ZERO_ERROR;
   1891 
   1892         m1->reset(0, status);
   1893         REGEX_CHECK_STATUS;
   1894         status = U_ZERO_ERROR;
   1895 
   1896         m1->reset(input1Len-1, status);
   1897         REGEX_CHECK_STATUS;
   1898         status = U_ZERO_ERROR;
   1899 
   1900         m1->reset(input1Len, status);
   1901         REGEX_CHECK_STATUS;
   1902         status = U_ZERO_ERROR;
   1903 
   1904         m1->reset(input1Len+1, status);
   1905         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1906         status = U_ZERO_ERROR;
   1907 
   1908         //
   1909         // match(pos, status)
   1910         //
   1911         m1->reset(&input2);
   1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1913         m1->reset();
   1914         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1915         m1->reset();
   1916         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1917         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1918         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1920 
   1921         // Match() at end of string should fail, but should not
   1922         //  be an error.
   1923         status = U_ZERO_ERROR;
   1924         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1925         REGEX_CHECK_STATUS;
   1926 
   1927         // Match beyond end of string should fail with an error.
   1928         status = U_ZERO_ERROR;
   1929         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1930         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1931 
   1932         // Successful match at end of string.
   1933         {
   1934             status = U_ZERO_ERROR;
   1935             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1936             REGEX_CHECK_STATUS;
   1937             m.reset(&input1);
   1938             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1939             REGEX_CHECK_STATUS;
   1940             m.reset(&empty);
   1941             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1942             REGEX_CHECK_STATUS;
   1943         }
   1944 
   1945 
   1946         //
   1947         // lookingAt(pos, status)
   1948         //
   1949         status = U_ZERO_ERROR;
   1950         m1->reset(&input2);  // "not abc"
   1951         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1952         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1953         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1954         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1955         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1957         status = U_ZERO_ERROR;
   1958         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1959         REGEX_CHECK_STATUS;
   1960         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1961         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1962 
   1963         delete m1;
   1964         delete pat2;
   1965 
   1966         utext_close(&re);
   1967         utext_close(&input1);
   1968         utext_close(&input2);
   1969         utext_close(&empty);
   1970     }
   1971 
   1972 
   1973     //
   1974     // Capture Group.
   1975     //     RegexMatcher::start();
   1976     //     RegexMatcher::end();
   1977     //     RegexMatcher::groupCount();
   1978     //
   1979     {
   1980         int32_t             flags=0;
   1981         UParseError         pe;
   1982         UErrorCode          status=U_ZERO_ERROR;
   1983         UText               re=UTEXT_INITIALIZER;
   1984         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   1985         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   1986 
   1987         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1988         REGEX_CHECK_STATUS;
   1989 
   1990         UText input = UTEXT_INITIALIZER;
   1991         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1992         utext_openUTF8(&input, str_0123456789, -1, &status);
   1993 
   1994         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   1995         REGEX_CHECK_STATUS;
   1996         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   1997         static const int32_t matchStarts[] = {0,  2, 4, 8};
   1998         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   1999         int32_t i;
   2000         for (i=0; i<4; i++) {
   2001             int32_t actualStart = matcher->start(i, status);
   2002             REGEX_CHECK_STATUS;
   2003             if (actualStart != matchStarts[i]) {
   2004                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2005                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2006             }
   2007             int32_t actualEnd = matcher->end(i, status);
   2008             REGEX_CHECK_STATUS;
   2009             if (actualEnd != matchEnds[i]) {
   2010                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2011                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2012             }
   2013         }
   2014 
   2015         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2016         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2017 
   2018         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2019         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2020         matcher->reset();
   2021         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2022 
   2023         matcher->lookingAt(status);
   2024 
   2025         UnicodeString dest;
   2026         UText destText = UTEXT_INITIALIZER;
   2027         utext_openUnicodeString(&destText, &dest, &status);
   2028         UText *result;
   2029         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2030         //	Test shallow-clone API
   2031         int64_t   group_len;
   2032         result = matcher->group((UText *)NULL, group_len, status);
   2033         REGEX_CHECK_STATUS;
   2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2035         utext_close(result);
   2036         result = matcher->group(0, &destText, group_len, status);
   2037         REGEX_CHECK_STATUS;
   2038         REGEX_ASSERT(result == &destText);
   2039         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2040         //  destText is now immutable, reopen it
   2041         utext_close(&destText);
   2042         utext_openUnicodeString(&destText, &dest, &status);
   2043 
   2044         result = matcher->group(0, NULL, status);
   2045         REGEX_CHECK_STATUS;
   2046         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2047         utext_close(result);
   2048         result = matcher->group(0, &destText, status);
   2049         REGEX_CHECK_STATUS;
   2050         REGEX_ASSERT(result == &destText);
   2051         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2052 
   2053         result = matcher->group(1, NULL, status);
   2054         REGEX_CHECK_STATUS;
   2055         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
   2056         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2057         utext_close(result);
   2058         result = matcher->group(1, &destText, status);
   2059         REGEX_CHECK_STATUS;
   2060         REGEX_ASSERT(result == &destText);
   2061         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
   2062 
   2063         result = matcher->group(2, NULL, status);
   2064         REGEX_CHECK_STATUS;
   2065         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
   2066         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2067         utext_close(result);
   2068         result = matcher->group(2, &destText, status);
   2069         REGEX_CHECK_STATUS;
   2070         REGEX_ASSERT(result == &destText);
   2071         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
   2072 
   2073         result = matcher->group(3, NULL, status);
   2074         REGEX_CHECK_STATUS;
   2075         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
   2076         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2077         utext_close(result);
   2078         result = matcher->group(3, &destText, status);
   2079         REGEX_CHECK_STATUS;
   2080         REGEX_ASSERT(result == &destText);
   2081         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
   2082 
   2083         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2084         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2085         matcher->reset();
   2086         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2087 
   2088         delete matcher;
   2089         delete pat;
   2090 
   2091         utext_close(&destText);
   2092         utext_close(&input);
   2093         utext_close(&re);
   2094     }
   2095 
   2096     //
   2097     //  find
   2098     //
   2099     {
   2100         int32_t             flags=0;
   2101         UParseError         pe;
   2102         UErrorCode          status=U_ZERO_ERROR;
   2103         UText               re=UTEXT_INITIALIZER;
   2104         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2105         utext_openUTF8(&re, str_abc, -1, &status);
   2106 
   2107         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2108         REGEX_CHECK_STATUS;
   2109         UText input = UTEXT_INITIALIZER;
   2110         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2111         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2112         //                      012345678901234567
   2113 
   2114         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2115         REGEX_CHECK_STATUS;
   2116         REGEX_ASSERT(matcher->find());
   2117         REGEX_ASSERT(matcher->start(status) == 1);
   2118         REGEX_ASSERT(matcher->find());
   2119         REGEX_ASSERT(matcher->start(status) == 6);
   2120         REGEX_ASSERT(matcher->find());
   2121         REGEX_ASSERT(matcher->start(status) == 12);
   2122         REGEX_ASSERT(matcher->find() == FALSE);
   2123         REGEX_ASSERT(matcher->find() == FALSE);
   2124 
   2125         matcher->reset();
   2126         REGEX_ASSERT(matcher->find());
   2127         REGEX_ASSERT(matcher->start(status) == 1);
   2128 
   2129         REGEX_ASSERT(matcher->find(0, status));
   2130         REGEX_ASSERT(matcher->start(status) == 1);
   2131         REGEX_ASSERT(matcher->find(1, status));
   2132         REGEX_ASSERT(matcher->start(status) == 1);
   2133         REGEX_ASSERT(matcher->find(2, status));
   2134         REGEX_ASSERT(matcher->start(status) == 6);
   2135         REGEX_ASSERT(matcher->find(12, status));
   2136         REGEX_ASSERT(matcher->start(status) == 12);
   2137         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2138         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2139         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2140         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2141 
   2142         status = U_ZERO_ERROR;
   2143         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2144         status = U_ZERO_ERROR;
   2145         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2146 
   2147         REGEX_ASSERT(matcher->groupCount() == 0);
   2148 
   2149         delete matcher;
   2150         delete pat;
   2151 
   2152         utext_close(&input);
   2153         utext_close(&re);
   2154     }
   2155 
   2156 
   2157     //
   2158     //  find, with \G in pattern (true if at the end of a previous match).
   2159     //
   2160     {
   2161         int32_t             flags=0;
   2162         UParseError         pe;
   2163         UErrorCode          status=U_ZERO_ERROR;
   2164         UText               re=UTEXT_INITIALIZER;
   2165         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2166         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2167 
   2168         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2169 
   2170         REGEX_CHECK_STATUS;
   2171         UText input = UTEXT_INITIALIZER;
   2172         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2173         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2174         //                      012345678901234567
   2175 
   2176         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2177         REGEX_CHECK_STATUS;
   2178         REGEX_ASSERT(matcher->find());
   2179         REGEX_ASSERT(matcher->start(status) == 0);
   2180         REGEX_ASSERT(matcher->start(1, status) == -1);
   2181         REGEX_ASSERT(matcher->start(2, status) == 1);
   2182 
   2183         REGEX_ASSERT(matcher->find());
   2184         REGEX_ASSERT(matcher->start(status) == 4);
   2185         REGEX_ASSERT(matcher->start(1, status) == 4);
   2186         REGEX_ASSERT(matcher->start(2, status) == -1);
   2187         REGEX_CHECK_STATUS;
   2188 
   2189         delete matcher;
   2190         delete pat;
   2191 
   2192         utext_close(&input);
   2193         utext_close(&re);
   2194     }
   2195 
   2196     //
   2197     //   find with zero length matches, match position should bump ahead
   2198     //     to prevent loops.
   2199     //
   2200     {
   2201         int32_t                 i;
   2202         UErrorCode          status=U_ZERO_ERROR;
   2203         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2204                                                       //   using an always-true look-ahead.
   2205         REGEX_CHECK_STATUS;
   2206         UText s = UTEXT_INITIALIZER;
   2207         utext_openUTF8(&s, "    ", -1, &status);
   2208         m.reset(&s);
   2209         for (i=0; ; i++) {
   2210             if (m.find() == FALSE) {
   2211                 break;
   2212             }
   2213             REGEX_ASSERT(m.start(status) == i);
   2214             REGEX_ASSERT(m.end(status) == i);
   2215         }
   2216         REGEX_ASSERT(i==5);
   2217 
   2218         // Check that the bump goes over characters outside the BMP OK
   2219         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2220         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2221         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2222         m.reset(&s);
   2223         for (i=0; ; i+=4) {
   2224             if (m.find() == FALSE) {
   2225                 break;
   2226             }
   2227             REGEX_ASSERT(m.start(status) == i);
   2228             REGEX_ASSERT(m.end(status) == i);
   2229         }
   2230         REGEX_ASSERT(i==20);
   2231 
   2232         utext_close(&s);
   2233     }
   2234     {
   2235         // find() loop breaking test.
   2236         //        with pattern of /.?/, should see a series of one char matches, then a single
   2237         //        match of zero length at the end of the input string.
   2238         int32_t                 i;
   2239         UErrorCode          status=U_ZERO_ERROR;
   2240         RegexMatcher        m(".?", 0, status);
   2241         REGEX_CHECK_STATUS;
   2242         UText s = UTEXT_INITIALIZER;
   2243         utext_openUTF8(&s, "    ", -1, &status);
   2244         m.reset(&s);
   2245         for (i=0; ; i++) {
   2246             if (m.find() == FALSE) {
   2247                 break;
   2248             }
   2249             REGEX_ASSERT(m.start(status) == i);
   2250             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2251         }
   2252         REGEX_ASSERT(i==5);
   2253 
   2254         utext_close(&s);
   2255     }
   2256 
   2257 
   2258     //
   2259     // Matchers with no input string behave as if they had an empty input string.
   2260     //
   2261 
   2262     {
   2263         UErrorCode status = U_ZERO_ERROR;
   2264         RegexMatcher  m(".?", 0, status);
   2265         REGEX_CHECK_STATUS;
   2266         REGEX_ASSERT(m.find());
   2267         REGEX_ASSERT(m.start(status) == 0);
   2268         REGEX_ASSERT(m.input() == "");
   2269     }
   2270     {
   2271         UErrorCode status = U_ZERO_ERROR;
   2272         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2273         RegexMatcher  *m = p->matcher(status);
   2274         REGEX_CHECK_STATUS;
   2275 
   2276         REGEX_ASSERT(m->find() == FALSE);
   2277         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2278         delete m;
   2279         delete p;
   2280     }
   2281 
   2282     //
   2283     // Regions
   2284     //
   2285     {
   2286         UErrorCode status = U_ZERO_ERROR;
   2287         UText testPattern = UTEXT_INITIALIZER;
   2288         UText testText    = UTEXT_INITIALIZER;
   2289         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2290         REGEX_VERBOSE_TEXT(&testPattern);
   2291         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2292         REGEX_VERBOSE_TEXT(&testText);
   2293 
   2294         RegexMatcher m(&testPattern, &testText, 0, status);
   2295         REGEX_CHECK_STATUS;
   2296         REGEX_ASSERT(m.regionStart() == 0);
   2297         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2298         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2299         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2300 
   2301         m.region(2,4, status);
   2302         REGEX_CHECK_STATUS;
   2303         REGEX_ASSERT(m.matches(status));
   2304         REGEX_ASSERT(m.start(status)==2);
   2305         REGEX_ASSERT(m.end(status)==4);
   2306         REGEX_CHECK_STATUS;
   2307 
   2308         m.reset();
   2309         REGEX_ASSERT(m.regionStart() == 0);
   2310         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2311 
   2312         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2313         REGEX_VERBOSE_TEXT(&testText);
   2314         m.reset(&testText);
   2315         REGEX_ASSERT(m.regionStart() == 0);
   2316         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2317 
   2318         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2319         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2320         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2321         REGEX_ASSERT(&m == &m.reset());
   2322         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2323 
   2324         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2325         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2326         REGEX_ASSERT(&m == &m.reset());
   2327         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2328 
   2329         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2330         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2331         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2332         REGEX_ASSERT(&m == &m.reset());
   2333         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2334 
   2335         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2336         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2337         REGEX_ASSERT(&m == &m.reset());
   2338         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2339 
   2340         utext_close(&testText);
   2341         utext_close(&testPattern);
   2342     }
   2343 
   2344     //
   2345     // hitEnd() and requireEnd()
   2346     //
   2347     {
   2348         UErrorCode status = U_ZERO_ERROR;
   2349         UText testPattern = UTEXT_INITIALIZER;
   2350         UText testText    = UTEXT_INITIALIZER;
   2351         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2352         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2353         utext_openUTF8(&testPattern, str_, -1, &status);
   2354         utext_openUTF8(&testText, str_aabb, -1, &status);
   2355 
   2356         RegexMatcher m1(&testPattern, &testText,  0, status);
   2357         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2358         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2359         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2360         REGEX_CHECK_STATUS;
   2361 
   2362         status = U_ZERO_ERROR;
   2363         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2364         utext_openUTF8(&testPattern, str_a, -1, &status);
   2365         RegexMatcher m2(&testPattern, &testText, 0, status);
   2366         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2367         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2368         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2369         REGEX_CHECK_STATUS;
   2370 
   2371         status = U_ZERO_ERROR;
   2372         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2373         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2374         RegexMatcher m3(&testPattern, &testText, 0, status);
   2375         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2376         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2377         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2378         REGEX_CHECK_STATUS;
   2379 
   2380         utext_close(&testText);
   2381         utext_close(&testPattern);
   2382     }
   2383 }
   2384 
   2385 
   2386 //---------------------------------------------------------------------------
   2387 //
   2388 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2389 //                         Replace family of functions.
   2390 //
   2391 //---------------------------------------------------------------------------
   2392 void RegexTest::API_Replace_UTF8() {
   2393     //  Replace
   2394     //  Exercises the UText overloads of RegexMatcher::replaceFirst() and replaceAll() and, in the Bug 4057 section, appendReplacement()/appendTail().
   2395     //  Most cases run twice: once with a NULL dest (the returned UText is closed here) and once with destText as dest (the result is asserted to be &destText).
   2396     int32_t             flags=0;
   2397     UParseError         pe;
   2398     UErrorCode          status=U_ZERO_ERROR;
   2399 
   2400     UText               re=UTEXT_INITIALIZER;
   2401     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2402     REGEX_VERBOSE_TEXT(&re);
   2403     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2404     REGEX_CHECK_STATUS;
   2405 
   2406     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2407     //             012345678901234567
   2408     UText dataText = UTEXT_INITIALIZER;
   2409     utext_openUTF8(&dataText, data, -1, &status);
   2410     REGEX_CHECK_STATUS;
   2411     REGEX_VERBOSE_TEXT(&dataText);
   2412     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2413     // pat and matcher are heap objects; both are deleted at the end of this function.
   2414     //
   2415     //  Plain vanilla matches.
   2416     //
   2417     UnicodeString  dest;
   2418     UText destText = UTEXT_INITIALIZER;
   2419     utext_openUnicodeString(&destText, &dest, &status);
   2420     UText *result;
   2421 
   2422     UText replText = UTEXT_INITIALIZER;
   2423 
   2424     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2425     utext_openUTF8(&replText, str_yz, -1, &status);
   2426     REGEX_VERBOSE_TEXT(&replText);
   2427     result = matcher->replaceFirst(&replText, NULL, status);
   2428     REGEX_CHECK_STATUS;
   2429     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2430     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2431     utext_close(result);
   2432     result = matcher->replaceFirst(&replText, &destText, status);
   2433     REGEX_CHECK_STATUS;
   2434     REGEX_ASSERT(result == &destText);
   2435     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2436 
   2437     result = matcher->replaceAll(&replText, NULL, status);
   2438     REGEX_CHECK_STATUS;
   2439     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2440     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2441     utext_close(result);
   2442     // Replace the entire current contents of destText with nothing before reusing it as a destination.
   2443     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2444     result = matcher->replaceAll(&replText, &destText, status);
   2445     REGEX_CHECK_STATUS;
   2446     REGEX_ASSERT(result == &destText);
   2447     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2448 
   2449     //
   2450     //  Plain vanilla non-matches: "abc" does not occur in the data, so every result is expected to equal the input.
   2451     //
   2452     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2453     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2454     matcher->reset(&dataText);
   2455 
   2456     result = matcher->replaceFirst(&replText, NULL, status);
   2457     REGEX_CHECK_STATUS;
   2458     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2459     utext_close(result);
   2460     result = matcher->replaceFirst(&replText, &destText, status);
   2461     REGEX_CHECK_STATUS;
   2462     REGEX_ASSERT(result == &destText);
   2463     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2464 
   2465     result = matcher->replaceAll(&replText, NULL, status);
   2466     REGEX_CHECK_STATUS;
   2467     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2468     utext_close(result);
   2469     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2470     result = matcher->replaceAll(&replText, &destText, status);
   2471     REGEX_CHECK_STATUS;
   2472     REGEX_ASSERT(result == &destText);
   2473     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2474 
   2475     //
   2476     // Empty source string (opened from a NULL pointer with length 0): every result is expected to be empty.
   2477     //
   2478     utext_openUTF8(&dataText, NULL, 0, &status);
   2479     matcher->reset(&dataText);
   2480 
   2481     result = matcher->replaceFirst(&replText, NULL, status);
   2482     REGEX_CHECK_STATUS;
   2483     REGEX_ASSERT_UTEXT_UTF8("", result);
   2484     utext_close(result);
   2485     result = matcher->replaceFirst(&replText, &destText, status);
   2486     REGEX_CHECK_STATUS;
   2487     REGEX_ASSERT(result == &destText);
   2488     REGEX_ASSERT_UTEXT_UTF8("", result);
   2489 
   2490     result = matcher->replaceAll(&replText, NULL, status);
   2491     REGEX_CHECK_STATUS;
   2492     REGEX_ASSERT_UTEXT_UTF8("", result);
   2493     utext_close(result);
   2494     result = matcher->replaceAll(&replText, &destText, status);
   2495     REGEX_CHECK_STATUS;
   2496     REGEX_ASSERT(result == &destText);
   2497     REGEX_ASSERT_UTEXT_UTF8("", result);
   2498 
   2499     //
   2500     // Empty substitution string: the matched "abc" text is expected to be dropped from the output.
   2501     //
   2502     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2503     matcher->reset(&dataText);
   2504 
   2505     utext_openUTF8(&replText, NULL, 0, &status);
   2506     result = matcher->replaceFirst(&replText, NULL, status);
   2507     REGEX_CHECK_STATUS;
   2508     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2509     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2510     utext_close(result);
   2511     result = matcher->replaceFirst(&replText, &destText, status);
   2512     REGEX_CHECK_STATUS;
   2513     REGEX_ASSERT(result == &destText);
   2514     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2515 
   2516     result = matcher->replaceAll(&replText, NULL, status);
   2517     REGEX_CHECK_STATUS;
   2518     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2519     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2520     utext_close(result);
   2521     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2522     result = matcher->replaceAll(&replText, &destText, status);
   2523     REGEX_CHECK_STATUS;
   2524     REGEX_ASSERT(result == &destText);
   2525     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2526 
   2527     //
   2528     // match whole string: the data is exactly "abc", so each result is expected to be just the replacement text.
   2529     //
   2530     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2531     utext_openUTF8(&dataText, str_abc, -1, &status);
   2532     matcher->reset(&dataText);
   2533 
   2534     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2535     utext_openUTF8(&replText, str_xyz, -1, &status);
   2536     result = matcher->replaceFirst(&replText, NULL, status);
   2537     REGEX_CHECK_STATUS;
   2538     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2539     utext_close(result);
   2540     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2541     result = matcher->replaceFirst(&replText, &destText, status);
   2542     REGEX_CHECK_STATUS;
   2543     REGEX_ASSERT(result == &destText);
   2544     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2545 
   2546     result = matcher->replaceAll(&replText, NULL, status);
   2547     REGEX_CHECK_STATUS;
   2548     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2549     utext_close(result);
   2550     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2551     result = matcher->replaceAll(&replText, &destText, status);
   2552     REGEX_CHECK_STATUS;
   2553     REGEX_ASSERT(result == &destText);
   2554     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2555 
   2556     //
   2557     // Capture Group, simple case: a second pattern a(..) whose group 1 is referenced from the replacement text as $1.
   2558     //
   2559     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2560     utext_openUTF8(&re, str_add, -1, &status);
   2561     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2562     REGEX_CHECK_STATUS;
   2563 
   2564     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2565     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2566     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2567     REGEX_CHECK_STATUS;
   2568 
   2569     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2570     utext_openUTF8(&replText, str_11, -1, &status);
   2571     result = matcher2->replaceFirst(&replText, NULL, status);
   2572     REGEX_CHECK_STATUS;
   2573     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2574     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2575     utext_close(result);
   2576     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2577     result = matcher2->replaceFirst(&replText, &destText, status);
   2578     REGEX_CHECK_STATUS;
   2579     REGEX_ASSERT(result == &destText);
   2580     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2581 
   2582     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2583     utext_openUTF8(&replText, str_v, -1, &status);
   2584     REGEX_VERBOSE_TEXT(&replText);
   2585     result = matcher2->replaceFirst(&replText, NULL, status);
   2586     REGEX_CHECK_STATUS;
   2587     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2588     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2589     utext_close(result);
   2590     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2591     result = matcher2->replaceFirst(&replText, &destText, status);
   2592     REGEX_CHECK_STATUS;
   2593     REGEX_ASSERT(result == &destText);
   2594     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2595     // A '$' that is not followed by a group number is expected to appear unchanged in the output.
   2596     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
   2597     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2598     result = matcher2->replaceFirst(&replText, NULL, status);
   2599     REGEX_CHECK_STATUS;
   2600     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2601     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2602     utext_close(result);
   2603     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2604     result = matcher2->replaceFirst(&replText, &destText, status);
   2605     REGEX_CHECK_STATUS;
   2606     REGEX_ASSERT(result == &destText);
   2607     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2608     // The four 'x' placeholder bytes (indexes 22-25) are overwritten below with the UTF-8 bytes of U+1D7CF; the expected output has "$<U+1D7CF>" replaced by group 1.
   2609     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2610     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2611     //                                 012345678901234567890123456
   2612     supplDigitChars[22] = 0xF0;
   2613     supplDigitChars[23] = 0x9D;
   2614     supplDigitChars[24] = 0x9F;
   2615     supplDigitChars[25] = 0x8F;
   2616     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2617 
   2618     result = matcher2->replaceFirst(&replText, NULL, status);
   2619     REGEX_CHECK_STATUS;
   2620     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2621     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2622     utext_close(result);
   2623     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2624     result = matcher2->replaceFirst(&replText, &destText, status);
   2625     REGEX_CHECK_STATUS;
   2626     REGEX_ASSERT(result == &destText);
   2627     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2628     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... ($5 names a group that pattern a(..) does not define) */
   2629     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2630     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2632     utext_close(result);
   2633     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2634     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2635     REGEX_ASSERT(result == &destText);
   2636 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2637 
   2638     //
   2639     // Replacement String with \u hex escapes
   2640     //
   2641     {
   2642       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2643       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2644         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2645         utext_openUTF8(&replText, str_u0043, -1, &status);
   2646         matcher->reset(&dataText);
   2647 
   2648         result = matcher->replaceAll(&replText, NULL, status);
   2649         REGEX_CHECK_STATUS;
   2650         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2651         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2652         utext_close(result);
   2653         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2654         result = matcher->replaceAll(&replText, &destText, status);
   2655         REGEX_CHECK_STATUS;
   2656         REGEX_ASSERT(result == &destText);
   2657         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2658     }
   2659     {
   2660       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2661         utext_openUTF8(&dataText, str_abc, -1, &status);
   2662         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2663         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2664         matcher->reset(&dataText);
   2665 
   2666         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2667         //                          0123456789
   2668         expected[2] = 0xF0;
   2669         expected[3] = 0x90;
   2670         expected[4] = 0x80;
   2671         expected[5] = 0x80;
   2672 
   2673         result = matcher->replaceAll(&replText, NULL, status);
   2674         REGEX_CHECK_STATUS;
   2675         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2676         utext_close(result);
   2677         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2678         result = matcher->replaceAll(&replText, &destText, status);
   2679         REGEX_CHECK_STATUS;
   2680         REGEX_ASSERT(result == &destText);
   2681         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2682     }
   2683     // TODO:  need more thorough testing of capture substitutions.
   2684 
   2685     // Bug 4057
   2686     //   appendReplacement()/appendTail() output after repeated find() calls, after reset(index), and after find(index).
   2687     {
   2688         status = U_ZERO_ERROR;
   2689 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2690 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2691 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2692         utext_openUTF8(&re, str_ssee, -1, &status);
   2693         utext_openUTF8(&dataText, str_blah, -1, &status);
   2694         utext_openUTF8(&replText, str_ooh, -1, &status);
   2695 
   2696         RegexMatcher m(&re, 0, status);
   2697         REGEX_CHECK_STATUS;
   2698 
   2699         UnicodeString result;
   2700         UText resultText = UTEXT_INITIALIZER;
   2701         utext_openUnicodeString(&resultText, &result, &status);
   2702 
   2703         // Multiple finds do NOT bump up the previous appendReplacement position.
   2704         m.reset(&dataText);
   2705         m.find();
   2706         m.find();
   2707         m.appendReplacement(&resultText, &replText, status);
   2708         REGEX_CHECK_STATUS;
   2709         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2710         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2711 
   2712         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2713         status = U_ZERO_ERROR;
   2714         result.truncate(0);
   2715         utext_openUnicodeString(&resultText, &result, &status);
   2716         m.reset(10, status);
   2717         m.find();
   2718         m.find();
   2719         m.appendReplacement(&resultText, &replText, status);
   2720         REGEX_CHECK_STATUS;
   2721         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2722         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2723 
   2724         // find() at interior of string, appendReplacement still starts at beginning.
   2725         status = U_ZERO_ERROR;
   2726         result.truncate(0);
   2727         utext_openUnicodeString(&resultText, &result, &status);
   2728         m.reset();
   2729         m.find(10, status);
   2730         m.find();
   2731         m.appendReplacement(&resultText, &replText, status);
   2732         REGEX_CHECK_STATUS;
   2733         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2734         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2735 
   2736         m.appendTail(&resultText, status);
   2737         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2738         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2739 
   2740         utext_close(&resultText);
   2741     }
   2742     // Release the heap-allocated matchers and patterns, then close the stack-allocated UTexts opened above.
   2743     delete matcher2;
   2744     delete pat2;
   2745     delete matcher;
   2746     delete pat;
   2747 
   2748     utext_close(&dataText);
   2749     utext_close(&replText);
   2750     utext_close(&destText);
   2751     utext_close(&re);
   2752 }
   2753 
   2754 
   2755 //---------------------------------------------------------------------------
   2756 //
   2757 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2758 //                        present and nominally working.
   2759 //
   2760 //---------------------------------------------------------------------------
   2761 void RegexTest::API_Pattern_UTF8() {
   2762     RegexPattern        pata;    // Test default constructor to not crash.
   2763     RegexPattern        patb;
   2764 
   2765     REGEX_ASSERT(pata == patb);
   2766     REGEX_ASSERT(pata == pata);
   2767 
   2768     UText         re1 = UTEXT_INITIALIZER;
   2769     UText         re2 = UTEXT_INITIALIZER;
   2770     UErrorCode    status = U_ZERO_ERROR;
   2771     UParseError   pe;
   2772 
   2773     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2774     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2775     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2776     utext_openUTF8(&re2, str_def, -1, &status);
   2777 
   2778     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2779     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2780     REGEX_CHECK_STATUS;
   2781     REGEX_ASSERT(*pat1 == *pat1);
   2782     REGEX_ASSERT(*pat1 != pata);
   2783 
   2784     // Assign
   2785     patb = *pat1;
   2786     REGEX_ASSERT(patb == *pat1);
   2787 
   2788     // Copy Construct
   2789     RegexPattern patc(*pat1);
   2790     REGEX_ASSERT(patc == *pat1);
   2791     REGEX_ASSERT(patb == patc);
   2792     REGEX_ASSERT(pat1 != pat2);
   2793     patb = *pat2;
   2794     REGEX_ASSERT(patb != patc);
   2795     REGEX_ASSERT(patb == *pat2);
   2796 
   2797     // Compile with no flags.
   2798     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2799     REGEX_ASSERT(*pat1a == *pat1);
   2800 
   2801     REGEX_ASSERT(pat1a->flags() == 0);
   2802 
   2803     // Compile with different flags should be not equal
   2804     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2805     REGEX_CHECK_STATUS;
   2806 
   2807     REGEX_ASSERT(*pat1b != *pat1a);
   2808     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2809     REGEX_ASSERT(pat1a->flags() == 0);
   2810     delete pat1b;
   2811 
   2812     // clone
   2813     RegexPattern *pat1c = pat1->clone();
   2814     REGEX_ASSERT(*pat1c == *pat1);
   2815     REGEX_ASSERT(*pat1c != *pat2);
   2816 
   2817     delete pat1c;
   2818     delete pat1a;
   2819     delete pat1;
   2820     delete pat2;
   2821 
   2822     utext_close(&re1);
   2823     utext_close(&re2);
   2824 
   2825 
   2826     //
   2827     //   Verify that a matcher created from a cloned pattern works.
   2828     //     (Jitterbug 3423)
   2829     //
   2830     {
   2831         UErrorCode     status     = U_ZERO_ERROR;
   2832         UText          pattern    = UTEXT_INITIALIZER;
   2833         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2834         utext_openUTF8(&pattern, str_pL, -1, &status);
   2835 
   2836         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2837         RegexPattern  *pClone     = pSource->clone();
   2838         delete         pSource;
   2839         RegexMatcher  *mFromClone = pClone->matcher(status);
   2840         REGEX_CHECK_STATUS;
   2841 
   2842         UText          input      = UTEXT_INITIALIZER;
   2843         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2844         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2845         mFromClone->reset(&input);
   2846         REGEX_ASSERT(mFromClone->find() == TRUE);
   2847         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2848         REGEX_ASSERT(mFromClone->find() == TRUE);
   2849         REGEX_ASSERT(mFromClone->group(status) == "World");
   2850         REGEX_ASSERT(mFromClone->find() == FALSE);
   2851         delete mFromClone;
   2852         delete pClone;
   2853 
   2854         utext_close(&input);
   2855         utext_close(&pattern);
   2856     }
   2857 
   2858     //
   2859     //   matches convenience API
   2860     //
   2861     {
   2862         UErrorCode status  = U_ZERO_ERROR;
   2863         UText      pattern = UTEXT_INITIALIZER;
   2864         UText      input   = UTEXT_INITIALIZER;
   2865 
   2866         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2867         utext_openUTF8(&input, str_randominput, -1, &status);
   2868 
   2869         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2870         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2871         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2872         REGEX_CHECK_STATUS;
   2873 
   2874         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2875         utext_openUTF8(&pattern, str_abc, -1, &status);
   2876         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2877         REGEX_CHECK_STATUS;
   2878 
   2879         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2880         utext_openUTF8(&pattern, str_nput, -1, &status);
   2881         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2882         REGEX_CHECK_STATUS;
   2883 
   2884         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2885         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2886         REGEX_CHECK_STATUS;
   2887 
   2888         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2889         utext_openUTF8(&pattern, str_u, -1, &status);
   2890         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2891         REGEX_CHECK_STATUS;
   2892 
   2893         utext_openUTF8(&input, str_abc, -1, &status);
   2894         utext_openUTF8(&pattern, str_abc, -1, &status);
   2895         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2896         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2897         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2898 
   2899         utext_close(&input);
   2900         utext_close(&pattern);
   2901     }
   2902 
   2903 
   2904     //
   2905     // Split()
   2906     //
   2907     status = U_ZERO_ERROR;
   2908     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2909     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2910     pat1 = RegexPattern::compile(&re1, pe, status);
   2911     REGEX_CHECK_STATUS;
   2912     UnicodeString  fields[10];
   2913 
   2914     int32_t n;
   2915     n = pat1->split("Now is the time", fields, 10, status);
   2916     REGEX_CHECK_STATUS;
   2917     REGEX_ASSERT(n==4);
   2918     REGEX_ASSERT(fields[0]=="Now");
   2919     REGEX_ASSERT(fields[1]=="is");
   2920     REGEX_ASSERT(fields[2]=="the");
   2921     REGEX_ASSERT(fields[3]=="time");
   2922     REGEX_ASSERT(fields[4]=="");
   2923 
   2924     n = pat1->split("Now is the time", fields, 2, status);
   2925     REGEX_CHECK_STATUS;
   2926     REGEX_ASSERT(n==2);
   2927     REGEX_ASSERT(fields[0]=="Now");
   2928     REGEX_ASSERT(fields[1]=="is the time");
   2929     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2930 
   2931     fields[1] = "*";
   2932     status = U_ZERO_ERROR;
   2933     n = pat1->split("Now is the time", fields, 1, status);
   2934     REGEX_CHECK_STATUS;
   2935     REGEX_ASSERT(n==1);
   2936     REGEX_ASSERT(fields[0]=="Now is the time");
   2937     REGEX_ASSERT(fields[1]=="*");
   2938     status = U_ZERO_ERROR;
   2939 
   2940     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2941     REGEX_CHECK_STATUS;
   2942     REGEX_ASSERT(n==6);
   2943     REGEX_ASSERT(fields[0]=="");
   2944     REGEX_ASSERT(fields[1]=="Now");
   2945     REGEX_ASSERT(fields[2]=="is");
   2946     REGEX_ASSERT(fields[3]=="the");
   2947     REGEX_ASSERT(fields[4]=="time");
   2948     REGEX_ASSERT(fields[5]=="");
   2949     REGEX_ASSERT(fields[6]=="");
   2950 
   2951     fields[2] = "*";
   2952     n = pat1->split("     ", fields, 10, status);
   2953     REGEX_CHECK_STATUS;
   2954     REGEX_ASSERT(n==2);
   2955     REGEX_ASSERT(fields[0]=="");
   2956     REGEX_ASSERT(fields[1]=="");
   2957     REGEX_ASSERT(fields[2]=="*");
   2958 
   2959     fields[0] = "foo";
   2960     n = pat1->split("", fields, 10, status);
   2961     REGEX_CHECK_STATUS;
   2962     REGEX_ASSERT(n==0);
   2963     REGEX_ASSERT(fields[0]=="foo");
   2964 
   2965     delete pat1;
   2966 
   2967     //  split, with a pattern with (capture)
   2968     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   2969     pat1 = RegexPattern::compile(&re1,  pe, status);
   2970     REGEX_CHECK_STATUS;
   2971 
   2972     status = U_ZERO_ERROR;
   2973     fields[6] = fields[7] = "*";
   2974     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2975     REGEX_CHECK_STATUS;
   2976     REGEX_ASSERT(n==7);
   2977     REGEX_ASSERT(fields[0]=="");
   2978     REGEX_ASSERT(fields[1]=="a");
   2979     REGEX_ASSERT(fields[2]=="Now is ");
   2980     REGEX_ASSERT(fields[3]=="b");
   2981     REGEX_ASSERT(fields[4]=="the time");
   2982     REGEX_ASSERT(fields[5]=="c");
   2983     REGEX_ASSERT(fields[6]=="");
   2984     REGEX_ASSERT(fields[7]=="*");
   2985     REGEX_ASSERT(status==U_ZERO_ERROR);
   2986 
   2987     fields[6] = fields[7] = "*";
   2988     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2989     REGEX_CHECK_STATUS;
   2990     REGEX_ASSERT(n==7);
   2991     REGEX_ASSERT(fields[0]=="  ");
   2992     REGEX_ASSERT(fields[1]=="a");
   2993     REGEX_ASSERT(fields[2]=="Now is ");
   2994     REGEX_ASSERT(fields[3]=="b");
   2995     REGEX_ASSERT(fields[4]=="the time");
   2996     REGEX_ASSERT(fields[5]=="c");
   2997     REGEX_ASSERT(fields[6]=="");
   2998     REGEX_ASSERT(fields[7]=="*");
   2999 
   3000     status = U_ZERO_ERROR;
   3001     fields[6] = "foo";
   3002     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   3003     REGEX_CHECK_STATUS;
   3004     REGEX_ASSERT(n==6);
   3005     REGEX_ASSERT(fields[0]=="  ");
   3006     REGEX_ASSERT(fields[1]=="a");
   3007     REGEX_ASSERT(fields[2]=="Now is ");
   3008     REGEX_ASSERT(fields[3]=="b");
   3009     REGEX_ASSERT(fields[4]=="the time");
   3010     REGEX_ASSERT(fields[5]==" ");
   3011     REGEX_ASSERT(fields[6]=="foo");
   3012 
   3013     status = U_ZERO_ERROR;
   3014     fields[5] = "foo";
   3015     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3016     REGEX_CHECK_STATUS;
   3017     REGEX_ASSERT(n==5);
   3018     REGEX_ASSERT(fields[0]=="  ");
   3019     REGEX_ASSERT(fields[1]=="a");
   3020     REGEX_ASSERT(fields[2]=="Now is ");
   3021     REGEX_ASSERT(fields[3]=="b");
   3022     REGEX_ASSERT(fields[4]=="the time<c>");
   3023     REGEX_ASSERT(fields[5]=="foo");
   3024 
   3025     status = U_ZERO_ERROR;
   3026     fields[5] = "foo";
   3027     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3028     REGEX_CHECK_STATUS;
   3029     REGEX_ASSERT(n==5);
   3030     REGEX_ASSERT(fields[0]=="  ");
   3031     REGEX_ASSERT(fields[1]=="a");
   3032     REGEX_ASSERT(fields[2]=="Now is ");
   3033     REGEX_ASSERT(fields[3]=="b");
   3034     REGEX_ASSERT(fields[4]=="the time");
   3035     REGEX_ASSERT(fields[5]=="foo");
   3036 
   3037     status = U_ZERO_ERROR;
   3038     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3039     REGEX_CHECK_STATUS;
   3040     REGEX_ASSERT(n==4);
   3041     REGEX_ASSERT(fields[0]=="  ");
   3042     REGEX_ASSERT(fields[1]=="a");
   3043     REGEX_ASSERT(fields[2]=="Now is ");
   3044     REGEX_ASSERT(fields[3]=="the time<c>");
   3045     status = U_ZERO_ERROR;
   3046     delete pat1;
   3047 
   3048     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3049     pat1 = RegexPattern::compile(&re1, pe, status);
   3050     REGEX_CHECK_STATUS;
   3051     n = pat1->split("1-10,20", fields, 10, status);
   3052     REGEX_CHECK_STATUS;
   3053     REGEX_ASSERT(n==5);
   3054     REGEX_ASSERT(fields[0]=="1");
   3055     REGEX_ASSERT(fields[1]=="-");
   3056     REGEX_ASSERT(fields[2]=="10");
   3057     REGEX_ASSERT(fields[3]==",");
   3058     REGEX_ASSERT(fields[4]=="20");
   3059     delete pat1;
   3060 
   3061 
   3062     //
   3063     // RegexPattern::pattern() and patternText()
   3064     //
   3065     pat1 = new RegexPattern();
   3066     REGEX_ASSERT(pat1->pattern() == "");
   3067     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3068     delete pat1;
   3069     const char *helloWorldInvariant = "(Hello, world)*";
   3070     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3071     pat1 = RegexPattern::compile(&re1, pe, status);
   3072     REGEX_CHECK_STATUS;
   3073     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
   3074     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3075     delete pat1;
   3076 
   3077     utext_close(&re1);
   3078 }
   3079 
   3080 
   3081 //---------------------------------------------------------------------------
   3082 //
   3083 //      Extended       A more thorough check for features of regex patterns
   3084 //                     The test cases are in a separate data file,
   3085 //                       source/test/testdata/regextst.txt
   3086 //                     A description of the test data format is included in that file.
   3087 //
   3088 //---------------------------------------------------------------------------
   3089 
   3090 const char *
   3091 RegexTest::getPath(char buffer[2048], const char *filename) {
   3092     UErrorCode status=U_ZERO_ERROR;
   3093     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3094     if (U_FAILURE(status)) {
   3095         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3096         return NULL;
   3097     }
   3098 
   3099     strcpy(buffer, testDataDirectory);
   3100     strcat(buffer, filename);
   3101     return buffer;
   3102 }
   3103 
   3104 void RegexTest::Extended() {
   3105     char tdd[2048];
   3106     const char *srcPath;
   3107     UErrorCode  status  = U_ZERO_ERROR;
   3108     int32_t     lineNum = 0;
   3109 
   3110     //
   3111     //  Open and read the test data file.
   3112     //
   3113     srcPath=getPath(tdd, "regextst.txt");
   3114     if(srcPath==NULL) {
   3115         return; /* something went wrong, error already output */
   3116     }
   3117 
   3118     int32_t    len;
   3119     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3120     if (U_FAILURE(status)) {
   3121         return; /* something went wrong, error already output */
   3122     }
   3123 
   3124     //
   3125     //  Put the test data into a UnicodeString
   3126     //
   3127     UnicodeString testString(FALSE, testData, len);
   3128 
   3129     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3130     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3131     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3132 
   3133     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3134     UnicodeString   testPattern;   // The pattern for test from the test file.
   3135     UnicodeString   testFlags;     // the flags   for a test.
   3136     UnicodeString   matchString;   // The marked up string to be used as input
   3137 
   3138     if (U_FAILURE(status)){
   3139         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3140         delete [] testData;
   3141         return;
   3142     }
   3143 
   3144     //
   3145     //  Loop over the test data file, once per line.
   3146     //
   3147     while (lineMat.find()) {
   3148         lineNum++;
   3149         if (U_FAILURE(status)) {
   3150           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3151         }
   3152 
   3153         status = U_ZERO_ERROR;
   3154         UnicodeString testLine = lineMat.group(1, status);
   3155         if (testLine.length() == 0) {
   3156             continue;
   3157         }
   3158 
   3159         //
   3160         // Parse the test line.  Skip blank and comment only lines.
   3161         // Separate out the three main fields - pattern, flags, target.
   3162         //
   3163 
   3164         commentMat.reset(testLine);
   3165         if (commentMat.lookingAt(status)) {
   3166             // This line is a comment, or blank.
   3167             continue;
   3168         }
   3169 
   3170         //
   3171         //  Pull out the pattern field, remove it from the test file line.
   3172         //
   3173         quotedStuffMat.reset(testLine);
   3174         if (quotedStuffMat.lookingAt(status)) {
   3175             testPattern = quotedStuffMat.group(2, status);
   3176             testLine.remove(0, quotedStuffMat.end(0, status));
   3177         } else {
   3178             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3179             continue;
   3180         }
   3181 
   3182 
   3183         //
   3184         //  Pull out the flags from the test file line.
   3185         //
   3186         flagsMat.reset(testLine);
   3187         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3188         testFlags = flagsMat.group(1, status);
   3189         if (flagsMat.group(2, status).length() > 0) {
   3190             errln("Bad Match flag at line %d. Scanning %c\n",
   3191                 lineNum, flagsMat.group(2, status).charAt(0));
   3192             continue;
   3193         }
   3194         testLine.remove(0, flagsMat.end(0, status));
   3195 
   3196         //
   3197         //  Pull out the match string, as a whole.
   3198         //    We'll process the <tags> later.
   3199         //
   3200         quotedStuffMat.reset(testLine);
   3201         if (quotedStuffMat.lookingAt(status)) {
   3202             matchString = quotedStuffMat.group(2, status);
   3203             testLine.remove(0, quotedStuffMat.end(0, status));
   3204         } else {
   3205             errln("Bad match string at test file line %d", lineNum);
   3206             continue;
   3207         }
   3208 
   3209         //
   3210         //  The only thing left from the input line should be an optional trailing comment.
   3211         //
   3212         commentMat.reset(testLine);
   3213         if (commentMat.lookingAt(status) == FALSE) {
   3214             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3215             continue;
   3216         }
   3217 
   3218         //
   3219         //  Run the test
   3220         //
   3221         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3222     }
   3223 
   3224     delete [] testData;
   3225 
   3226 }
   3227 
   3228 
   3229 
   3230 //---------------------------------------------------------------------------
   3231 //
   3232 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
   3233 //
   3234 //         Function to run a single test from the Extended (data driven) tests.
   3235 //         See file test/testdata/regextst.txt for a description of the
   3236 //         pattern and inputString fields, and the allowed flags.
   3237 //         lineNumber is the source line in regextst.txt of the test.
   3238 //
   3239 //---------------------------------------------------------------------------
   3240 
   3241 
   3242 //  Set a value into a UVector at position specified by a decimal number in
   3243 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3244 //   which follows.
   3245 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3246     UErrorCode  status=U_ZERO_ERROR;
   3247     int32_t  idx = 0;
   3248     for (int32_t i=0; i<index.length(); i++) {
   3249         int32_t d=u_charDigitValue(index.charAt(i));
   3250         if (d<0) {return;}
   3251         idx = idx*10 + d;
   3252     }
   3253     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3254     vec.setElementAt(val, idx);
   3255 }
   3256 
   3257 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3258     UErrorCode  status=U_ZERO_ERROR;
   3259     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3260     vec.setElementAt(val, idx);
   3261 }
   3262 
   3263 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3264 {
   3265     UBool couldFind = TRUE;
   3266     UTEXT_SETNATIVEINDEX(utext, 0);
   3267     int32_t i = 0;
   3268     while (i < unistrOffset) {
   3269         UChar32 c = UTEXT_NEXT32(utext);
   3270         if (c != U_SENTINEL) {
   3271             i += U16_LENGTH(c);
   3272         } else {
   3273             couldFind = FALSE;
   3274             break;
   3275         }
   3276     }
   3277     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3278     return couldFind;
   3279 }
   3280 
   3281 
   3282 void RegexTest::regex_find(const UnicodeString &pattern,
   3283                            const UnicodeString &flags,
   3284                            const UnicodeString &inputString,
   3285                            const char *srcPath,
   3286                            int32_t line) {
   3287     UnicodeString       unEscapedInput;
   3288     UnicodeString       deTaggedInput;
   3289 
   3290     int32_t             patternUTF8Length,      inputUTF8Length;
   3291     char                *patternChars  = NULL, *inputChars = NULL;
   3292     UText               patternText    = UTEXT_INITIALIZER;
   3293     UText               inputText      = UTEXT_INITIALIZER;
   3294     UConverter          *UTF8Converter = NULL;
   3295 
   3296     UErrorCode          status         = U_ZERO_ERROR;
   3297     UParseError         pe;
   3298     RegexPattern        *parsePat      = NULL;
   3299     RegexMatcher        *parseMatcher  = NULL;
   3300     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3301     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3302     UVector             groupStarts(status);
   3303     UVector             groupEnds(status);
   3304     UVector             groupStartsUTF8(status);
   3305     UVector             groupEndsUTF8(status);
   3306     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3307     UBool               failed         = FALSE;
   3308     int32_t             numFinds;
   3309     int32_t             i;
   3310     UBool               useMatchesFunc   = FALSE;
   3311     UBool               useLookingAtFunc = FALSE;
   3312     int32_t             regionStart      = -1;
   3313     int32_t             regionEnd        = -1;
   3314     int32_t             regionStartUTF8  = -1;
   3315     int32_t             regionEndUTF8    = -1;
   3316 
   3317 
   3318     //
   3319     //  Compile the caller's pattern
   3320     //
   3321     uint32_t bflags = 0;
   3322     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3323         bflags |= UREGEX_CASE_INSENSITIVE;
   3324     }
   3325     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3326         bflags |= UREGEX_COMMENTS;
   3327     }
   3328     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3329         bflags |= UREGEX_DOTALL;
   3330     }
   3331     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3332         bflags |= UREGEX_MULTILINE;
   3333     }
   3334 
   3335     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3336         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3337     }
   3338     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3339         bflags |= UREGEX_UNIX_LINES;
   3340     }
   3341     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3342         bflags |= UREGEX_LITERAL;
   3343     }
   3344 
   3345 
   3346     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3347     if (status != U_ZERO_ERROR) {
   3348         #if UCONFIG_NO_BREAK_ITERATION==1
   3349         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3350         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3351         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3352             goto cleanupAndReturn;
   3353         }
   3354         #endif
   3355         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3356             // Expected pattern compilation error.
   3357             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3358                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3359             }
   3360             goto cleanupAndReturn;
   3361         } else {
   3362             // Unexpected pattern compilation error.
   3363             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3364             goto cleanupAndReturn;
   3365         }
   3366     }
   3367 
   3368     UTF8Converter = ucnv_open("UTF8", &status);
   3369     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3370 
   3371     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3372     status = U_ZERO_ERROR; // buffer overflow
   3373     patternChars = new char[patternUTF8Length+1];
   3374     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3375     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3376 
   3377     if (status == U_ZERO_ERROR) {
   3378         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3379 
   3380         if (status != U_ZERO_ERROR) {
   3381 #if UCONFIG_NO_BREAK_ITERATION==1
   3382             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3383             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3384             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3385                 goto cleanupAndReturn;
   3386             }
   3387 #endif
   3388             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3389                 // Expected pattern compilation error.
   3390                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3391                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3392                 }
   3393                 goto cleanupAndReturn;
   3394             } else {
   3395                 // Unexpected pattern compilation error.
   3396                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3397                 goto cleanupAndReturn;
   3398             }
   3399         }
   3400     }
   3401 
   3402     if (UTF8Pattern == NULL) {
   3403         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3404         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3405         status = U_ZERO_ERROR;
   3406     }
   3407 
   3408     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3409         callerPattern->dumpPattern();
   3410     }
   3411 
   3412     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3413         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3414         goto cleanupAndReturn;
   3415     }
   3416 
   3417 
   3418     //
   3419     // Number of times find() should be called on the test string, default to 1
   3420     //
   3421     numFinds = 1;
   3422     for (i=2; i<=9; i++) {
   3423         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3424             if (numFinds != 1) {
   3425                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3426                 goto cleanupAndReturn;
   3427             }
   3428             numFinds = i;
   3429         }
   3430     }
   3431 
   3432     // 'M' flag.  Use matches() instead of find()
   3433     if (flags.indexOf((UChar)0x4d) >= 0) {
   3434         useMatchesFunc = TRUE;
   3435     }
   3436     if (flags.indexOf((UChar)0x4c) >= 0) {
   3437         useLookingAtFunc = TRUE;
   3438     }
   3439 
   3440     //
   3441     //  Find the tags in the input data, remove them, and record the group boundary
   3442     //    positions.
   3443     //
   3444     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3445     REGEX_CHECK_STATUS_L(line);
   3446 
   3447     unEscapedInput = inputString.unescape();
   3448     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3449     REGEX_CHECK_STATUS_L(line);
   3450     while(parseMatcher->find()) {
   3451         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3452         REGEX_CHECK_STATUS;
   3453         UnicodeString groupNum = parseMatcher->group(2, status);
   3454         if (groupNum == "r") {
   3455             // <r> or </r>, a region specification within the string
   3456             if (parseMatcher->group(1, status) == "/") {
   3457                 regionEnd = deTaggedInput.length();
   3458             } else {
   3459                 regionStart = deTaggedInput.length();
   3460             }
   3461         } else {
   3462             // <digits> or </digits>, a group match boundary tag.
   3463             if (parseMatcher->group(1, status) == "/") {
   3464                 set(groupEnds, deTaggedInput.length(), groupNum);
   3465             } else {
   3466                 set(groupStarts, deTaggedInput.length(), groupNum);
   3467             }
   3468         }
   3469     }
   3470     parseMatcher->appendTail(deTaggedInput);
   3471     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3472     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3473       errln("mismatched <r> tags");
   3474       failed = TRUE;
   3475       goto cleanupAndReturn;
   3476     }
   3477 
   3478     //
   3479     //  Configure the matcher according to the flags specified with this test.
   3480     //
   3481     matcher = callerPattern->matcher(deTaggedInput, status);
   3482     REGEX_CHECK_STATUS_L(line);
   3483     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3484         matcher->setTrace(TRUE);
   3485     }
   3486 
   3487     if (UTF8Pattern != NULL) {
   3488         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3489         status = U_ZERO_ERROR; // buffer overflow
   3490         inputChars = new char[inputUTF8Length+1];
   3491         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3492         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3493 
   3494         if (status == U_ZERO_ERROR) {
   3495             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3496             REGEX_CHECK_STATUS_L(line);
   3497         }
   3498 
   3499         if (UTF8Matcher == NULL) {
   3500             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3501           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3502             status = U_ZERO_ERROR;
   3503         }
   3504     }
   3505 
   3506     //
   3507     //  Generate native indices for UTF8 versions of region and capture group info
   3508     //
   3509     if (UTF8Matcher != NULL) {
   3510         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3511         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3512 
   3513         //  Fill out the native index UVector info.
   3514         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3515         for (i=0; i<groupStarts.size(); i++) {
   3516             int32_t  start = groupStarts.elementAti(i);
   3517             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3518             if (start >= 0) {
   3519                 int32_t  startUTF8;
   3520                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3521                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3522                     failed = TRUE;
   3523                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3524                 }
   3525                 setInt(groupStartsUTF8, startUTF8, i);
   3526             }
   3527 
   3528             int32_t  end = groupEnds.elementAti(i);
   3529             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3530             if (end >= 0) {
   3531                 int32_t  endUTF8;
   3532                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3533                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3534                     failed = TRUE;
   3535                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3536                 }
   3537                 setInt(groupEndsUTF8, endUTF8, i);
   3538             }
   3539         }
   3540     }
   3541 
   3542     if (regionStart>=0) {
   3543        matcher->region(regionStart, regionEnd, status);
   3544        REGEX_CHECK_STATUS_L(line);
   3545        if (UTF8Matcher != NULL) {
   3546            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3547            REGEX_CHECK_STATUS_L(line);
   3548        }
   3549     }
   3550     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3551         matcher->useAnchoringBounds(FALSE);
   3552         if (UTF8Matcher != NULL) {
   3553             UTF8Matcher->useAnchoringBounds(FALSE);
   3554         }
   3555     }
   3556     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3557         matcher->useTransparentBounds(TRUE);
   3558         if (UTF8Matcher != NULL) {
   3559             UTF8Matcher->useTransparentBounds(TRUE);
   3560         }
   3561     }
   3562 
   3563 
   3564 
   3565     //
   3566     // Do a find on the de-tagged input using the caller's pattern
   3567     //     TODO: error on count>1 and not find().
   3568     //           error on both matches() and lookingAt().
   3569     //
   3570     for (i=0; i<numFinds; i++) {
   3571         if (useMatchesFunc) {
   3572             isMatch = matcher->matches(status);
   3573             if (UTF8Matcher != NULL) {
   3574                isUTF8Match = UTF8Matcher->matches(status);
   3575             }
   3576         } else  if (useLookingAtFunc) {
   3577             isMatch = matcher->lookingAt(status);
   3578             if (UTF8Matcher != NULL) {
   3579                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3580             }
   3581         } else {
   3582             isMatch = matcher->find();
   3583             if (UTF8Matcher != NULL) {
   3584                 isUTF8Match = UTF8Matcher->find();
   3585             }
   3586         }
   3587     }
   3588     matcher->setTrace(FALSE);
   3589     if (U_FAILURE(status)) {
   3590         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3591     }
   3592 
   3593     //
   3594     // Match up the groups from the find() with the groups from the tags
   3595     //
   3596 
   3597     // number of tags should match number of groups from find operation.
   3598     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3599     //   G option in test means that capture group data is not available in the
   3600     //     expected results, so the check needs to be suppressed.
   3601     if (isMatch == FALSE && groupStarts.size() != 0) {
   3602         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3603         failed = TRUE;
   3604         goto cleanupAndReturn;
   3605     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3606         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3607         failed = TRUE;
   3608         goto cleanupAndReturn;
   3609     }
   3610 
   3611     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3612         // Only check for match / no match.  Don't check capture groups.
   3613         if (isMatch && groupStarts.size() == 0) {
   3614             errln("Error at line %d:  No match expected, but one found.", line);
   3615             failed = TRUE;
   3616         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3617             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3618             failed = TRUE;
   3619         }
   3620         goto cleanupAndReturn;
   3621     }
   3622 
   3623     REGEX_CHECK_STATUS_L(line);
   3624     for (i=0; i<=matcher->groupCount(); i++) {
   3625         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3626         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3627         if (matcher->start(i, status) != expectedStart) {
   3628             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3629                 line, i, expectedStart, matcher->start(i, status));
   3630             failed = TRUE;
   3631             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3632         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3633             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3634                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3635             failed = TRUE;
   3636             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3637         }
   3638 
   3639         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3640         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3641         if (matcher->end(i, status) != expectedEnd) {
   3642             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3643                 line, i, expectedEnd, matcher->end(i, status));
   3644             failed = TRUE;
   3645             // Error on end position;  keep going; real error is probably yet to come as group
   3646             //   end positions work from end of the input data towards the front.
   3647         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3648             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3649                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3650             failed = TRUE;
   3651             // Error on end position;  keep going; real error is probably yet to come as group
   3652             //   end positions work from end of the input data towards the front.
   3653         }
   3654     }
   3655     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3656         errln("Error at line %d: Expected %d capture groups, found %d.",
   3657             line, groupStarts.size()-1, matcher->groupCount());
   3658         failed = TRUE;
   3659         }
   3660     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3661         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3662               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3663         failed = TRUE;
   3664     }
   3665 
   3666     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3667         matcher->requireEnd() == TRUE) {
   3668         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3669         failed = TRUE;
   3670     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3671         UTF8Matcher->requireEnd() == TRUE) {
   3672         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3673         failed = TRUE;
   3674     }
   3675 
   3676     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3677         matcher->requireEnd() == FALSE) {
   3678         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3679         failed = TRUE;
   3680     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3681         UTF8Matcher->requireEnd() == FALSE) {
   3682         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3683         failed = TRUE;
   3684     }
   3685 
   3686     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3687         matcher->hitEnd() == TRUE) {
   3688         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3689         failed = TRUE;
   3690     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3691                UTF8Matcher->hitEnd() == TRUE) {
   3692         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3693         failed = TRUE;
   3694     }
   3695 
   3696     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3697         matcher->hitEnd() == FALSE) {
   3698         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3699         failed = TRUE;
   3700     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3701                UTF8Matcher->hitEnd() == FALSE) {
   3702         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3703         failed = TRUE;
   3704     }
   3705 
   3706 
   3707 cleanupAndReturn:
   3708     if (failed) {
   3709         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3710             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3711         // callerPattern->dump();
   3712     }
   3713     delete parseMatcher;
   3714     delete parsePat;
   3715     delete UTF8Matcher;
   3716     delete UTF8Pattern;
   3717     delete matcher;
   3718     delete callerPattern;
   3719 
   3720     utext_close(&inputText);
   3721     delete[] inputChars;
   3722     utext_close(&patternText);
   3723     delete[] patternChars;
   3724     ucnv_close(UTF8Converter);
   3725 }
   3726 
   3727 
   3728 
   3729 
   3730 //---------------------------------------------------------------------------
   3731 //
   3732 //      Errors     Check for error handling in patterns.
   3733 //
   3734 //---------------------------------------------------------------------------
   3735 void RegexTest::Errors() {
   3736     // \escape sequences that aren't implemented yet.
   3737     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3738 
   3739     // Missing close parentheses
   3740     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3741     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3742     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3743 
   3744     // Extra close paren
   3745     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3746     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3747     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3748 
   3749     // Look-ahead, Look-behind
   3750     //  TODO:  add tests for unbounded length look-behinds.
   3751     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3752 
   3753     // Attempt to use non-default flags
   3754     {
   3755         UParseError   pe;
   3756         UErrorCode    status = U_ZERO_ERROR;
   3757         int32_t       flags  = UREGEX_CANON_EQ |
   3758                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3759                                UREGEX_MULTILINE;
   3760         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3761         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3762         delete pat1;
   3763     }
   3764 
   3765 
   3766     // Quantifiers are allowed only after something that can be quantified.
   3767     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3768     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3769     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3770 
   3771     // Mal-formed {min,max} quantifiers
   3772     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3773     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3774     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3775     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3776     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3777     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3778     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3779     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3780     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3781 
   3782     // Ticket 5389
   3783     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3784 
   3785     // Invalid Back Reference \0
   3786     //    For ICU 3.8 and earlier
   3787     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3788     //
   3789     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3790 
   3791 }
   3792 
   3793 
   3794 //-------------------------------------------------------------------------------
   3795 //
   3796 //  Read a text data file, convert it to UChars, and return the data
   3797 //    in one big UChar * buffer, which the caller must delete.
   3798 //
   3799 //--------------------------------------------------------------------------------
   3800 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3801                                      const char *defEncoding, UErrorCode &status) {
   3802     UChar       *retPtr  = NULL;
   3803     char        *fileBuf = NULL;
   3804     UConverter* conv     = NULL;
   3805     FILE        *f       = NULL;
   3806 
   3807     ulen = 0;
   3808     if (U_FAILURE(status)) {
   3809         return retPtr;
   3810     }
   3811 
   3812     //
   3813     //  Open the file.
   3814     //
   3815     f = fopen(fileName, "rb");
   3816     if (f == 0) {
   3817         dataerrln("Error opening test data file %s\n", fileName);
   3818         status = U_FILE_ACCESS_ERROR;
   3819         return NULL;
   3820     }
   3821     //
   3822     //  Read it in
   3823     //
   3824     int32_t            fileSize;
   3825     int32_t            amt_read;
   3826 
   3827     fseek( f, 0, SEEK_END);
   3828     fileSize = ftell(f);
   3829     fileBuf = new char[fileSize];
   3830     fseek(f, 0, SEEK_SET);
   3831     amt_read = fread(fileBuf, 1, fileSize, f);
   3832     if (amt_read != fileSize || fileSize <= 0) {
   3833         errln("Error reading test data file.");
   3834         goto cleanUpAndReturn;
   3835     }
   3836 
   3837     //
   3838     // Look for a Unicode Signature (BOM) on the data just read
   3839     //
   3840     int32_t        signatureLength;
   3841     const char *   fileBufC;
   3842     const char*    encoding;
   3843 
   3844     fileBufC = fileBuf;
   3845     encoding = ucnv_detectUnicodeSignature(
   3846         fileBuf, fileSize, &signatureLength, &status);
   3847     if(encoding!=NULL ){
   3848         fileBufC  += signatureLength;
   3849         fileSize  -= signatureLength;
   3850     } else {
   3851         encoding = defEncoding;
   3852         if (strcmp(encoding, "utf-8") == 0) {
   3853             errln("file %s is missing its BOM", fileName);
   3854         }
   3855     }
   3856 
   3857     //
   3858     // Open a converter to take the rule file to UTF-16
   3859     //
   3860     conv = ucnv_open(encoding, &status);
   3861     if (U_FAILURE(status)) {
   3862         goto cleanUpAndReturn;
   3863     }
   3864 
   3865     //
   3866     // Convert the rules to UChar.
   3867     //  Preflight first to determine required buffer size.
   3868     //
   3869     ulen = ucnv_toUChars(conv,
   3870         NULL,           //  dest,
   3871         0,              //  destCapacity,
   3872         fileBufC,
   3873         fileSize,
   3874         &status);
   3875     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3876         // Buffer Overflow is expected from the preflight operation.
   3877         status = U_ZERO_ERROR;
   3878 
   3879         retPtr = new UChar[ulen+1];
   3880         ucnv_toUChars(conv,
   3881             retPtr,       //  dest,
   3882             ulen+1,
   3883             fileBufC,
   3884             fileSize,
   3885             &status);
   3886     }
   3887 
   3888 cleanUpAndReturn:
   3889     fclose(f);
   3890     delete[] fileBuf;
   3891     ucnv_close(conv);
   3892     if (U_FAILURE(status)) {
   3893         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3894         delete []retPtr;
   3895         retPtr = 0;
   3896         ulen   = 0;
   3897     };
   3898     return retPtr;
   3899 }
   3900 
   3901 
   3902 //-------------------------------------------------------------------------------
   3903 //
   3904 //   PerlTests  - Run Perl's regular expression tests
   3905 //                The input file for this test is re_tests, the standard regular
   3906 //                expression test data distributed with the Perl source code.
   3907 //
   3908 //                Here is Perl's description of the test data file:
   3909 //
   3910 //        # The tests are in a separate file 't/op/re_tests'.
   3911 //        # Each line in that file is a separate test.
   3912 //        # There are five columns, separated by tabs.
   3913 //        #
   3914 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3915 //        # Modifiers can be put after the closing C<'>.
   3916 //        #
   3917 //        # Column 2 contains the string to be matched.
   3918 //        #
   3919 //        # Column 3 contains the expected result:
   3920 //        #     y   expect a match
   3921 //        #     n   expect no match
   3922 //        #     c   expect an error
   3923 //        # B   test exposes a known bug in Perl, should be skipped
   3924 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3925 //        #
   3926 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3927 //        #
   3928 //        # Column 4 contains a string, usually C<$&>.
   3929 //        #
   3930 //        # Column 5 contains the expected result of double-quote
   3931 //        # interpolating that string after the match, or start of error message.
   3932 //        #
   3933 //        # Column 6, if present, contains a reason why the test is skipped.
   3934 //        # This is printed with "skipped", for harness to pick up.
   3935 //        #
   3936 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3937 //        #
   3938 //        # If you want to add a regular expression test that can't be expressed
   3939 //        # in this format, don't add it here: put it in op/pat.t instead.
   3940 //
   3941 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   3942 //        Such a test exposes some known incompatibility between ICU and Perl regexps.
   3943 //        (The i is in addition to whatever was there before.)
   3944 //
   3945 //-------------------------------------------------------------------------------
   3946 void RegexTest::PerlTests() {
   3947     char tdd[2048];
   3948     const char *srcPath;
   3949     UErrorCode  status = U_ZERO_ERROR;
   3950     UParseError pe;
   3951 
   3952     //
   3953     //  Open and read the test data file.
   3954     //
   3955     srcPath=getPath(tdd, "re_tests.txt");
   3956     if(srcPath==NULL) {
   3957         return; /* something went wrong, error already output */
   3958     }
   3959 
   3960     int32_t    len;
   3961     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3962     if (U_FAILURE(status)) {
   3963         return; /* something went wrong, error already output */
   3964     }
   3965 
   3966     //
   3967     //  Put the test data into a UnicodeString
   3968     //
   3969     UnicodeString testDataString(FALSE, testData, len);
   3970 
   3971     //
   3972     //  Regex to break the input file into lines, and strip the new lines.
   3973     //     One line per match, capture group one is the desired data.
   3974     //
   3975     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3976     if (U_FAILURE(status)) {
   3977         dataerrln("RegexPattern::compile() error");
   3978         return;
   3979     }
   3980     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3981 
   3982     //
   3983     //  Regex to split a test file line into fields.
   3984     //    There are six fields, separated by tabs.
   3985     //
   3986     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3987 
   3988     //
   3989     //  Regex to identify test patterns with flag settings, and to separate them.
   3990     //    Test patterns with flags look like 'pattern'i
   3991     //    Test patterns without flags are not quoted:   pattern
   3992     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3993     //
   3994     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3995     RegexMatcher* flagMat = flagPat->matcher(status);
   3996 
   3997     //
   3998     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3999     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4000     //   are string constants and REs for these constructs.
   4001     //
   4002     UnicodeString nulnulSrc("${nulnul}");
   4003     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4004     nulnul = nulnul.unescape();
   4005 
   4006     UnicodeString ffffSrc("${ffff}");
   4007     UnicodeString ffff("\\uffff", -1, US_INV);
   4008     ffff = ffff.unescape();
   4009 
   4010     //  regexp for $-[0], $+[2], etc.
   4011     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4012     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4013 
   4014     //  regexp for $0, $1, $2, etc.
   4015     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4016     RegexMatcher *cgMat = cgPat->matcher(status);
   4017 
   4018 
   4019     //
   4020     // Main Loop for the Perl Tests, runs once per line from the
   4021     //   test data file.
   4022     //
   4023     int32_t  lineNum = 0;
   4024     int32_t  skippedUnimplementedCount = 0;
   4025     while (lineMat->find()) {
   4026         lineNum++;
   4027 
   4028         //
   4029         //  Get a line, break it into its fields, do the Perl
   4030         //    variable substitutions.
   4031         //
   4032         UnicodeString line = lineMat->group(1, status);
   4033         UnicodeString fields[7];
   4034         fieldPat->split(line, fields, 7, status);
   4035 
   4036         flagMat->reset(fields[0]);
   4037         flagMat->matches(status);
   4038         UnicodeString pattern  = flagMat->group(2, status);
   4039         pattern.findAndReplace("${bang}", "!");
   4040         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4041         pattern.findAndReplace(ffffSrc, ffff);
   4042 
   4043         //
   4044         //  Identify patterns that include match flag settings,
   4045         //    split off the flags, remove the extra quotes.
   4046         //
   4047         UnicodeString flagStr = flagMat->group(3, status);
   4048         if (U_FAILURE(status)) {
   4049             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4050             return;
   4051         }
   4052         int32_t flags = 0;
   4053         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4054         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4055         const UChar UChar_m = 0x6d;
   4056         const UChar UChar_x = 0x78;
   4057         const UChar UChar_y = 0x79;
   4058         if (flagStr.indexOf(UChar_i) != -1) {
   4059             flags |= UREGEX_CASE_INSENSITIVE;
   4060         }
   4061         if (flagStr.indexOf(UChar_m) != -1) {
   4062             flags |= UREGEX_MULTILINE;
   4063         }
   4064         if (flagStr.indexOf(UChar_x) != -1) {
   4065             flags |= UREGEX_COMMENTS;
   4066         }
   4067 
   4068         //
   4069         // Compile the test pattern.
   4070         //
   4071         status = U_ZERO_ERROR;
   4072         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4073         if (status == U_REGEX_UNIMPLEMENTED) {
   4074             //
   4075             // Test of a feature that is planned for ICU, but not yet implemented.
   4076             //   skip the test.
   4077             skippedUnimplementedCount++;
   4078             delete testPat;
   4079             status = U_ZERO_ERROR;
   4080             continue;
   4081         }
   4082 
   4083         if (U_FAILURE(status)) {
   4084             // Some tests are supposed to generate errors.
   4085             //   Only report an error for tests that are supposed to succeed.
   4086             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4087                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4088             {
   4089                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4090             }
   4091             status = U_ZERO_ERROR;
   4092             delete testPat;
   4093             continue;
   4094         }
   4095 
   4096         if (fields[2].indexOf(UChar_i) >= 0) {
   4097             // ICU should skip this test.
   4098             delete testPat;
   4099             continue;
   4100         }
   4101 
   4102         if (fields[2].indexOf(UChar_c) >= 0) {
   4103             // This pattern should have caused a compilation error, but didn't/
   4104             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4105             delete testPat;
   4106             continue;
   4107         }
   4108 
   4109         //
   4110         // replace the Perl variables that appear in some of the
   4111         //   match data strings.
   4112         //
   4113         UnicodeString matchString = fields[1];
   4114         matchString.findAndReplace(nulnulSrc, nulnul);
   4115         matchString.findAndReplace(ffffSrc,   ffff);
   4116 
   4117         // Replace any \n in the match string with an actual new-line char.
   4118         //  Don't do full unescape, as this unescapes more than Perl does, which
   4119         //  causes other spurious failures in the tests.
   4120         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4121 
   4122 
   4123 
   4124         //
   4125         // Run the test, check for expected match/don't match result.
   4126         //
   4127         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4128         UBool found = testMat->find();
   4129         UBool expected = FALSE;
   4130         if (fields[2].indexOf(UChar_y) >=0) {
   4131             expected = TRUE;
   4132         }
   4133         if (expected != found) {
   4134             errln("line %d: Expected %smatch, got %smatch",
   4135                 lineNum, expected?"":"no ", found?"":"no " );
   4136             continue;
   4137         }
   4138 
   4139         // Don't try to check expected results if there is no match.
   4140         //   (Some have stuff in the expected fields)
   4141         if (!found) {
   4142             delete testMat;
   4143             delete testPat;
   4144             continue;
   4145         }
   4146 
   4147         //
   4148         // Interpret the Perl expression from the fourth field of the data file,
   4149         // building up an ICU string from the results of the ICU match.
   4150         //   The Perl expression will contain references to the results of
   4151         //     a regex match, including the matched string, capture group strings,
   4152         //     group starting and ending indicies, etc.
   4153         //
   4154         UnicodeString resultString;
   4155         UnicodeString perlExpr = fields[3];
   4156 #if SUPPORT_MUTATING_INPUT_STRING
   4157         groupsMat->reset(perlExpr);
   4158         cgMat->reset(perlExpr);
   4159 #endif
   4160 
   4161         while (perlExpr.length() > 0) {
   4162 #if !SUPPORT_MUTATING_INPUT_STRING
   4163             //  Perferred usage.  Reset after any modification to input string.
   4164             groupsMat->reset(perlExpr);
   4165             cgMat->reset(perlExpr);
   4166 #endif
   4167 
   4168             if (perlExpr.startsWith("$&")) {
   4169                 resultString.append(testMat->group(status));
   4170                 perlExpr.remove(0, 2);
   4171             }
   4172 
   4173             else if (groupsMat->lookingAt(status)) {
   4174                 // $-[0]   $+[2]  etc.
   4175                 UnicodeString digitString = groupsMat->group(2, status);
   4176                 int32_t t = 0;
   4177                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4178                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4179                 int32_t matchPosition;
   4180                 if (plusOrMinus.compare("+") == 0) {
   4181                     matchPosition = testMat->end(groupNum, status);
   4182                 } else {
   4183                     matchPosition = testMat->start(groupNum, status);
   4184                 }
   4185                 if (matchPosition != -1) {
   4186                     ICU_Utility::appendNumber(resultString, matchPosition);
   4187                 }
   4188                 perlExpr.remove(0, groupsMat->end(status));
   4189             }
   4190 
   4191             else if (cgMat->lookingAt(status)) {
   4192                 // $1, $2, $3, etc.
   4193                 UnicodeString digitString = cgMat->group(1, status);
   4194                 int32_t t = 0;
   4195                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4196                 if (U_SUCCESS(status)) {
   4197                     resultString.append(testMat->group(groupNum, status));
   4198                     status = U_ZERO_ERROR;
   4199                 }
   4200                 perlExpr.remove(0, cgMat->end(status));
   4201             }
   4202 
   4203             else if (perlExpr.startsWith("@-")) {
   4204                 int32_t i;
   4205                 for (i=0; i<=testMat->groupCount(); i++) {
   4206                     if (i>0) {
   4207                         resultString.append(" ");
   4208                     }
   4209                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4210                 }
   4211                 perlExpr.remove(0, 2);
   4212             }
   4213 
   4214             else if (perlExpr.startsWith("@+")) {
   4215                 int32_t i;
   4216                 for (i=0; i<=testMat->groupCount(); i++) {
   4217                     if (i>0) {
   4218                         resultString.append(" ");
   4219                     }
   4220                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4221                 }
   4222                 perlExpr.remove(0, 2);
   4223             }
   4224 
   4225             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4226                                                      //           or as an escaped sequence (e.g. \n)
   4227                 if (perlExpr.length() > 1) {
   4228                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4229                 }
   4230                 UChar c = perlExpr.charAt(0);
   4231                 switch (c) {
   4232                 case 'n':   c = '\n'; break;
   4233                 // add any other escape sequences that show up in the test expected results.
   4234                 }
   4235                 resultString.append(c);
   4236                 perlExpr.remove(0, 1);
   4237             }
   4238 
   4239             else  {
   4240                 // Any characters from the perl expression that we don't explicitly
   4241                 //  recognize before here are assumed to be literals and copied
   4242                 //  as-is to the expected results.
   4243                 resultString.append(perlExpr.charAt(0));
   4244                 perlExpr.remove(0, 1);
   4245             }
   4246 
   4247             if (U_FAILURE(status)) {
   4248                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4249                 break;
   4250             }
   4251         }
   4252 
   4253         //
   4254         // Expected Results Compare
   4255         //
   4256         UnicodeString expectedS(fields[4]);
   4257         expectedS.findAndReplace(nulnulSrc, nulnul);
   4258         expectedS.findAndReplace(ffffSrc,   ffff);
   4259         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4260 
   4261 
   4262         if (expectedS.compare(resultString) != 0) {
   4263             err("Line %d: Incorrect perl expression results.", lineNum);
   4264             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4265         }
   4266 
   4267         delete testMat;
   4268         delete testPat;
   4269     }
   4270 
   4271     //
   4272     // All done.  Clean up allocated stuff.
   4273     //
   4274     delete cgMat;
   4275     delete cgPat;
   4276 
   4277     delete groupsMat;
   4278     delete groupsPat;
   4279 
   4280     delete flagMat;
   4281     delete flagPat;
   4282 
   4283     delete lineMat;
   4284     delete linePat;
   4285 
   4286     delete fieldPat;
   4287     delete [] testData;
   4288 
   4289 
   4290     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4291 
   4292 }
   4293 
   4294 
   4295 //-------------------------------------------------------------------------------
   4296 //
   4297 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4298 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4299 //                  The input file for this test is re_tests, the standard regular
   4300 //                  expression test data distributed with the Perl source code.
   4301 //                  See PerlTests() for more information.
   4302 //
   4303 //-------------------------------------------------------------------------------
   4304 void RegexTest::PerlTestsUTF8() {
   4305     char tdd[2048];
   4306     const char *srcPath;
   4307     UErrorCode  status = U_ZERO_ERROR;
   4308     UParseError pe;
   4309     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4310     UText       patternText = UTEXT_INITIALIZER;
   4311     char       *patternChars = NULL;
   4312     int32_t     patternLength;
   4313     int32_t     patternCapacity = 0;
   4314     UText       inputText = UTEXT_INITIALIZER;
   4315     char       *inputChars = NULL;
   4316     int32_t     inputLength;
   4317     int32_t     inputCapacity = 0;
   4318 
   4319     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4320 
   4321     //
   4322     //  Open and read the test data file.
   4323     //
   4324     srcPath=getPath(tdd, "re_tests.txt");
   4325     if(srcPath==NULL) {
   4326         return; /* something went wrong, error already output */
   4327     }
   4328 
   4329     int32_t    len;
   4330     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4331     if (U_FAILURE(status)) {
   4332         return; /* something went wrong, error already output */
   4333     }
   4334 
   4335     //
   4336     //  Put the test data into a UnicodeString
   4337     //
   4338     UnicodeString testDataString(FALSE, testData, len);
   4339 
   4340     //
   4341     //  Regex to break the input file into lines, and strip the new lines.
   4342     //     One line per match, capture group one is the desired data.
   4343     //
   4344     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4345     if (U_FAILURE(status)) {
   4346         dataerrln("RegexPattern::compile() error");
   4347         return;
   4348     }
   4349     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4350 
   4351     //
   4352     //  Regex to split a test file line into fields.
   4353     //    There are six fields, separated by tabs.
   4354     //
   4355     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4356 
   4357     //
   4358     //  Regex to identify test patterns with flag settings, and to separate them.
   4359     //    Test patterns with flags look like 'pattern'i
   4360     //    Test patterns without flags are not quoted:   pattern
   4361     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4362     //
   4363     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4364     RegexMatcher* flagMat = flagPat->matcher(status);
   4365 
   4366     //
   4367     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4368     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4369     //   are string constants and REs for these constructs.
   4370     //
   4371     UnicodeString nulnulSrc("${nulnul}");
   4372     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4373     nulnul = nulnul.unescape();
   4374 
   4375     UnicodeString ffffSrc("${ffff}");
   4376     UnicodeString ffff("\\uffff", -1, US_INV);
   4377     ffff = ffff.unescape();
   4378 
   4379     //  regexp for $-[0], $+[2], etc.
   4380     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4381     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4382 
   4383     //  regexp for $0, $1, $2, etc.
   4384     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4385     RegexMatcher *cgMat = cgPat->matcher(status);
   4386 
   4387 
   4388     //
   4389     // Main Loop for the Perl Tests, runs once per line from the
   4390     //   test data file.
   4391     //
   4392     int32_t  lineNum = 0;
   4393     int32_t  skippedUnimplementedCount = 0;
   4394     while (lineMat->find()) {
   4395         lineNum++;
   4396 
   4397         //
   4398         //  Get a line, break it into its fields, do the Perl
   4399         //    variable substitutions.
   4400         //
   4401         UnicodeString line = lineMat->group(1, status);
   4402         UnicodeString fields[7];
   4403         fieldPat->split(line, fields, 7, status);
   4404 
   4405         flagMat->reset(fields[0]);
   4406         flagMat->matches(status);
   4407         UnicodeString pattern  = flagMat->group(2, status);
   4408         pattern.findAndReplace("${bang}", "!");
   4409         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4410         pattern.findAndReplace(ffffSrc, ffff);
   4411 
   4412         //
   4413         //  Identify patterns that include match flag settings,
   4414         //    split off the flags, remove the extra quotes.
   4415         //
   4416         UnicodeString flagStr = flagMat->group(3, status);
   4417         if (U_FAILURE(status)) {
   4418             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4419             return;
   4420         }
   4421         int32_t flags = 0;
   4422         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4423         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4424         const UChar UChar_m = 0x6d;
   4425         const UChar UChar_x = 0x78;
   4426         const UChar UChar_y = 0x79;
   4427         if (flagStr.indexOf(UChar_i) != -1) {
   4428             flags |= UREGEX_CASE_INSENSITIVE;
   4429         }
   4430         if (flagStr.indexOf(UChar_m) != -1) {
   4431             flags |= UREGEX_MULTILINE;
   4432         }
   4433         if (flagStr.indexOf(UChar_x) != -1) {
   4434             flags |= UREGEX_COMMENTS;
   4435         }
   4436 
   4437         //
   4438         // Put the pattern in a UTF-8 UText
   4439         //
   4440         status = U_ZERO_ERROR;
   4441         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4442         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4443             status = U_ZERO_ERROR;
   4444             delete[] patternChars;
   4445             patternCapacity = patternLength + 1;
   4446             patternChars = new char[patternCapacity];
   4447             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4448         }
   4449         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4450 
   4451         //
   4452         // Compile the test pattern.
   4453         //
   4454         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4455         if (status == U_REGEX_UNIMPLEMENTED) {
   4456             //
   4457             // Test of a feature that is planned for ICU, but not yet implemented.
   4458             //   skip the test.
   4459             skippedUnimplementedCount++;
   4460             delete testPat;
   4461             status = U_ZERO_ERROR;
   4462             continue;
   4463         }
   4464 
   4465         if (U_FAILURE(status)) {
   4466             // Some tests are supposed to generate errors.
   4467             //   Only report an error for tests that are supposed to succeed.
   4468             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4469                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4470             {
   4471                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4472             }
   4473             status = U_ZERO_ERROR;
   4474             delete testPat;
   4475             continue;
   4476         }
   4477 
   4478         if (fields[2].indexOf(UChar_i) >= 0) {
   4479             // ICU should skip this test.
   4480             delete testPat;
   4481             continue;
   4482         }
   4483 
   4484         if (fields[2].indexOf(UChar_c) >= 0) {
   4485             // This pattern should have caused a compilation error, but didn't/
   4486             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4487             delete testPat;
   4488             continue;
   4489         }
   4490 
   4491 
   4492         //
   4493         // replace the Perl variables that appear in some of the
   4494         //   match data strings.
   4495         //
   4496         UnicodeString matchString = fields[1];
   4497         matchString.findAndReplace(nulnulSrc, nulnul);
   4498         matchString.findAndReplace(ffffSrc,   ffff);
   4499 
   4500         // Replace any \n in the match string with an actual new-line char.
   4501         //  Don't do full unescape, as this unescapes more than Perl does, which
   4502         //  causes other spurious failures in the tests.
   4503         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4504 
   4505         //
   4506         // Put the input in a UTF-8 UText
   4507         //
   4508         status = U_ZERO_ERROR;
   4509         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4510         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4511             status = U_ZERO_ERROR;
   4512             delete[] inputChars;
   4513             inputCapacity = inputLength + 1;
   4514             inputChars = new char[inputCapacity];
   4515             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4516         }
   4517         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4518 
   4519         //
   4520         // Run the test, check for expected match/don't match result.
   4521         //
   4522         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4523         UBool found = testMat->find();
   4524         UBool expected = FALSE;
   4525         if (fields[2].indexOf(UChar_y) >=0) {
   4526             expected = TRUE;
   4527         }
   4528         if (expected != found) {
   4529             errln("line %d: Expected %smatch, got %smatch",
   4530                 lineNum, expected?"":"no ", found?"":"no " );
   4531             continue;
   4532         }
   4533 
   4534         // Don't try to check expected results if there is no match.
   4535         //   (Some have stuff in the expected fields)
   4536         if (!found) {
   4537             delete testMat;
   4538             delete testPat;
   4539             continue;
   4540         }
   4541 
   4542         //
   4543         // Interpret the Perl expression from the fourth field of the data file,
   4544         // building up an ICU string from the results of the ICU match.
   4545         //   The Perl expression will contain references to the results of
   4546         //     a regex match, including the matched string, capture group strings,
   4547         //     group starting and ending indicies, etc.
   4548         //
   4549         UnicodeString resultString;
   4550         UnicodeString perlExpr = fields[3];
   4551 
   4552         while (perlExpr.length() > 0) {
   4553             groupsMat->reset(perlExpr);
   4554             cgMat->reset(perlExpr);
   4555 
   4556             if (perlExpr.startsWith("$&")) {
   4557                 resultString.append(testMat->group(status));
   4558                 perlExpr.remove(0, 2);
   4559             }
   4560 
   4561             else if (groupsMat->lookingAt(status)) {
   4562                 // $-[0]   $+[2]  etc.
   4563                 UnicodeString digitString = groupsMat->group(2, status);
   4564                 int32_t t = 0;
   4565                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4566                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4567                 int32_t matchPosition;
   4568                 if (plusOrMinus.compare("+") == 0) {
   4569                     matchPosition = testMat->end(groupNum, status);
   4570                 } else {
   4571                     matchPosition = testMat->start(groupNum, status);
   4572                 }
   4573                 if (matchPosition != -1) {
   4574                     ICU_Utility::appendNumber(resultString, matchPosition);
   4575                 }
   4576                 perlExpr.remove(0, groupsMat->end(status));
   4577             }
   4578 
   4579             else if (cgMat->lookingAt(status)) {
   4580                 // $1, $2, $3, etc.
   4581                 UnicodeString digitString = cgMat->group(1, status);
   4582                 int32_t t = 0;
   4583                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4584                 if (U_SUCCESS(status)) {
   4585                     resultString.append(testMat->group(groupNum, status));
   4586                     status = U_ZERO_ERROR;
   4587                 }
   4588                 perlExpr.remove(0, cgMat->end(status));
   4589             }
   4590 
   4591             else if (perlExpr.startsWith("@-")) {
   4592                 int32_t i;
   4593                 for (i=0; i<=testMat->groupCount(); i++) {
   4594                     if (i>0) {
   4595                         resultString.append(" ");
   4596                     }
   4597                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4598                 }
   4599                 perlExpr.remove(0, 2);
   4600             }
   4601 
   4602             else if (perlExpr.startsWith("@+")) {
   4603                 int32_t i;
   4604                 for (i=0; i<=testMat->groupCount(); i++) {
   4605                     if (i>0) {
   4606                         resultString.append(" ");
   4607                     }
   4608                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4609                 }
   4610                 perlExpr.remove(0, 2);
   4611             }
   4612 
   4613             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4614                                                      //           or as an escaped sequence (e.g. \n)
   4615                 if (perlExpr.length() > 1) {
   4616                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4617                 }
   4618                 UChar c = perlExpr.charAt(0);
   4619                 switch (c) {
   4620                 case 'n':   c = '\n'; break;
   4621                 // add any other escape sequences that show up in the test expected results.
   4622                 }
   4623                 resultString.append(c);
   4624                 perlExpr.remove(0, 1);
   4625             }
   4626 
   4627             else  {
   4628                 // Any characters from the perl expression that we don't explicitly
   4629                 //  recognize before here are assumed to be literals and copied
   4630                 //  as-is to the expected results.
   4631                 resultString.append(perlExpr.charAt(0));
   4632                 perlExpr.remove(0, 1);
   4633             }
   4634 
   4635             if (U_FAILURE(status)) {
   4636                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4637                 break;
   4638             }
   4639         }
   4640 
   4641         //
   4642         // Expected Results Compare
   4643         //
   4644         UnicodeString expectedS(fields[4]);
   4645         expectedS.findAndReplace(nulnulSrc, nulnul);
   4646         expectedS.findAndReplace(ffffSrc,   ffff);
   4647         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4648 
   4649 
   4650         if (expectedS.compare(resultString) != 0) {
   4651             err("Line %d: Incorrect perl expression results.", lineNum);
   4652             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4653         }
   4654 
   4655         delete testMat;
   4656         delete testPat;
   4657     }
   4658 
   4659     //
   4660     // All done.  Clean up allocated stuff.
   4661     //
   4662     delete cgMat;
   4663     delete cgPat;
   4664 
   4665     delete groupsMat;
   4666     delete groupsPat;
   4667 
   4668     delete flagMat;
   4669     delete flagPat;
   4670 
   4671     delete lineMat;
   4672     delete linePat;
   4673 
   4674     delete fieldPat;
   4675     delete [] testData;
   4676 
   4677     utext_close(&patternText);
   4678     utext_close(&inputText);
   4679 
   4680     delete [] patternChars;
   4681     delete [] inputChars;
   4682 
   4683 
   4684     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4685 
   4686 }
   4687 
   4688 
//--------------------------------------------------------------
//
//  Bug6149   Verify limits to heap expansion for backtrack stack.
//             Use this pattern,
//                 "(a?){1,8000000}"
//             Note: the pattern used to have an unbounded upper bound, but unbounded
//                   loops now have loop-breaking enabled.
//                   This test is likely to be fragile, as further optimizations stop
//                   more cases of pointless looping in the match engine.
//
//---------------------------------------------------------------
   4699 void RegexTest::Bug6149() {
   4700     UnicodeString pattern("(a?){1,8000000}");
   4701     UnicodeString s("xyz");
   4702     uint32_t flags = 0;
   4703     UErrorCode status = U_ZERO_ERROR;
   4704 
   4705     RegexMatcher  matcher(pattern, s, flags, status);
   4706     UBool result = false;
   4707     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4708     REGEX_ASSERT(result == FALSE);
   4709  }
   4710 
   4711 
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
   4718 
   4719 struct callBackContext {
   4720     RegexTest        *test;
   4721     int32_t          maxCalls;
   4722     int32_t          numCalls;
   4723     int32_t          lastSteps;
   4724     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4725 };
   4726 
   4727 U_CDECL_BEGIN
   4728 static UBool U_CALLCONV
   4729 testCallBackFn(const void *context, int32_t steps) {
   4730     callBackContext  *info = (callBackContext *)context;
   4731     if (info->lastSteps+1 != steps) {
   4732         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4733     }
   4734     info->lastSteps = steps;
   4735     info->numCalls++;
   4736     return (info->numCalls < info->maxCalls);
   4737 }
   4738 U_CDECL_END
   4739 
   4740 void RegexTest::Callbacks() {
   4741    {
   4742         // Getter returns NULLs if no callback has been set
   4743 
   4744         //   The variables that the getter will fill in.
   4745         //   Init to non-null values so that the action of the getter can be seen.
   4746         const void          *returnedContext = &returnedContext;
   4747         URegexMatchCallback *returnedFn = &testCallBackFn;
   4748 
   4749         UErrorCode status = U_ZERO_ERROR;
   4750         RegexMatcher matcher("x", 0, status);
   4751         REGEX_CHECK_STATUS;
   4752         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4753         REGEX_CHECK_STATUS;
   4754         REGEX_ASSERT(returnedFn == NULL);
   4755         REGEX_ASSERT(returnedContext == NULL);
   4756     }
   4757 
   4758    {
   4759         // Set and Get work
   4760         callBackContext cbInfo = {this, 0, 0, 0};
   4761         const void          *returnedContext;
   4762         URegexMatchCallback *returnedFn;
   4763         UErrorCode status = U_ZERO_ERROR;
   4764         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4765         REGEX_CHECK_STATUS;
   4766         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4767         REGEX_CHECK_STATUS;
   4768         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4769         REGEX_CHECK_STATUS;
   4770         REGEX_ASSERT(returnedFn == testCallBackFn);
   4771         REGEX_ASSERT(returnedContext == &cbInfo);
   4772 
   4773         // A short-running match shouldn't invoke the callback
   4774         status = U_ZERO_ERROR;
   4775         cbInfo.reset(1);
   4776         UnicodeString s = "xxx";
   4777         matcher.reset(s);
   4778         REGEX_ASSERT(matcher.matches(status));
   4779         REGEX_CHECK_STATUS;
   4780         REGEX_ASSERT(cbInfo.numCalls == 0);
   4781 
   4782         // A medium-length match that runs long enough to invoke the
   4783         //   callback, but not so long that the callback aborts it.
   4784         status = U_ZERO_ERROR;
   4785         cbInfo.reset(4);
   4786         s = "aaaaaaaaaaaaaaaaaaab";
   4787         matcher.reset(s);
   4788         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4789         REGEX_CHECK_STATUS;
   4790         REGEX_ASSERT(cbInfo.numCalls > 0);
   4791 
   4792         // A longer running match that the callback function will abort.
   4793         status = U_ZERO_ERROR;
   4794         cbInfo.reset(4);
   4795         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4796         matcher.reset(s);
   4797         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4798         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4799         REGEX_ASSERT(cbInfo.numCalls == 4);
   4800     }
   4801 
   4802 
   4803 }
   4804 
   4805 
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
   4812 
   4813 struct progressCallBackContext {
   4814     RegexTest        *test;
   4815     int64_t          lastIndex;
   4816     int32_t          maxCalls;
   4817     int32_t          numCalls;
   4818     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4819 };
   4820 
   4821 U_CDECL_BEGIN
   4822 static UBool U_CALLCONV
   4823 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4824     progressCallBackContext  *info = (progressCallBackContext *)context;
   4825     info->numCalls++;
   4826     info->lastIndex = matchIndex;
   4827 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4828     return (info->numCalls < info->maxCalls);
   4829 }
   4830 U_CDECL_END
   4831 
   4832 void RegexTest::FindProgressCallbacks() {
   4833    {
   4834         // Getter returns NULLs if no callback has been set
   4835 
   4836         //   The variables that the getter will fill in.
   4837         //   Init to non-null values so that the action of the getter can be seen.
   4838         const void                  *returnedContext = &returnedContext;
   4839         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4840 
   4841         UErrorCode status = U_ZERO_ERROR;
   4842         RegexMatcher matcher("x", 0, status);
   4843         REGEX_CHECK_STATUS;
   4844         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4845         REGEX_CHECK_STATUS;
   4846         REGEX_ASSERT(returnedFn == NULL);
   4847         REGEX_ASSERT(returnedContext == NULL);
   4848     }
   4849 
   4850    {
   4851         // Set and Get work
   4852         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4853         const void                  *returnedContext;
   4854         URegexFindProgressCallback  *returnedFn;
   4855         UErrorCode status = U_ZERO_ERROR;
   4856         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4857         REGEX_CHECK_STATUS;
   4858         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4859         REGEX_CHECK_STATUS;
   4860         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4861         REGEX_CHECK_STATUS;
   4862         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4863         REGEX_ASSERT(returnedContext == &cbInfo);
   4864 
   4865         // A short-running match should NOT invoke the callback.
   4866         status = U_ZERO_ERROR;
   4867         cbInfo.reset(100);
   4868         UnicodeString s = "abxxx";
   4869         matcher.reset(s);
   4870 #if 0
   4871         matcher.setTrace(TRUE);
   4872 #endif
   4873         REGEX_ASSERT(matcher.find(0, status));
   4874         REGEX_CHECK_STATUS;
   4875         REGEX_ASSERT(cbInfo.numCalls == 0);
   4876 
   4877         // A medium running match that causes matcher.find() to invoke our callback for each index.
   4878         status = U_ZERO_ERROR;
   4879         s = "aaaaaaaaaaaaaaaaaaab";
   4880         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4881         matcher.reset(s);
   4882         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4883         REGEX_CHECK_STATUS;
   4884         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4885 
   4886         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4887         status = U_ZERO_ERROR;
   4888         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4889         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4890         matcher.reset(s1);
   4891         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4892         REGEX_CHECK_STATUS;
   4893         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4894 
   4895 #if 0
   4896         // Now a match that will succeed, but after an interruption
   4897         status = U_ZERO_ERROR;
   4898         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4899         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4900         matcher.reset(s2);
   4901         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4902         REGEX_CHECK_STATUS;
   4903         // Now retry the match from where left off
   4904         cbInfo.maxCalls = 100; //  No callback limit
   4905         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   4906         REGEX_CHECK_STATUS;
   4907 #endif
   4908     }
   4909 
   4910 
   4911 }
   4912 
   4913 
   4914 //---------------------------------------------------------------------------
   4915 //
   4916 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4917 //                             UTexts. The pure-C implementation of UText
   4918 //                             has no mutable backing stores, but we can
   4919 //                             use UnicodeString here to test the functionality.
   4920 //
   4921 //---------------------------------------------------------------------------
   4922 void RegexTest::PreAllocatedUTextCAPI () {
   4923     UErrorCode           status = U_ZERO_ERROR;
   4924     URegularExpression  *re;
   4925     UText                patternText = UTEXT_INITIALIZER;
   4926     UnicodeString        buffer;
   4927     UText                bufferText = UTEXT_INITIALIZER;
   4928 
   4929     utext_openUnicodeString(&bufferText, &buffer, &status);
   4930 
   4931     /*
   4932      *  getText() and getUText()
   4933      */
   4934     {
   4935         UText  text1 = UTEXT_INITIALIZER;
   4936         UText  text2 = UTEXT_INITIALIZER;
   4937         UChar  text2Chars[20];
   4938         UText  *resultText;
   4939 
   4940         status = U_ZERO_ERROR;
   4941         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   4942         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   4943         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4944         utext_openUChars(&text2, text2Chars, -1, &status);
   4945 
   4946         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   4947         re = uregex_openUText(&patternText, 0, NULL, &status);
   4948 
   4949         /* First set a UText */
   4950         uregex_setUText(re, &text1, &status);
   4951         resultText = uregex_getUText(re, &bufferText, &status);
   4952         REGEX_CHECK_STATUS;
   4953         REGEX_ASSERT(resultText == &bufferText);
   4954         utext_setNativeIndex(resultText, 0);
   4955         utext_setNativeIndex(&text1, 0);
   4956         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4957 
   4958         resultText = uregex_getUText(re, &bufferText, &status);
   4959         REGEX_CHECK_STATUS;
   4960         REGEX_ASSERT(resultText == &bufferText);
   4961         utext_setNativeIndex(resultText, 0);
   4962         utext_setNativeIndex(&text1, 0);
   4963         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   4964 
   4965         /* Then set a UChar * */
   4966         uregex_setText(re, text2Chars, 7, &status);
   4967         resultText = uregex_getUText(re, &bufferText, &status);
   4968         REGEX_CHECK_STATUS;
   4969         REGEX_ASSERT(resultText == &bufferText);
   4970         utext_setNativeIndex(resultText, 0);
   4971         utext_setNativeIndex(&text2, 0);
   4972         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   4973 
   4974         uregex_close(re);
   4975         utext_close(&text1);
   4976         utext_close(&text2);
   4977     }
   4978 
   4979     /*
   4980      *  group()
   4981      */
   4982     {
   4983         UChar    text1[80];
   4984         UText   *actual;
   4985         UBool    result;
   4986         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
   4987 
   4988         status = U_ZERO_ERROR;
   4989         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   4990         REGEX_CHECK_STATUS;
   4991 
   4992         uregex_setText(re, text1, -1, &status);
   4993         result = uregex_find(re, 0, &status);
   4994         REGEX_ASSERT(result==TRUE);
   4995 
   4996         /*  Capture Group 0, the full match.  Should succeed.  */
   4997         status = U_ZERO_ERROR;
   4998         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
   4999         REGEX_CHECK_STATUS;
   5000         REGEX_ASSERT(actual == &bufferText);
   5001         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
   5002 
   5003         /*  Capture group #1.  Should succeed. */
   5004         status = U_ZERO_ERROR;
   5005         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
   5006         REGEX_CHECK_STATUS;
   5007         REGEX_ASSERT(actual == &bufferText);
   5008         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
   5009 
   5010         /*  Capture group out of range.  Error. */
   5011         status = U_ZERO_ERROR;
   5012         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
   5013         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5014         REGEX_ASSERT(actual == &bufferText);
   5015 
   5016         uregex_close(re);
   5017 
   5018     }
   5019 
   5020     /*
   5021      *  replaceFirst()
   5022      */
   5023     {
   5024         UChar    text1[80];
   5025         UChar    text2[80];
   5026         UText    replText = UTEXT_INITIALIZER;
   5027         UText   *result;
   5028 
   5029         status = U_ZERO_ERROR;
   5030         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5031         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5032         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5033 
   5034         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5035         REGEX_CHECK_STATUS;
   5036 
   5037         /*  Normal case, with match */
   5038         uregex_setText(re, text1, -1, &status);
   5039         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5040         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5041         REGEX_CHECK_STATUS;
   5042         REGEX_ASSERT(result == &bufferText);
   5043         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5044 
   5045         /* No match.  Text should copy to output with no changes.  */
   5046         uregex_setText(re, text2, -1, &status);
   5047         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5048         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5049         REGEX_CHECK_STATUS;
   5050         REGEX_ASSERT(result == &bufferText);
   5051         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5052 
   5053         /* Unicode escapes */
   5054         uregex_setText(re, text1, -1, &status);
   5055         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
   5056         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5057         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5058         REGEX_CHECK_STATUS;
   5059         REGEX_ASSERT(result == &bufferText);
   5060         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5061 
   5062         uregex_close(re);
   5063         utext_close(&replText);
   5064     }
   5065 
   5066 
   5067     /*
   5068      *  replaceAll()
   5069      */
   5070     {
   5071         UChar    text1[80];
   5072         UChar    text2[80];
   5073         UText    replText = UTEXT_INITIALIZER;
   5074         UText   *result;
   5075 
   5076         status = U_ZERO_ERROR;
   5077         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5078         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5079         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5080 
   5081         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5082         REGEX_CHECK_STATUS;
   5083 
   5084         /*  Normal case, with match */
   5085         uregex_setText(re, text1, -1, &status);
   5086         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5087         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5088         REGEX_CHECK_STATUS;
   5089         REGEX_ASSERT(result == &bufferText);
   5090         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5091 
   5092         /* No match.  Text should copy to output with no changes.  */
   5093         uregex_setText(re, text2, -1, &status);
   5094         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5095         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5096         REGEX_CHECK_STATUS;
   5097         REGEX_ASSERT(result == &bufferText);
   5098         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5099 
   5100         uregex_close(re);
   5101         utext_close(&replText);
   5102     }
   5103 
   5104 
   5105     /*
   5106      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5107      *   so we don't need to test it here.
   5108      */
   5109 
   5110     utext_close(&bufferText);
   5111     utext_close(&patternText);
   5112 }
   5113 
   5114 //--------------------------------------------------------------
   5115 //
   5116 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5117 //
   5118 //---------------------------------------------------------------
   5119 void RegexTest::Bug7651() {
   5120     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5121     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5122     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5123     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5124     UnicodeString s("#ff @abcd This is test");
   5125     RegexPattern  *REPattern = NULL;
   5126     RegexMatcher  *REMatcher = NULL;
   5127     UErrorCode status = U_ZERO_ERROR;
   5128     UParseError pe;
   5129 
   5130     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5131     REGEX_CHECK_STATUS;
   5132     REMatcher = REPattern->matcher(s, status);
   5133     REGEX_CHECK_STATUS;
   5134     REGEX_ASSERT(REMatcher->find());
   5135     REGEX_ASSERT(REMatcher->start(status) == 0);
   5136     delete REPattern;
   5137     delete REMatcher;
   5138     status = U_ZERO_ERROR;
   5139 
   5140     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5141     REGEX_CHECK_STATUS;
   5142     REMatcher = REPattern->matcher(s, status);
   5143     REGEX_CHECK_STATUS;
   5144     REGEX_ASSERT(REMatcher->find());
   5145     REGEX_ASSERT(REMatcher->start(status) == 0);
   5146     delete REPattern;
   5147     delete REMatcher;
   5148     status = U_ZERO_ERROR;
   5149  }
   5150 
   5151 void RegexTest::Bug7740() {
   5152     UErrorCode status = U_ZERO_ERROR;
   5153     UnicodeString pattern = "(a)";
   5154     UnicodeString text = "abcdef";
   5155     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5156     REGEX_CHECK_STATUS;
   5157     REGEX_ASSERT(m->lookingAt(status));
   5158     REGEX_CHECK_STATUS;
   5159     status = U_ILLEGAL_ARGUMENT_ERROR;
   5160     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5161     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5162     REGEX_ASSERT(s == "");
   5163     delete m;
   5164 }
   5165 
   5166 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5167 
   5168 void RegexTest::Bug8479() {
   5169     UErrorCode status = U_ZERO_ERROR;
   5170 
   5171     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5172     REGEX_CHECK_STATUS;
   5173     if (U_SUCCESS(status))
   5174     {
   5175         UnicodeString str;
   5176         str.setToBogus();
   5177         pMatcher->reset(str);
   5178         status = U_ZERO_ERROR;
   5179         pMatcher->matches(status);
   5180         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5181         delete pMatcher;
   5182     }
   5183 }
   5184 
   5185 
   5186 // Bug 7029
   5187 void RegexTest::Bug7029() {
   5188     UErrorCode status = U_ZERO_ERROR;
   5189 
   5190     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5191     UnicodeString text = "abc.def";
   5192     UnicodeString splits[10];
   5193     REGEX_CHECK_STATUS;
   5194     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5195     REGEX_CHECK_STATUS;
   5196     REGEX_ASSERT(numFields == 8);
   5197     delete pMatcher;
   5198 }
   5199 
   5200 // Bug 9283
   5201 //   This test is checking for the existance of any supplemental characters that case-fold
   5202 //   to a bmp character.
   5203 //
   5204 //   At the time of this writing there are none. If any should appear in a subsequent release
   5205 //   of Unicode, the code in regular expressions compilation that determines the longest
   5206 //   posssible match for a literal string  will need to be enhanced.
   5207 //
   5208 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5209 //   for details on what to do in case of a failure of this test.
   5210 //
   5211 void RegexTest::Bug9283() {
   5212 #if !UCONFIG_NO_NORMALIZATION
   5213     UErrorCode status = U_ZERO_ERROR;
   5214     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5215     REGEX_CHECK_STATUS;
   5216     int32_t index;
   5217     UChar32 c;
   5218     for (index=0; ; index++) {
   5219         c = supplementalsWithCaseFolding.charAt(index);
   5220         if (c == -1) {
   5221             break;
   5222         }
   5223         UnicodeString cf = UnicodeString(c).foldCase();
   5224         REGEX_ASSERT(cf.length() >= 2);
   5225     }
   5226 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   5227 }
   5228 
   5229 
   5230 void RegexTest::CheckInvBufSize() {
   5231   if(inv_next>=INV_BUFSIZ) {
   5232     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5233           __FILE__, INV_BUFSIZ, inv_next);
   5234   } else {
   5235     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5236   }
   5237 }
   5238 
   5239 
   5240 void RegexTest::Bug10459() {
   5241     UErrorCode status = U_ZERO_ERROR;
   5242     UnicodeString patternString("(txt)");
   5243     UnicodeString txtString("txt");
   5244 
   5245     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
   5246     REGEX_CHECK_STATUS;
   5247     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
   5248     REGEX_CHECK_STATUS;
   5249 
   5250     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
   5251     REGEX_CHECK_STATUS;
   5252 
   5253     uregex_setUText(icu_re, utext_txt, &status);
   5254     REGEX_CHECK_STATUS;
   5255 
   5256     // The bug was that calling uregex_group() before doing a matching operation
   5257     //   was causing a segfault. Only for Regular Expressions created from UText.
   5258     //   It should set an U_REGEX_INVALID_STATE.
   5259 
   5260     UChar buf[100];
   5261     int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status);
   5262     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
   5263     REGEX_ASSERT(len == 0);
   5264 
   5265     uregex_close(icu_re);
   5266     utext_close(utext_pat);
   5267     utext_close(utext_txt);
   5268 }
   5269 
   5270 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5271 
   5272