Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 #include "intltest.h"
     14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     15 
     16 #include "unicode/regex.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/ucnv.h"
     19 #include "unicode/ustring.h"
     20 #include "regextst.h"
     21 #include "uvector.h"
     22 #include "util.h"
     23 #include <stdlib.h>
     24 #include <string.h>
     25 #include <stdio.h>
     26 
     27 #define SUPPORT_MUTATING_INPUT_STRING   0
     28 
     29 
     30 //---------------------------------------------------------------------------
     31 //
     32 //  Test class boilerplate
     33 //
     34 //---------------------------------------------------------------------------
     35 RegexTest::RegexTest()
     36 {
     37 }
     38 
     39 
     40 RegexTest::~RegexTest()
     41 {
     42 }
     43 
     44 
     45 
     46 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     47 {
     48     if (exec) logln("TestSuite RegexTest: ");
     49     switch (index) {
     50 
     51         case 0: name = "Basic";
     52             if (exec) Basic();
     53             break;
     54         case 1: name = "API_Match";
     55             if (exec) API_Match();
     56             break;
     57         case 2: name = "API_Replace";
     58             if (exec) API_Replace();
     59             break;
     60         case 3: name = "API_Pattern";
     61             if (exec) API_Pattern();
     62             break;
     63         case 4:
     64 #if !UCONFIG_NO_FILE_IO
     65             name = "Extended";
     66             if (exec) Extended();
     67 #else
     68             name = "skip";
     69 #endif
     70             break;
     71         case 5: name = "Errors";
     72             if (exec) Errors();
     73             break;
     74         case 6: name = "PerlTests";
     75             if (exec) PerlTests();
     76             break;
     77         case 7: name = "Callbacks";
     78             if (exec) Callbacks();
     79             break;
     80         case 8: name = "Bug 6149";
     81              if (exec) Bug6149();
     82              break;
     83         case 9: name = "UTextBasic";
     84           if (exec) UTextBasic();
     85           break;
     86         case 10: name = "API_Match_UTF8";
     87           if (exec) API_Match_UTF8();
     88           break;
     89         case 11: name = "API_Replace_UTF8";
     90           if (exec) API_Replace_UTF8();
     91           break;
     92         case 12: name = "API_Pattern_UTF8";
     93           if (exec) API_Pattern_UTF8();
     94           break;
     95         case 13: name = "PerlTestsUTF8";
     96           if (exec) PerlTestsUTF8();
     97           break;
     98         case 14: name = "PreAllocatedUTextCAPI";
     99           if (exec) PreAllocatedUTextCAPI();
    100           break;
    101         case 15: name = "Bug 7651";
    102           if (exec) Bug7651();
    103           break;
    104 
    105         default: name = "";
    106             break; //needed to end loop
    107     }
    108 }
    109 
    110 
    111 //---------------------------------------------------------------------------
    112 //
    113 //   Error Checking / Reporting macros used in all of the tests.
    114 //
    115 //---------------------------------------------------------------------------
    116 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("RegexTest failure at line %d.  status=%s", \
    117 __LINE__, u_errorName(status)); return;}}
    118 
    119 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
    120 
    121 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    122 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    123     __LINE__, u_errorName(errcode), u_errorName(status));};}
    124 
    125 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    126     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
    127 
    128 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    129     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
    130 
    131 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    132     UErrorCode status = U_ZERO_ERROR;
    133     UText expectedText = UTEXT_INITIALIZER;
    134     utext_openUTF8(&expectedText, expected, -1, &status);
    135     utext_setNativeIndex(actual, 0);
    136     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
    137         char buf[201 /*21*/];
    138         char *bufPtr = buf;
    139         UChar32 c = utext_next32From(actual, 0);
    140         while (c != U_SENTINEL && bufPtr < buf+200/*20*/) {
    141             if (0x20<c && c<0x7e) {
    142                 *bufPtr = c;
    143             } else {
    144                 *bufPtr = '.';
    145             }
    146             bufPtr++;
    147             c = UTEXT_NEXT32(actual);
    148         }
    149         *bufPtr = 0;
    150 
    151         errln("Failure at file %s, line %d, expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expected, utext_nativeLength(&expectedText), buf, utext_nativeLength(actual));
    152     }
    153     utext_close(&expectedText);
    154 }
    155 
    156 #define REGEX_ASSERT_UTEXT(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
    157 
    158 
    159 //---------------------------------------------------------------------------
    160 //
    161 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    162 //                       for the LookingAt() and  Match() functions.
    163 //
    164 //       usage:
    165 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    166 //
    167 //          The expected results are UBool - TRUE or FALSE.
    168 //          The input text is unescaped.  The pattern is not.
    169 //
    170 //
    171 //---------------------------------------------------------------------------
    172 
    173 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    174 
    175 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    176     const UnicodeString pattern(pat, -1, US_INV);
    177     const UnicodeString inputText(text, -1, US_INV);
    178     UErrorCode          status  = U_ZERO_ERROR;
    179     UParseError         pe;
    180     RegexPattern        *REPattern = NULL;
    181     RegexMatcher        *REMatcher = NULL;
    182     UBool               retVal     = TRUE;
    183 
    184     UnicodeString patString(pat, -1, US_INV);
    185     REPattern = RegexPattern::compile(patString, 0, pe, status);
    186     if (U_FAILURE(status)) {
    187         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    188             line, u_errorName(status));
    189         return FALSE;
    190     }
    191     if (line==376) { RegexPatternDump(REPattern);}
    192 
    193     UnicodeString inputString(inputText);
    194     UnicodeString unEscapedInput = inputString.unescape();
    195     REMatcher = REPattern->matcher(unEscapedInput, status);
    196     if (U_FAILURE(status)) {
    197         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    198             line, u_errorName(status));
    199         return FALSE;
    200     }
    201 
    202     UBool actualmatch;
    203     actualmatch = REMatcher->lookingAt(status);
    204     if (U_FAILURE(status)) {
    205         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    206             line, u_errorName(status));
    207         retVal =  FALSE;
    208     }
    209     if (actualmatch != looking) {
    210         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    211         retVal = FALSE;
    212     }
    213 
    214     status = U_ZERO_ERROR;
    215     actualmatch = REMatcher->matches(status);
    216     if (U_FAILURE(status)) {
    217         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    218             line, u_errorName(status));
    219         retVal = FALSE;
    220     }
    221     if (actualmatch != match) {
    222         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    223         retVal = FALSE;
    224     }
    225 
    226     if (retVal == FALSE) {
    227         RegexPatternDump(REPattern);
    228     }
    229 
    230     delete REPattern;
    231     delete REMatcher;
    232     return retVal;
    233 }
    234 
    235 
    236 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    237     UText               pattern    = UTEXT_INITIALIZER;
    238     int32_t             inputUTF8Length;
    239     char                *textChars = NULL;
    240     UText               inputText  = UTEXT_INITIALIZER;
    241     UErrorCode          status     = U_ZERO_ERROR;
    242     UParseError         pe;
    243     RegexPattern        *REPattern = NULL;
    244     RegexMatcher        *REMatcher = NULL;
    245     UBool               retVal     = TRUE;
    246 
    247     utext_openUTF8(&pattern, pat, -1, &status);
    248     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    249     if (U_FAILURE(status)) {
    250         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    251             line, u_errorName(status));
    252         return FALSE;
    253     }
    254 
    255     UnicodeString inputString(text, -1, US_INV);
    256     UnicodeString unEscapedInput = inputString.unescape();
    257     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    258     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    259 
    260     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    261     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    262         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    263         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    264         return TRUE; // not a failure of the Regex engine
    265     }
    266     status = U_ZERO_ERROR; // buffer overflow
    267     textChars = new char[inputUTF8Length+1];
    268     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    269     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    270 
    271     REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
    272     if (U_FAILURE(status)) {
    273         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    274             line, u_errorName(status));
    275         return FALSE;
    276     }
    277 
    278     UBool actualmatch;
    279     actualmatch = REMatcher->lookingAt(status);
    280     if (U_FAILURE(status)) {
    281         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    282             line, u_errorName(status));
    283         retVal =  FALSE;
    284     }
    285     if (actualmatch != looking) {
    286         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    287         retVal = FALSE;
    288     }
    289 
    290     status = U_ZERO_ERROR;
    291     actualmatch = REMatcher->matches(status);
    292     if (U_FAILURE(status)) {
    293         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    294             line, u_errorName(status));
    295         retVal = FALSE;
    296     }
    297     if (actualmatch != match) {
    298         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    299         retVal = FALSE;
    300     }
    301 
    302     if (retVal == FALSE) {
    303         RegexPatternDump(REPattern);
    304     }
    305 
    306     delete REPattern;
    307     delete REMatcher;
    308     utext_close(&inputText);
    309     utext_close(&pattern);
    310     delete[] textChars;
    311     return retVal;
    312 }
    313 
    314 
    315 
    316 //---------------------------------------------------------------------------
    317 //
    318 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    319 //                       regex tests for incorrect patterns
    320 //
    321 //       usage:
    322 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    323 //
    324 //---------------------------------------------------------------------------
    325 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    326 
    327 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    328                           UErrorCode expectedStatus, int32_t line) {
    329     UnicodeString       pattern(pat);
    330 
    331     UErrorCode          status         = U_ZERO_ERROR;
    332     UParseError         pe;
    333     RegexPattern        *callerPattern = NULL;
    334 
    335     //
    336     //  Compile the caller's pattern
    337     //
    338     UnicodeString patString(pat);
    339     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    340     if (status != expectedStatus) {
    341         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    342     } else {
    343         if (status != U_ZERO_ERROR) {
    344             if (pe.line != errLine || pe.offset != errCol) {
    345                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    346                     line, errLine, errCol, pe.line, pe.offset);
    347             }
    348         }
    349     }
    350 
    351     delete callerPattern;
    352 
    353     //
    354     //  Compile again, using a UTF-8-based UText
    355     //
    356     UText patternText = UTEXT_INITIALIZER;
    357     utext_openUTF8(&patternText, pat, -1, &status);
    358     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    359     if (status != expectedStatus) {
    360         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    361     } else {
    362         if (status != U_ZERO_ERROR) {
    363             if (pe.line != errLine || pe.offset != errCol) {
    364                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    365                     line, errLine, errCol, pe.line, pe.offset);
    366             }
    367         }
    368     }
    369 
    370     delete callerPattern;
    371     utext_close(&patternText);
    372 }
    373 
    374 
    375 
    376 //---------------------------------------------------------------------------
    377 //
    378 //      Basic      Check for basic functionality of regex pattern matching.
    379 //                 Avoid the use of REGEX_FIND test macro, which has
    380 //                 substantial dependencies on basic Regex functionality.
    381 //
    382 //---------------------------------------------------------------------------
    383 void RegexTest::Basic() {
    384 
    385 
    386 //
    387 // Debug - slide failing test cases early
    388 //
    389 #if 0
    390     {
    391         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    392         UParseError pe;
    393         UErrorCode  status = U_ZERO_ERROR;
    394         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
    395         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    396         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    397     }
    398     exit(1);
    399 #endif
    400 
    401 
    402     //
    403     // Pattern with parentheses
    404     //
    405     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    406     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    407     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    408 
    409     //
    410     // Patterns with *
    411     //
    412     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    413     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    414     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    415     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    416     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    417 
    418     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    419     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    420 
    421 
    422     //
    423     //  Patterns with "."
    424     //
    425     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    426     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    427     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    428     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    429     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    430     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    431     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    432     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    433 
    434     //
    435     //  Patterns with * applied to chars at end of literal string
    436     //
    437     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    438     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    439 
    440     //
    441     //  Supplemental chars match as single chars, not a pair of surrogates.
    442     //
    443     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    444     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    445     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    446 
    447 
    448     //
    449     //  UnicodeSets in the pattern
    450     //
    451     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    452     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    453     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    454     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    455     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    456     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    457 
    458     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    459     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    460     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    461     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    462     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    463 
    464     //
    465     //   OR operator in patterns
    466     //
    467     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    468     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    469     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    470     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    471 
    472     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    473     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    474     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    475     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    476     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    477     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    478 
    479     //
    480     //  +
    481     //
    482     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    483     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    484     REGEX_TESTLM("b+", "", FALSE, FALSE);
    485     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    486     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    487     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    488 
    489     //
    490     //   ?
    491     //
    492     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    493     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    494     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    495     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    496     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    497     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    498     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    499     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    500     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    501 
    502     //
    503     //  Escape sequences that become single literal chars, handled internally
    504     //   by ICU's Unescape.
    505     //
    506 
    507     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    508     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    509     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    510     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    511     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    512     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    513     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    514     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    515     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    516     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    517 
    518     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    519     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    520 
    521     // Escape of special chars in patterns
    522     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    523 }
    524 
    525 
    526 //---------------------------------------------------------------------------
    527 //
    528 //    UTextBasic   Check for quirks that are specific to the UText
    529 //                 implementation.
    530 //
    531 //---------------------------------------------------------------------------
    532 void RegexTest::UTextBasic() {
    533     UErrorCode status = U_ZERO_ERROR;
    534     UText pattern = UTEXT_INITIALIZER;
    535     utext_openUTF8(&pattern, "abc", -1, &status);
    536     RegexMatcher matcher(&pattern, 0, status);
    537     REGEX_CHECK_STATUS;
    538 
    539     UText input = UTEXT_INITIALIZER;
    540     utext_openUTF8(&input, "abc", -1, &status);
    541     REGEX_CHECK_STATUS;
    542     matcher.reset(&input);
    543     REGEX_CHECK_STATUS;
    544     REGEX_ASSERT_UTEXT("abc", matcher.inputText());
    545 
    546     matcher.reset(matcher.inputText());
    547     REGEX_CHECK_STATUS;
    548     REGEX_ASSERT_UTEXT("abc", matcher.inputText());
    549 
    550     utext_close(&pattern);
    551     utext_close(&input);
    552 }
    553 
    554 
    555 //---------------------------------------------------------------------------
    556 //
    557 //      API_Match   Test that the API for class RegexMatcher
    558 //                  is present and nominally working, but excluding functions
    559 //                  implementing replace operations.
    560 //
    561 //---------------------------------------------------------------------------
    562 void RegexTest::API_Match() {
    563     UParseError         pe;
    564     UErrorCode          status=U_ZERO_ERROR;
    565     int32_t             flags = 0;
    566 
    567     //
    568     // Debug - slide failing test cases early
    569     //
    570 #if 0
    571     {
    572     }
    573     return;
    574 #endif
    575 
    576     //
    577     // Simple pattern compilation
    578     //
    579     {
    580         UnicodeString       re("abc");
    581         RegexPattern        *pat2;
    582         pat2 = RegexPattern::compile(re, flags, pe, status);
    583         REGEX_CHECK_STATUS;
    584 
    585         UnicodeString inStr1 = "abcdef this is a test";
    586         UnicodeString instr2 = "not abc";
    587         UnicodeString empty  = "";
    588 
    589 
    590         //
    591         // Matcher creation and reset.
    592         //
    593         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    594         REGEX_CHECK_STATUS;
    595         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    596         REGEX_ASSERT(m1->input() == inStr1);
    597         m1->reset(instr2);
    598         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    599         REGEX_ASSERT(m1->input() == instr2);
    600         m1->reset(inStr1);
    601         REGEX_ASSERT(m1->input() == inStr1);
    602         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    603         m1->reset(empty);
    604         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    605         REGEX_ASSERT(m1->input() == empty);
    606         REGEX_ASSERT(&m1->pattern() == pat2);
    607 
    608         //
    609         //  reset(pos, status)
    610         //
    611         m1->reset(inStr1);
    612         m1->reset(4, status);
    613         REGEX_CHECK_STATUS;
    614         REGEX_ASSERT(m1->input() == inStr1);
    615         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    616 
    617         m1->reset(-1, status);
    618         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    619         status = U_ZERO_ERROR;
    620 
    621         m1->reset(0, status);
    622         REGEX_CHECK_STATUS;
    623         status = U_ZERO_ERROR;
    624 
    625         int32_t len = m1->input().length();
    626         m1->reset(len-1, status);
    627         REGEX_CHECK_STATUS;
    628         status = U_ZERO_ERROR;
    629 
    630         m1->reset(len, status);
    631         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    632         status = U_ZERO_ERROR;
    633 
    634         //
    635         // match(pos, status)
    636         //
    637         m1->reset(instr2);
    638         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    639         m1->reset();
    640         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    641         m1->reset();
    642         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    643         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    644         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    645         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    646 
    647         // Match() at end of string should fail, but should not
    648         //  be an error.
    649         status = U_ZERO_ERROR;
    650         len = m1->input().length();
    651         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    652         REGEX_CHECK_STATUS;
    653 
    654         // Match beyond end of string should fail with an error.
    655         status = U_ZERO_ERROR;
    656         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    657         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    658 
    659         // Successful match at end of string.
    660         {
    661             status = U_ZERO_ERROR;
    662             RegexMatcher m("A?", 0, status);  // will match zero length string.
    663             REGEX_CHECK_STATUS;
    664             m.reset(inStr1);
    665             len = inStr1.length();
    666             REGEX_ASSERT(m.matches(len, status) == TRUE);
    667             REGEX_CHECK_STATUS;
    668             m.reset(empty);
    669             REGEX_ASSERT(m.matches(0, status) == TRUE);
    670             REGEX_CHECK_STATUS;
    671         }
    672 
    673 
    674         //
    675         // lookingAt(pos, status)
    676         //
    677         status = U_ZERO_ERROR;
    678         m1->reset(instr2);  // "not abc"
    679         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    680         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    681         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    682         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    683         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    684         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    685         status = U_ZERO_ERROR;
    686         len = m1->input().length();
    687         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    688         REGEX_CHECK_STATUS;
    689         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    690         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    691 
    692         delete m1;
    693         delete pat2;
    694     }
    695 
    696 
    697     //
    698     // Capture Group.
    699     //     RegexMatcher::start();
    700     //     RegexMatcher::end();
    701     //     RegexMatcher::groupCount();
    702     //
    703     {
    704         int32_t             flags=0;
    705         UParseError         pe;
    706         UErrorCode          status=U_ZERO_ERROR;
    707 
    708         UnicodeString       re("01(23(45)67)(.*)");
    709         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    710         REGEX_CHECK_STATUS;
    711         UnicodeString data = "0123456789";
    712 
    713         RegexMatcher *matcher = pat->matcher(data, status);
    714         REGEX_CHECK_STATUS;
    715         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
    716         static const int32_t matchStarts[] = {0,  2, 4, 8};
    717         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    718         int32_t i;
    719         for (i=0; i<4; i++) {
    720             int32_t actualStart = matcher->start(i, status);
    721             REGEX_CHECK_STATUS;
    722             if (actualStart != matchStarts[i]) {
    723                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    724                     __LINE__, i, matchStarts[i], actualStart);
    725             }
    726             int32_t actualEnd = matcher->end(i, status);
    727             REGEX_CHECK_STATUS;
    728             if (actualEnd != matchEnds[i]) {
    729                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    730                     __LINE__, i, matchEnds[i], actualEnd);
    731             }
    732         }
    733 
    734         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    735         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    736 
    737         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    738         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    739         matcher->reset();
    740         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    741 
    742         matcher->lookingAt(status);
    743         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    744         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    745         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    746         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    747         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    748         REGEX_CHECK_STATUS;
    749         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    750         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    751         matcher->reset();
    752         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    753 
    754         delete matcher;
    755         delete pat;
    756 
    757     }
    758 
    759     //
    760     //  find
    761     //
    762     {
    763         int32_t             flags=0;
    764         UParseError         pe;
    765         UErrorCode          status=U_ZERO_ERROR;
    766 
    767         UnicodeString       re("abc");
    768         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    769         REGEX_CHECK_STATUS;
    770         UnicodeString data = ".abc..abc...abc..";
    771         //                    012345678901234567
    772 
    773         RegexMatcher *matcher = pat->matcher(data, status);
    774         REGEX_CHECK_STATUS;
    775         REGEX_ASSERT(matcher->find());
    776         REGEX_ASSERT(matcher->start(status) == 1);
    777         REGEX_ASSERT(matcher->find());
    778         REGEX_ASSERT(matcher->start(status) == 6);
    779         REGEX_ASSERT(matcher->find());
    780         REGEX_ASSERT(matcher->start(status) == 12);
    781         REGEX_ASSERT(matcher->find() == FALSE);
    782         REGEX_ASSERT(matcher->find() == FALSE);
    783 
    784         matcher->reset();
    785         REGEX_ASSERT(matcher->find());
    786         REGEX_ASSERT(matcher->start(status) == 1);
    787 
    788         REGEX_ASSERT(matcher->find(0, status));
    789         REGEX_ASSERT(matcher->start(status) == 1);
    790         REGEX_ASSERT(matcher->find(1, status));
    791         REGEX_ASSERT(matcher->start(status) == 1);
    792         REGEX_ASSERT(matcher->find(2, status));
    793         REGEX_ASSERT(matcher->start(status) == 6);
    794         REGEX_ASSERT(matcher->find(12, status));
    795         REGEX_ASSERT(matcher->start(status) == 12);
    796         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    797         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    798         REGEX_ASSERT(matcher->find(17, status) == FALSE);
    799         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
    800 
    801         status = U_ZERO_ERROR;
    802         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    803         status = U_ZERO_ERROR;
    804         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
    805 
    806         REGEX_ASSERT(matcher->groupCount() == 0);
    807 
    808         delete matcher;
    809         delete pat;
    810     }
    811 
    812 
    813     //
    814     //  find, with \G in pattern (true if at the end of a previous match).
    815     //
    816     {
    817         int32_t             flags=0;
    818         UParseError         pe;
    819         UErrorCode          status=U_ZERO_ERROR;
    820 
    821         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
    822         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    823         REGEX_CHECK_STATUS;
    824         UnicodeString data = ".abcabc.abc..";
    825         //                    012345678901234567
    826 
    827         RegexMatcher *matcher = pat->matcher(data, status);
    828         REGEX_CHECK_STATUS;
    829         REGEX_ASSERT(matcher->find());
    830         REGEX_ASSERT(matcher->start(status) == 0);
    831         REGEX_ASSERT(matcher->start(1, status) == -1);
    832         REGEX_ASSERT(matcher->start(2, status) == 1);
    833 
    834         REGEX_ASSERT(matcher->find());
    835         REGEX_ASSERT(matcher->start(status) == 4);
    836         REGEX_ASSERT(matcher->start(1, status) == 4);
    837         REGEX_ASSERT(matcher->start(2, status) == -1);
    838         REGEX_CHECK_STATUS;
    839 
    840         delete matcher;
    841         delete pat;
    842     }
    843 
    844     //
    845     //   find with zero length matches, match position should bump ahead
    846     //     to prevent loops.
    847     //
    848     {
    849         int32_t                 i;
    850         UErrorCode          status=U_ZERO_ERROR;
    851         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
    852                                                       //   using an always-true look-ahead.
    853         REGEX_CHECK_STATUS;
    854         UnicodeString s("    ");
    855         m.reset(s);
    856         for (i=0; ; i++) {
    857             if (m.find() == FALSE) {
    858                 break;
    859             }
    860             REGEX_ASSERT(m.start(status) == i);
    861             REGEX_ASSERT(m.end(status) == i);
    862         }
    863         REGEX_ASSERT(i==5);
    864 
    865         // Check that the bump goes over surrogate pairs OK
    866         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
    867         s = s.unescape();
    868         m.reset(s);
    869         for (i=0; ; i+=2) {
    870             if (m.find() == FALSE) {
    871                 break;
    872             }
    873             REGEX_ASSERT(m.start(status) == i);
    874             REGEX_ASSERT(m.end(status) == i);
    875         }
    876         REGEX_ASSERT(i==10);
    877     }
    878     {
    879         // find() loop breaking test.
    880         //        with pattern of /.?/, should see a series of one char matches, then a single
    881         //        match of zero length at the end of the input string.
    882         int32_t                 i;
    883         UErrorCode          status=U_ZERO_ERROR;
    884         RegexMatcher        m(".?", 0, status);
    885         REGEX_CHECK_STATUS;
    886         UnicodeString s("    ");
    887         m.reset(s);
    888         for (i=0; ; i++) {
    889             if (m.find() == FALSE) {
    890                 break;
    891             }
    892             REGEX_ASSERT(m.start(status) == i);
    893             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
    894         }
    895         REGEX_ASSERT(i==5);
    896     }
    897 
    898 
    899     //
    900     // Matchers with no input string behave as if they had an empty input string.
    901     //
    902 
    903     {
    904         UErrorCode status = U_ZERO_ERROR;
    905         RegexMatcher  m(".?", 0, status);
    906         REGEX_CHECK_STATUS;
    907         REGEX_ASSERT(m.find());
    908         REGEX_ASSERT(m.start(status) == 0);
    909         REGEX_ASSERT(m.input() == "");
    910     }
    911     {
    912         UErrorCode status = U_ZERO_ERROR;
    913         RegexPattern  *p = RegexPattern::compile(".", 0, status);
    914         RegexMatcher  *m = p->matcher(status);
    915         REGEX_CHECK_STATUS;
    916 
    917         REGEX_ASSERT(m->find() == FALSE);
    918         REGEX_ASSERT(m->input() == "");
    919         delete m;
    920         delete p;
    921     }
    922 
    923     //
    924     // Regions
    925     //
    926     {
    927         UErrorCode status = U_ZERO_ERROR;
    928         UnicodeString testString("This is test data");
    929         RegexMatcher m(".*", testString,  0, status);
    930         REGEX_CHECK_STATUS;
    931         REGEX_ASSERT(m.regionStart() == 0);
    932         REGEX_ASSERT(m.regionEnd() == testString.length());
    933         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    934         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    935 
    936         m.region(2,4, status);
    937         REGEX_CHECK_STATUS;
    938         REGEX_ASSERT(m.matches(status));
    939         REGEX_ASSERT(m.start(status)==2);
    940         REGEX_ASSERT(m.end(status)==4);
    941         REGEX_CHECK_STATUS;
    942 
    943         m.reset();
    944         REGEX_ASSERT(m.regionStart() == 0);
    945         REGEX_ASSERT(m.regionEnd() == testString.length());
    946 
    947         UnicodeString shorterString("short");
    948         m.reset(shorterString);
    949         REGEX_ASSERT(m.regionStart() == 0);
    950         REGEX_ASSERT(m.regionEnd() == shorterString.length());
    951 
    952         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    953         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
    954         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
    955         REGEX_ASSERT(&m == &m.reset());
    956         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
    957 
    958         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
    959         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    960         REGEX_ASSERT(&m == &m.reset());
    961         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    962 
    963         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    964         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
    965         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
    966         REGEX_ASSERT(&m == &m.reset());
    967         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
    968 
    969         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
    970         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    971         REGEX_ASSERT(&m == &m.reset());
    972         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    973 
    974     }
    975 
    976     //
    977     // hitEnd() and requireEnd()
    978     //
    979     {
    980         UErrorCode status = U_ZERO_ERROR;
    981         UnicodeString testString("aabb");
    982         RegexMatcher m1(".*", testString,  0, status);
    983         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
    984         REGEX_ASSERT(m1.hitEnd() == TRUE);
    985         REGEX_ASSERT(m1.requireEnd() == FALSE);
    986         REGEX_CHECK_STATUS;
    987 
    988         status = U_ZERO_ERROR;
    989         RegexMatcher m2("a*", testString, 0, status);
    990         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
    991         REGEX_ASSERT(m2.hitEnd() == FALSE);
    992         REGEX_ASSERT(m2.requireEnd() == FALSE);
    993         REGEX_CHECK_STATUS;
    994 
    995         status = U_ZERO_ERROR;
    996         RegexMatcher m3(".*$", testString, 0, status);
    997         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
    998         REGEX_ASSERT(m3.hitEnd() == TRUE);
    999         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1000         REGEX_CHECK_STATUS;
   1001     }
   1002 
   1003 
   1004     //
   1005     // Compilation error on reset with UChar *
   1006     //   These were a hazard that people were stumbling over with runtime errors.
   1007     //   Changed them to compiler errors by adding private methods that more closely
   1008     //   matched the incorrect use of the functions.
   1009     //
   1010 #if 0
   1011     {
   1012         UErrorCode status = U_ZERO_ERROR;
   1013         UChar ucharString[20];
   1014         RegexMatcher m(".", 0, status);
   1015         m.reset(ucharString);  // should not compile.
   1016 
   1017         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1018         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1019 
   1020         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1021     }
   1022 #endif
   1023 
   1024     //
   1025     //  Time Outs.
   1026     //       Note:  These tests will need to be changed when the regexp engine is
   1027     //              able to detect and cut short the exponential time behavior on
   1028     //              this type of match.
   1029     //
   1030     {
   1031         UErrorCode status = U_ZERO_ERROR;
   1032         //    Enough 'a's in the string to cause the match to time out.
   1033         //       (Each additional 'a' doubles the time)
   1034         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1035         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1036         REGEX_CHECK_STATUS;
   1037         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1038         matcher.setTimeLimit(100, status);
   1039         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1040         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1041         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1042     }
   1043     {
   1044         UErrorCode status = U_ZERO_ERROR;
   1045         //   Few enough 'a's to slip in under the time limit.
   1046         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1047         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1048         REGEX_CHECK_STATUS;
   1049         matcher.setTimeLimit(100, status);
   1050         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1051         REGEX_CHECK_STATUS;
   1052     }
   1053 
   1054     //
   1055     //  Stack Limits
   1056     //
   1057     {
   1058         UErrorCode status = U_ZERO_ERROR;
   1059         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1060 
   1061         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1062         //   of the '+', and makes the stack frames larger.
   1063         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1064 
   1065         // With the default stack, this match should fail to run
   1066         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1067         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1068 
   1069         // With unlimited stack, it should run
   1070         status = U_ZERO_ERROR;
   1071         matcher.setStackLimit(0, status);
   1072         REGEX_CHECK_STATUS;
   1073         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1074         REGEX_CHECK_STATUS;
   1075         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1076 
   1077         // With a limited stack, the match should fail
   1078         status = U_ZERO_ERROR;
   1079         matcher.setStackLimit(10000, status);
   1080         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1081         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1082         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1083     }
   1084 
   1085         // A pattern that doesn't save state should work with
   1086         //   a minimal sized stack
   1087     {
   1088         UErrorCode status = U_ZERO_ERROR;
   1089         UnicodeString testString = "abc";
   1090         RegexMatcher matcher("abc", testString, 0, status);
   1091         REGEX_CHECK_STATUS;
   1092         matcher.setStackLimit(30, status);
   1093         REGEX_CHECK_STATUS;
   1094         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1095         REGEX_CHECK_STATUS;
   1096         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1097 
   1098         // Negative stack sizes should fail
   1099         status = U_ZERO_ERROR;
   1100         matcher.setStackLimit(1000, status);
   1101         REGEX_CHECK_STATUS;
   1102         matcher.setStackLimit(-1, status);
   1103         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1104         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1105     }
   1106 
   1107 
   1108 }
   1109 
   1110 
   1111 
   1112 
   1113 
   1114 
   1115 //---------------------------------------------------------------------------
   1116 //
   1117 //      API_Replace        API test for class RegexMatcher, testing the
   1118 //                         Replace family of functions: replaceFirst(),
   1119 //                         replaceAll(), appendReplacement(), appendTail().
   1120 //---------------------------------------------------------------------------
   1121 void RegexTest::API_Replace() {
   1122     //  Sections, in order: plain matches, plain non-matches, empty source,
   1123     //  empty substitution, whole-string match, capture-group references,
   1124     //  hex escapes in the replacement text, and the Bug 4057 regression.
   1125     int32_t             flags=0;
   1126     UParseError         pe;
   1127     UErrorCode          status=U_ZERO_ERROR;
   1128
   1129     UnicodeString       re("abc");
   1130     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1131     REGEX_CHECK_STATUS;
   1132     UnicodeString data = ".abc..abc...abc..";
   1133     //                    012345678901234567
   1134     RegexMatcher *matcher = pat->matcher(data, status);
   1135     REGEX_CHECK_STATUS;   // check matcher creation before it is first dereferenced below
   1136     //
   1137     //  Plain vanilla matches.
   1138     //
   1139     UnicodeString  dest;
   1140     dest = matcher->replaceFirst("yz", status);
   1141     REGEX_CHECK_STATUS;
   1142     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1143
   1144     dest = matcher->replaceAll("yz", status);
   1145     REGEX_CHECK_STATUS;
   1146     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1147
   1148     //
   1149     //  Plain vanilla non-matches.  The input is expected back unchanged.
   1150     //
   1151     UnicodeString d2 = ".abx..abx...abx..";
   1152     matcher->reset(d2);
   1153     dest = matcher->replaceFirst("yz", status);
   1154     REGEX_CHECK_STATUS;
   1155     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1156
   1157     dest = matcher->replaceAll("yz", status);
   1158     REGEX_CHECK_STATUS;
   1159     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1160
   1161     //
   1162     // Empty source string
   1163     //
   1164     UnicodeString d3 = "";
   1165     matcher->reset(d3);
   1166     dest = matcher->replaceFirst("yz", status);
   1167     REGEX_CHECK_STATUS;
   1168     REGEX_ASSERT(dest == "");
   1169
   1170     dest = matcher->replaceAll("yz", status);
   1171     REGEX_CHECK_STATUS;
   1172     REGEX_ASSERT(dest == "");
   1173
   1174     //
   1175     // Empty substitution string.  Each match is simply deleted.
   1176     //
   1177     matcher->reset(data);              // ".abc..abc...abc.."
   1178     dest = matcher->replaceFirst("", status);
   1179     REGEX_CHECK_STATUS;
   1180     REGEX_ASSERT(dest == "...abc...abc..");
   1181
   1182     dest = matcher->replaceAll("", status);
   1183     REGEX_CHECK_STATUS;
   1184     REGEX_ASSERT(dest == "........");
   1185
   1186     //
   1187     // match whole string
   1188     //
   1189     UnicodeString d4 = "abc";
   1190     matcher->reset(d4);
   1191     dest = matcher->replaceFirst("xyz", status);
   1192     REGEX_CHECK_STATUS;
   1193     REGEX_ASSERT(dest == "xyz");
   1194
   1195     dest = matcher->replaceAll("xyz", status);
   1196     REGEX_CHECK_STATUS;
   1197     REGEX_ASSERT(dest == "xyz");
   1198
   1199     //
   1200     // Capture Group, simple case.  $1 in the replacement refers to group 1.
   1201     //
   1202     UnicodeString       re2("a(..)");
   1203     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1204     REGEX_CHECK_STATUS;
   1205     UnicodeString d5 = "abcdefg";
   1206     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1207     REGEX_CHECK_STATUS;
   1208     dest = matcher2->replaceFirst("$1$1", status);
   1209     REGEX_CHECK_STATUS;
   1210     REGEX_ASSERT(dest == "bcbcdefg");
   1211
   1212     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);   // escaped \$1 stays literal
   1213     REGEX_CHECK_STATUS;
   1214     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1215
   1216     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);   // '$' without digits is copied through
   1217     REGEX_CHECK_STATUS;
   1218     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1219
   1220     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1221     replacement = replacement.unescape();   // turn the escape into the actual code point U+1D7CF
   1222     dest = matcher2->replaceFirst(replacement, status);
   1223     REGEX_CHECK_STATUS;
   1224     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");   // i.e. U+1D7CF after '$' is expected to act as group number 1
   1225
   1226     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1227
   1228
   1229     //
   1230     // Replacement String with \u hex escapes
   1231     //    (the substitute strings are passed without unescape(), so the
   1232     //     escape sequences are still in the text given to replaceAll)
   1233     {
   1234         UnicodeString  src = "abc 1 abc 2 abc 3";
   1235         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1236         matcher->reset(src);
   1237         UnicodeString  result = matcher->replaceAll(substitute, status);
   1238         REGEX_CHECK_STATUS;
   1239         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1240     }
   1241     {
   1242         UnicodeString  src = "abc !";
   1243         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1244         matcher->reset(src);
   1245         UnicodeString  result = matcher->replaceAll(substitute, status);
   1246         REGEX_CHECK_STATUS;
   1247         UnicodeString expected = UnicodeString("--");
   1248         expected.append((UChar32)0x10000);
   1249         expected.append("-- !");
   1250         REGEX_ASSERT(result == expected);
   1251     }
   1252     // TODO:  need more thorough testing of capture substitutions.
   1253
   1254     // Bug 4057
   1255     //
   1256     {
   1257         status = U_ZERO_ERROR;
   1258         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1259         RegexMatcher m("ss(.*?)ee", 0, status);
   1260         REGEX_CHECK_STATUS;
   1261         UnicodeString result;
   1262
   1263         // Multiple finds do NOT bump up the previous appendReplacement position.
   1264         m.reset(s);
   1265         m.find();
   1266         m.find();
   1267         m.appendReplacement(result, "ooh", status);
   1268         REGEX_CHECK_STATUS;
   1269         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1270
   1271         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1272         status = U_ZERO_ERROR;
   1273         result.truncate(0);
   1274         m.reset(10, status);
   1275         m.find();
   1276         m.find();
   1277         m.appendReplacement(result, "ooh", status);
   1278         REGEX_CHECK_STATUS;
   1279         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1280
   1281         // find() at interior of string, appendReplacement still starts at beginning.
   1282         status = U_ZERO_ERROR;
   1283         result.truncate(0);
   1284         m.reset();
   1285         m.find(10, status);
   1286         m.find();
   1287         m.appendReplacement(result, "ooh", status);
   1288         REGEX_CHECK_STATUS;
   1289         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1290
   1291         m.appendTail(result);   // appends the input remaining after the last match (" fin")
   1292         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1293
   1294     }
   1295
   1296     delete matcher2;
   1297     delete pat2;
   1298     delete matcher;
   1299     delete pat;
   1300 }
   1300 
   1301 
   1302 //---------------------------------------------------------------------------
   1303 //
   1304 //      API_Pattern       Test that the API for class RegexPattern is
   1305 //                        present and nominally working:  construction,
   1306 //                        equality, assignment, copy, clone, flags(),
   1307 //                        matches(), split(), pattern() and class IDs.
   1308 void RegexTest::API_Pattern() {
   1309     RegexPattern        pata;    // Test default constructor to not crash.
   1310     RegexPattern        patb;
   1311
   1312     REGEX_ASSERT(pata == patb);
   1313     REGEX_ASSERT(pata == pata);
   1314
   1315     UnicodeString re1("abc[a-l][m-z]");
   1316     UnicodeString re2("def");
   1317     UErrorCode    status = U_ZERO_ERROR;
   1318     UParseError   pe;
   1319
   1320     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1321     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1322     REGEX_CHECK_STATUS;
   1323     REGEX_ASSERT(*pat1 == *pat1);
   1324     REGEX_ASSERT(*pat1 != pata);
   1325
   1326     // Assign
   1327     patb = *pat1;
   1328     REGEX_ASSERT(patb == *pat1);
   1329
   1330     // Copy Construct
   1331     RegexPattern patc(*pat1);
   1332     REGEX_ASSERT(patc == *pat1);
   1333     REGEX_ASSERT(patb == patc);
   1334     REGEX_ASSERT(*pat1 != *pat2);   // compare the patterns themselves; comparing the two pointers could never fail
   1335     patb = *pat2;
   1336     REGEX_ASSERT(patb != patc);
   1337     REGEX_ASSERT(patb == *pat2);
   1338
   1339     // Compile with no flags.
   1340     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1341     REGEX_CHECK_STATUS;   // check the compile before pat1a is dereferenced
   1342     REGEX_ASSERT(*pat1a == *pat1);
   1343     REGEX_ASSERT(pat1a->flags() == 0);
   1344
   1345     // Compile with different flags should be not equal
   1346     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1347     REGEX_CHECK_STATUS;
   1348
   1349     REGEX_ASSERT(*pat1b != *pat1a);
   1350     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1351     REGEX_ASSERT(pat1a->flags() == 0);
   1352     delete pat1b;
   1353
   1354     // clone
   1355     RegexPattern *pat1c = pat1->clone();
   1356     REGEX_ASSERT(*pat1c == *pat1);
   1357     REGEX_ASSERT(*pat1c != *pat2);
   1358
   1359     delete pat1c;
   1360     delete pat1a;
   1361     delete pat1;
   1362     delete pat2;
   1363
   1364
   1365     //
   1366     //   Verify that a matcher created from a cloned pattern works.
   1367     //     (Jitterbug 3423)
   1368     //
   1369     {
   1370         UErrorCode     status     = U_ZERO_ERROR;
   1371         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1372         REGEX_CHECK_STATUS;   // check the compile before pSource is dereferenced
   1373         RegexPattern  *pClone     = pSource->clone();
   1374         delete         pSource;   // the clone must keep working after its source is gone
   1375         RegexMatcher  *mFromClone = pClone->matcher(status);
   1376         REGEX_CHECK_STATUS;
   1377         UnicodeString s = "Hello World";
   1378         mFromClone->reset(s);
   1379         REGEX_ASSERT(mFromClone->find() == TRUE);
   1380         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1381         REGEX_ASSERT(mFromClone->find() == TRUE);
   1382         REGEX_ASSERT(mFromClone->group(status) == "World");
   1383         REGEX_ASSERT(mFromClone->find() == FALSE);
   1384         delete mFromClone;
   1385         delete pClone;
   1386     }
   1387     //
   1388     //   matches convenience API
   1389     //
   1390     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1391     REGEX_CHECK_STATUS;
   1392     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1393     REGEX_CHECK_STATUS;
   1394     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1395     REGEX_CHECK_STATUS;
   1396     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1397     REGEX_CHECK_STATUS;
   1398     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1399     REGEX_CHECK_STATUS;
   1400     status = U_INDEX_OUTOFBOUNDS_ERROR;   // with an error already set on entry, FALSE is expected and status is left untouched
   1401     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1402     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1403
   1404
   1405     //
   1406     // Split()
   1407     //
   1408     status = U_ZERO_ERROR;
   1409     pat1 = RegexPattern::compile(" +",  pe, status);
   1410     REGEX_CHECK_STATUS;
   1411     UnicodeString  fields[10];
   1412
   1413     int32_t n;
   1414     n = pat1->split("Now is the time", fields, 10, status);
   1415     REGEX_CHECK_STATUS;
   1416     REGEX_ASSERT(n==4);
   1417     REGEX_ASSERT(fields[0]=="Now");
   1418     REGEX_ASSERT(fields[1]=="is");
   1419     REGEX_ASSERT(fields[2]=="the");
   1420     REGEX_ASSERT(fields[3]=="time");
   1421     REGEX_ASSERT(fields[4]=="");
   1422
   1423     n = pat1->split("Now is the time", fields, 2, status);   // capacity 2: the last slot receives the rest of the input
   1424     REGEX_CHECK_STATUS;
   1425     REGEX_ASSERT(n==2);
   1426     REGEX_ASSERT(fields[0]=="Now");
   1427     REGEX_ASSERT(fields[1]=="is the time");
   1428     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1429
   1430     fields[1] = "*";                  // sentinel: must survive a split limited to one field
   1431     status = U_ZERO_ERROR;
   1432     n = pat1->split("Now is the time", fields, 1, status);
   1433     REGEX_CHECK_STATUS;
   1434     REGEX_ASSERT(n==1);
   1435     REGEX_ASSERT(fields[0]=="Now is the time");
   1436     REGEX_ASSERT(fields[1]=="*");
   1437     status = U_ZERO_ERROR;
   1438
   1439     n = pat1->split("    Now       is the time   ", fields, 10, status);   // leading delimiter gives an empty first field
   1440     REGEX_CHECK_STATUS;
   1441     REGEX_ASSERT(n==5);
   1442     REGEX_ASSERT(fields[0]=="");
   1443     REGEX_ASSERT(fields[1]=="Now");
   1444     REGEX_ASSERT(fields[2]=="is");
   1445     REGEX_ASSERT(fields[3]=="the");
   1446     REGEX_ASSERT(fields[4]=="time");
   1447     REGEX_ASSERT(fields[5]=="");
   1448
   1449     n = pat1->split("     ", fields, 10, status);   // input consisting only of delimiters
   1450     REGEX_CHECK_STATUS;
   1451     REGEX_ASSERT(n==1);
   1452     REGEX_ASSERT(fields[0]=="");
   1453
   1454     fields[0] = "foo";                // sentinel: empty input is expected to write no fields
   1455     n = pat1->split("", fields, 10, status);
   1456     REGEX_CHECK_STATUS;
   1457     REGEX_ASSERT(n==0);
   1458     REGEX_ASSERT(fields[0]=="foo");
   1459
   1460     delete pat1;
   1461
   1462     //  split, with a pattern with (capture)
   1463     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1464     REGEX_CHECK_STATUS;
   1465
   1466     status = U_ZERO_ERROR;
   1467     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);   // captured delimiter text shows up as its own field
   1468     REGEX_CHECK_STATUS;
   1469     REGEX_ASSERT(n==6);
   1470     REGEX_ASSERT(fields[0]=="");
   1471     REGEX_ASSERT(fields[1]=="a");
   1472     REGEX_ASSERT(fields[2]=="Now is ");
   1473     REGEX_ASSERT(fields[3]=="b");
   1474     REGEX_ASSERT(fields[4]=="the time");
   1475     REGEX_ASSERT(fields[5]=="c");
   1476     REGEX_ASSERT(fields[6]=="");
   1477     REGEX_ASSERT(status==U_ZERO_ERROR);
   1478
   1479     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1480     REGEX_CHECK_STATUS;
   1481     REGEX_ASSERT(n==6);
   1482     REGEX_ASSERT(fields[0]=="  ");
   1483     REGEX_ASSERT(fields[1]=="a");
   1484     REGEX_ASSERT(fields[2]=="Now is ");
   1485     REGEX_ASSERT(fields[3]=="b");
   1486     REGEX_ASSERT(fields[4]=="the time");
   1487     REGEX_ASSERT(fields[5]=="c");
   1488     REGEX_ASSERT(fields[6]=="");
   1489
   1490     status = U_ZERO_ERROR;
   1491     fields[6] = "foo";                // sentinel just past the capacity passed below
   1492     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1493     REGEX_CHECK_STATUS;
   1494     REGEX_ASSERT(n==6);
   1495     REGEX_ASSERT(fields[0]=="  ");
   1496     REGEX_ASSERT(fields[1]=="a");
   1497     REGEX_ASSERT(fields[2]=="Now is ");
   1498     REGEX_ASSERT(fields[3]=="b");
   1499     REGEX_ASSERT(fields[4]=="the time");
   1500     REGEX_ASSERT(fields[5]=="c");
   1501     REGEX_ASSERT(fields[6]=="foo");
   1502
   1503     status = U_ZERO_ERROR;
   1504     fields[5] = "foo";                // sentinel just past the capacity passed below
   1505     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);   // last slot keeps the unsplit remainder, "<c>" included
   1506     REGEX_CHECK_STATUS;
   1507     REGEX_ASSERT(n==5);
   1508     REGEX_ASSERT(fields[0]=="  ");
   1509     REGEX_ASSERT(fields[1]=="a");
   1510     REGEX_ASSERT(fields[2]=="Now is ");
   1511     REGEX_ASSERT(fields[3]=="b");
   1512     REGEX_ASSERT(fields[4]=="the time<c>");
   1513     REGEX_ASSERT(fields[5]=="foo");
   1514
   1515     status = U_ZERO_ERROR;
   1516     fields[5] = "foo";
   1517     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1518     REGEX_CHECK_STATUS;
   1519     REGEX_ASSERT(n==5);
   1520     REGEX_ASSERT(fields[0]=="  ");
   1521     REGEX_ASSERT(fields[1]=="a");
   1522     REGEX_ASSERT(fields[2]=="Now is ");
   1523     REGEX_ASSERT(fields[3]=="b");
   1524     REGEX_ASSERT(fields[4]=="the time");
   1525     REGEX_ASSERT(fields[5]=="foo");
   1526
   1527     status = U_ZERO_ERROR;
   1528     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);   // capacity 4: the captured "b" is expected to be dropped
   1529     REGEX_CHECK_STATUS;
   1530     REGEX_ASSERT(n==4);
   1531     REGEX_ASSERT(fields[0]=="  ");
   1532     REGEX_ASSERT(fields[1]=="a");
   1533     REGEX_ASSERT(fields[2]=="Now is ");
   1534     REGEX_ASSERT(fields[3]=="the time<c>");
   1535     status = U_ZERO_ERROR;
   1536     delete pat1;
   1537
   1538     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1539     REGEX_CHECK_STATUS;
   1540     n = pat1->split("1-10,20", fields, 10, status);
   1541     REGEX_CHECK_STATUS;
   1542     REGEX_ASSERT(n==5);
   1543     REGEX_ASSERT(fields[0]=="1");
   1544     REGEX_ASSERT(fields[1]=="-");
   1545     REGEX_ASSERT(fields[2]=="10");
   1546     REGEX_ASSERT(fields[3]==",");
   1547     REGEX_ASSERT(fields[4]=="20");
   1548     delete pat1;
   1549
   1550
   1551     //
   1552     // RegexPattern::pattern()
   1553     //
   1554     pat1 = new RegexPattern();
   1555     REGEX_ASSERT(pat1->pattern() == "");
   1556     delete pat1;
   1557
   1558     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1559     REGEX_CHECK_STATUS;
   1560     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1561     delete pat1;
   1562
   1563
   1564     //
   1565     // classID functions
   1566     //
   1567     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1568     REGEX_CHECK_STATUS;
   1569     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1570     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1571     UnicodeString Hello("Hello, world.");
   1572     RegexMatcher *m = pat1->matcher(Hello, status);
   1573     REGEX_CHECK_STATUS;   // check matcher creation before m is dereferenced
   1574     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1575     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1576     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1577     delete m;
   1578     delete pat1;
   1579 }
   1580 
   1581 //---------------------------------------------------------------------------
   1582 //
   1583 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1584 //                       is present and working, but excluding functions
   1585 //                       implementing replace operations.
   1586 //
   1587 //---------------------------------------------------------------------------
   1588 void RegexTest::API_Match_UTF8() {
            // Overview: exercises RegexPattern / RegexMatcher through their UText-taking
            // entry points, feeding them UText objects opened over UTF-8 data with
            // utext_openUTF8().  Covered, in order: pattern compilation, matcher creation
            // and reset, reset/matches/lookingAt at explicit positions, capture-group
            // start()/end()/group(), find(), find() with \G, zero-length-match bumping,
            // matchers without input, regions and bounds flags, hitEnd()/requireEnd().
            // Each sub-test lives in its own brace scope and closes the UTexts it opened
            // at the end of that scope.
            //
            // REGEX_CHECK_STATUS, REGEX_ASSERT, REGEX_ASSERT_UTEXT and REGEX_ASSERT_FAIL are
            // macros defined outside this chunk.
            // NOTE(review): if REGEX_CHECK_STATUS returns from the function on failure, the
            // UTexts / matchers / patterns created before that point are not released --
            // confirm against the macro definition.
   1589     UParseError         pe;
   1590     UErrorCode          status=U_ZERO_ERROR;
   1591     int32_t             flags = 0;
   1592 
   1593     //
   1594     // Debug - slide failing test cases early
   1595     //
            // Disabled scratch area: when enabled, only the code placed in the braces runs,
            // because the return below skips the rest of the function.
   1596 #if 0
   1597     {
   1598     }
   1599     return;
   1600 #endif
   1601 
   1602     //
   1603     // Simple pattern compilation
   1604     //
   1605     {
                // Pattern "abc", supplied as a UText over UTF-8 bytes.
   1606         UText               re = UTEXT_INITIALIZER;
   1607         utext_openUTF8(&re, "abc", -1, &status);
   1608         RegexPattern        *pat2;
   1609         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1610         REGEX_CHECK_STATUS;
   1611 
                // Three inputs: one that starts with the pattern, one that ends with it,
                // and an empty text (opened over a zero-length UChar buffer).
   1612         UText input1 = UTEXT_INITIALIZER;
   1613         UText input2 = UTEXT_INITIALIZER;
   1614         UText empty  = UTEXT_INITIALIZER;
   1615         utext_openUTF8(&input1, "abcdef this is a test", -1, &status);
   1616         utext_openUTF8(&input2, "not abc", -1, &status);
   1617         utext_openUChars(&empty, NULL, 0, &status);
   1618 
                // Lengths of the two ASCII literals above; used below as end-of-input positions.
                // NOTE(review): strlen() returns size_t, so these are implicit narrowing
                // conversions; the Regions test further down casts to int32_t explicitly.
   1619         int32_t input1Len = strlen("abcdef this is a test");
   1620         int32_t input2Len = strlen("not abc");
   1621 
   1622 
   1623         //
   1624         // Matcher creation and reset.
   1625         //
                // The same matcher is pointed at each input in turn; after every reset the
                // test checks both the lookingAt() result and that inputText() reflects the
                // text that was just supplied.
                // presumably PATTERN_IS_UTEXT only selects the UText overload of matcher() --
                // confirm in unicode/regex.h.
   1626         RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
   1627         REGEX_CHECK_STATUS;
   1628         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1629         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
   1630         m1->reset(&input2);
   1631         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1632         REGEX_ASSERT_UTEXT("not abc", m1->inputText());
   1633         m1->reset(&input1);
   1634         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
   1635         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1636         m1->reset(&empty);
   1637         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1638         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1639 
   1640         //
   1641         //  reset(pos, status)
   1642         //
                // Expected by the assertions below: positions 0 .. input1Len-1 are accepted,
                // while -1 and input1Len are rejected with U_INDEX_OUTOFBOUNDS_ERROR.
                // status is cleared after each probe so one failure does not leak into the next.
   1643         m1->reset(&input1);
   1644         m1->reset(4, status);
   1645         REGEX_CHECK_STATUS;
   1646         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
   1647         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1648 
   1649         m1->reset(-1, status);
   1650         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1651         status = U_ZERO_ERROR;
   1652 
   1653         m1->reset(0, status);
   1654         REGEX_CHECK_STATUS;
   1655         status = U_ZERO_ERROR;
   1656 
   1657         m1->reset(input1Len-1, status);
   1658         REGEX_CHECK_STATUS;
   1659         status = U_ZERO_ERROR;
   1660 
   1661         m1->reset(input1Len, status);
   1662         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1663         status = U_ZERO_ERROR;
   1664 
   1665         //
   1666         // match(pos, status)
   1667         //
                // In "not abc" the text "abc" occupies indexes 4..6, so only a start
                // position of 4 is expected to give a full match.
   1668         m1->reset(&input2);
   1669         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1670         m1->reset();
   1671         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1672         m1->reset();
   1673         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1674         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1675         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1676         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1677 
   1678         // Match() at end of string should fail, but should not
   1679         //  be an error.
   1680         status = U_ZERO_ERROR;
   1681         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1682         REGEX_CHECK_STATUS;
   1683 
   1684         // Match beyond end of string should fail with an error.
   1685         status = U_ZERO_ERROR;
   1686         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1687         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1688 
   1689         // Successful match at end of string.
   1690         {
   1691             status = U_ZERO_ERROR;
   1692             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1693             REGEX_CHECK_STATUS;
   1694             m.reset(&input1);
   1695             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1696             REGEX_CHECK_STATUS;
   1697             m.reset(&empty);
   1698             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1699             REGEX_CHECK_STATUS;
   1700         }
   1701 
   1702 
   1703         //
   1704         // lookingAt(pos, status)
   1705         //
                // Same position probes as for matches() above: 4 succeeds, 3 and 5 fail,
                // -1 and input2Len+1 are index errors, input2Len fails without an error.
   1706         status = U_ZERO_ERROR;
   1707         m1->reset(&input2);  // "not abc"
   1708         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1709         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1710         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1711         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1712         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1713         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1714         status = U_ZERO_ERROR;
   1715         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1716         REGEX_CHECK_STATUS;
   1717         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1718         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1719 
                // Release the matcher and pattern first, then the UTexts they were given.
   1720         delete m1;
   1721         delete pat2;
   1722 
   1723         utext_close(&re);
   1724         utext_close(&input1);
   1725         utext_close(&input2);
   1726         utext_close(&empty);
   1727     }
   1728 
   1729 
   1730     //
   1731     // Capture Group.
   1732     //     RegexMatcher::start();
   1733     //     RegexMatcher::end();
   1734     //     RegexMatcher::groupCount();
   1735     //
   1736     {
                // These three locals shadow the function-level variables of the same names.
   1737         int32_t             flags=0;
   1738         UParseError         pe;
   1739         UErrorCode          status=U_ZERO_ERROR;
   1740         UText               re=UTEXT_INITIALIZER;
   1741         utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status);
   1742 
   1743         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1744         REGEX_CHECK_STATUS;
   1745 
   1746         UText input = UTEXT_INITIALIZER;
   1747         utext_openUTF8(&input, "0123456789", -1, &status);
   1748 
   1749         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   1750         REGEX_CHECK_STATUS;
   1751         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected [start, end) per group, indexed by group number:
                //   0 = whole match, 1 = (23(45)67), 2 = (45), 3 = (.*)
   1752         static const int32_t matchStarts[] = {0,  2, 4, 8};
   1753         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   1754         int32_t i;
   1755         for (i=0; i<4; i++) {
   1756             int32_t actualStart = matcher->start(i, status);
   1757             REGEX_CHECK_STATUS;
   1758             if (actualStart != matchStarts[i]) {
   1759                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
   1760                     __LINE__, i, matchStarts[i], actualStart);
   1761             }
   1762             int32_t actualEnd = matcher->end(i, status);
   1763             REGEX_CHECK_STATUS;
   1764             if (actualEnd != matchEnds[i]) {
   1765                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
   1766                     __LINE__, i, matchEnds[i], actualEnd);
   1767             }
   1768         }
   1769 
                // The no-argument start()/end() must agree with group 0.
   1770         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   1771         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   1772 
                // Group numbers outside 0..3 are index errors; after reset() there is no
                // current match, so even group 0 is expected to report U_REGEX_INVALID_STATE.
                // presumably REGEX_ASSERT_FAIL evaluates its expression against a status of
                // its own, since the outer status is reused below without being cleared --
                // confirm against the macro definition.
   1773         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1774         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1775         matcher->reset();
   1776         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   1777 
                // Re-establish a match so the group() calls below have something to return.
   1778         matcher->lookingAt(status);
   1779 
                // destText is a UText over the UnicodeString dest, used as a caller-supplied
                // destination for group().
   1780         UnicodeString dest;
   1781         UText destText = UTEXT_INITIALIZER;
   1782         utext_openUnicodeString(&destText, &dest, &status);
   1783         UText *result;
   1784 
                // Every group is fetched twice: once with a NULL destination (the returned
                // UText is closed by the test) and once into destText (the returned pointer
                // is asserted to be &destText itself, so it is not closed here).
   1785         result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
   1786         REGEX_CHECK_STATUS;
   1787         REGEX_ASSERT_UTEXT("0123456789", result);
   1788         utext_close(result);
   1789         result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
   1790         REGEX_CHECK_STATUS;
   1791         REGEX_ASSERT(result == &destText);
   1792         REGEX_ASSERT_UTEXT("0123456789", result);
   1793 
   1794         result = matcher->group(0, NULL, status);
   1795         REGEX_CHECK_STATUS;
   1796         REGEX_ASSERT_UTEXT("0123456789", result);
   1797         utext_close(result);
   1798         result = matcher->group(0, &destText, status);
   1799         REGEX_CHECK_STATUS;
   1800         REGEX_ASSERT(result == &destText);
   1801         REGEX_ASSERT_UTEXT("0123456789", result);
   1802 
   1803         result = matcher->group(1, NULL, status);
   1804         REGEX_CHECK_STATUS;
   1805         REGEX_ASSERT_UTEXT("234567", result);
   1806         utext_close(result);
   1807         result = matcher->group(1, &destText, status);
   1808         REGEX_CHECK_STATUS;
   1809         REGEX_ASSERT(result == &destText);
   1810         REGEX_ASSERT_UTEXT("234567", result);
   1811 
   1812         result = matcher->group(2, NULL, status);
   1813         REGEX_CHECK_STATUS;
   1814         REGEX_ASSERT_UTEXT("45", result);
   1815         utext_close(result);
   1816         result = matcher->group(2, &destText, status);
   1817         REGEX_CHECK_STATUS;
   1818         REGEX_ASSERT(result == &destText);
   1819         REGEX_ASSERT_UTEXT("45", result);
   1820 
   1821         result = matcher->group(3, NULL, status);
   1822         REGEX_CHECK_STATUS;
   1823         REGEX_ASSERT_UTEXT("89", result);
   1824         utext_close(result);
   1825         result = matcher->group(3, &destText, status);
   1826         REGEX_CHECK_STATUS;
   1827         REGEX_ASSERT(result == &destText);
   1828         REGEX_ASSERT_UTEXT("89", result);
   1829 
                // Same error expectations as for start(): bad group numbers, then no match.
   1830         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1831         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1832         matcher->reset();
   1833         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   1834 
   1835         delete matcher;
   1836         delete pat;
   1837 
   1838         utext_close(&destText);
   1839         utext_close(&input);
   1840         utext_close(&re);
   1841     }
   1842 
   1843     //
   1844     //  find
   1845     //
   1846     {
   1847         int32_t             flags=0;
   1848         UParseError         pe;
   1849         UErrorCode          status=U_ZERO_ERROR;
   1850         UText               re=UTEXT_INITIALIZER;
   1851         utext_openUTF8(&re, "abc", -1, &status);
   1852 
   1853         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1854         REGEX_CHECK_STATUS;
   1855         UText input = UTEXT_INITIALIZER;
   1856         utext_openUTF8(&input, ".abc..abc...abc..", -1, &status);
   1857         //                      012345678901234567
   1858 
                // The 17-character input contains "abc" at indexes 1, 6 and 12.
                // Successive find() calls must report them in order and then keep failing.
   1859         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   1860         REGEX_CHECK_STATUS;
   1861         REGEX_ASSERT(matcher->find());
   1862         REGEX_ASSERT(matcher->start(status) == 1);
   1863         REGEX_ASSERT(matcher->find());
   1864         REGEX_ASSERT(matcher->start(status) == 6);
   1865         REGEX_ASSERT(matcher->find());
   1866         REGEX_ASSERT(matcher->start(status) == 12);
   1867         REGEX_ASSERT(matcher->find() == FALSE);
   1868         REGEX_ASSERT(matcher->find() == FALSE);
   1869 
   1870         matcher->reset();
   1871         REGEX_ASSERT(matcher->find());
   1872         REGEX_ASSERT(matcher->start(status) == 1);
   1873 
                // find(pos, status): the search starts at pos, so the first occurrence at or
                // after pos is expected.  Starting at 13, 16 or 17 (== input length) finds
                // nothing; after that failed find, start() is expected to report
                // U_REGEX_INVALID_STATE.
   1874         REGEX_ASSERT(matcher->find(0, status));
   1875         REGEX_ASSERT(matcher->start(status) == 1);
   1876         REGEX_ASSERT(matcher->find(1, status));
   1877         REGEX_ASSERT(matcher->start(status) == 1);
   1878         REGEX_ASSERT(matcher->find(2, status));
   1879         REGEX_ASSERT(matcher->start(status) == 6);
   1880         REGEX_ASSERT(matcher->find(12, status));
   1881         REGEX_ASSERT(matcher->start(status) == 12);
   1882         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   1883         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   1884         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   1885         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1886 
                // Start positions before 0 or past the end of the input are index errors.
   1887         status = U_ZERO_ERROR;
   1888         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1889         status = U_ZERO_ERROR;
   1890         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1891 
                // The pattern "abc" has no capture groups.
   1892         REGEX_ASSERT(matcher->groupCount() == 0);
   1893 
   1894         delete matcher;
   1895         delete pat;
   1896 
   1897         utext_close(&input);
   1898         utext_close(&re);
   1899     }
   1900 
   1901 
   1902     //
   1903     //  find, with \G in pattern (true if at the end of a previous match).
   1904     //
   1905     {
   1906         int32_t             flags=0;
   1907         UParseError         pe;
   1908         UErrorCode          status=U_ZERO_ERROR;
   1909         UText               re=UTEXT_INITIALIZER;
   1910         utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status);
   1911 
   1912         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1913 
   1914         REGEX_CHECK_STATUS;
   1915         UText input = UTEXT_INITIALIZER;
   1916         utext_openUTF8(&input, ".abcabc.abc..", -1, &status);
   1917         //                      012345678901234567
   1918 
   1919         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
   1920         REGEX_CHECK_STATUS;
                // First match: expected to go through the plain (abc) alternative, so
                // group 1 is unset (-1) and group 2 starts at 1.
   1921         REGEX_ASSERT(matcher->find());
   1922         REGEX_ASSERT(matcher->start(status) == 0);
   1923         REGEX_ASSERT(matcher->start(1, status) == -1);
   1924         REGEX_ASSERT(matcher->start(2, status) == 1);
   1925 
                // Second match: expected to start at 4 via the \G alternative, so
                // group 1 starts at 4 and group 2 is unset.
   1926         REGEX_ASSERT(matcher->find());
   1927         REGEX_ASSERT(matcher->start(status) == 4);
   1928         REGEX_ASSERT(matcher->start(1, status) == 4);
   1929         REGEX_ASSERT(matcher->start(2, status) == -1);
   1930         REGEX_CHECK_STATUS;
   1931 
   1932         delete matcher;
   1933         delete pat;
   1934 
   1935         utext_close(&input);
   1936         utext_close(&re);
   1937     }
   1938 
   1939     //
   1940     //   find with zero length matches, match position should bump ahead
   1941     //     to prevent loops.
   1942     //
   1943     {
   1944         int32_t                 i;
   1945         UErrorCode          status=U_ZERO_ERROR;
   1946         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will match a zero-length string anywhere,
   1947                                                       //   using an always-true look-ahead.
   1948         REGEX_CHECK_STATUS;
   1949         UText s = UTEXT_INITIALIZER;
   1950         utext_openUTF8(&s, "    ", -1, &status);
   1951         m.reset(&s);
                // Four spaces: expect empty matches at 0,1,2,3,4, then a failed find at i==5.
   1952         for (i=0; ; i++) {
   1953             if (m.find() == FALSE) {
   1954                 break;
   1955             }
   1956             REGEX_ASSERT(m.start(status) == i);
   1957             REGEX_ASSERT(m.end(status) == i);
   1958         }
   1959         REGEX_ASSERT(i==5);
   1960 
   1961         // Check that the bump goes over characters outside the BMP OK
   1962         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
                // NOTE(review): each of the four characters is 4 bytes in this buffer, yet the
                // loop expects match positions to advance by 2 per character (the UTF-16
                // length of a supplementary code point) -- confirm which index unit the
                // matcher reports for UTF-8 UText input.
                // s is opened a second time over the new buffer and closed once at the end.
   1963         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   1964         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   1965         m.reset(&s);
   1966         for (i=0; ; i+=2) {
   1967             if (m.find() == FALSE) {
   1968                 break;
   1969             }
   1970             REGEX_ASSERT(m.start(status) == i);
   1971             REGEX_ASSERT(m.end(status) == i);
   1972         }
   1973         REGEX_ASSERT(i==10);
   1974 
   1975         utext_close(&s);
   1976     }
   1977     {
   1978         // find() loop breaking test.
   1979         //        with pattern of /.?/, should see a series of one char matches, then a single
   1980         //        match of zero length at the end of the input string.
   1981         int32_t                 i;
   1982         UErrorCode          status=U_ZERO_ERROR;
   1983         RegexMatcher        m(".?", 0, status);
   1984         REGEX_CHECK_STATUS;
   1985         UText s = UTEXT_INITIALIZER;
   1986         utext_openUTF8(&s, "    ", -1, &status);
   1987         m.reset(&s);
   1988         for (i=0; ; i++) {
   1989             if (m.find() == FALSE) {
   1990                 break;
   1991             }
   1992             REGEX_ASSERT(m.start(status) == i);
                    // Matches 0..3 consume one character each; the fifth (i==4) is empty.
   1993             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   1994         }
   1995         REGEX_ASSERT(i==5);
   1996 
   1997         utext_close(&s);
   1998     }
   1999 
   2000 
   2001     //
   2002     // Matchers with no input string behave as if they had an empty input string.
   2003     //
   2004 
   2005     {
                // Matcher built from a pattern only: ".?" can match empty, so find() succeeds at 0.
   2006         UErrorCode status = U_ZERO_ERROR;
   2007         RegexMatcher  m(".?", 0, status);
   2008         REGEX_CHECK_STATUS;
   2009         REGEX_ASSERT(m.find());
   2010         REGEX_ASSERT(m.start(status) == 0);
   2011         REGEX_ASSERT(m.input() == "");
   2012     }
   2013     {
                // Matcher obtained from a pattern without input: "." needs one character, so
                // find() is expected to fail and inputText() to have length 0.
                // NOTE(review): p is dereferenced before status is checked; if compile() can
                // return NULL on failure this would crash rather than report -- confirm.
   2014         UErrorCode status = U_ZERO_ERROR;
   2015         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2016         RegexMatcher  *m = p->matcher(status);
   2017         REGEX_CHECK_STATUS;
   2018 
   2019         REGEX_ASSERT(m->find() == FALSE);
   2020         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2021         delete m;
   2022         delete p;
   2023     }
   2024 
   2025     //
   2026     // Regions
   2027     //
   2028     {
   2029         UErrorCode status = U_ZERO_ERROR;
   2030         UText testPattern = UTEXT_INITIALIZER;
   2031         UText testText    = UTEXT_INITIALIZER;
   2032         utext_openUTF8(&testPattern, ".*", -1, &status);
   2033         utext_openUTF8(&testText, "This is test data", -1, &status);
   2034 
                // A freshly constructed matcher is expected to cover the whole input, with
                // anchoring bounds on and transparent bounds off.
   2035         RegexMatcher m(&testPattern, &testText, 0, status);
   2036         REGEX_CHECK_STATUS;
   2037         REGEX_ASSERT(m.regionStart() == 0);
   2038         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2039         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2040         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2041 
                // With the region narrowed to [2,4), matches() with ".*" must span exactly it.
   2042         m.region(2,4, status);
   2043         REGEX_CHECK_STATUS;
   2044         REGEX_ASSERT(m.matches(status));
   2045         REGEX_ASSERT(m.start(status)==2);
   2046         REGEX_ASSERT(m.end(status)==4);
   2047         REGEX_CHECK_STATUS;
   2048 
                // reset() is expected to restore the region to the whole input.
   2049         m.reset();
   2050         REGEX_ASSERT(m.regionStart() == 0);
   2051         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2052 
                // Supplying new input also sets the region to that whole input.
   2053         utext_openUTF8(&testText, "short", -1, &status);
   2054         m.reset(&testText);
   2055         REGEX_ASSERT(m.regionStart() == 0);
   2056         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2057 
                // The bounds setters and reset() must return the matcher itself (checked by
                // address), and each flag value must survive a reset().
   2058         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2059         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2060         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2061         REGEX_ASSERT(&m == &m.reset());
   2062         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2063 
   2064         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2065         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2066         REGEX_ASSERT(&m == &m.reset());
   2067         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2068 
   2069         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2070         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2071         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2072         REGEX_ASSERT(&m == &m.reset());
   2073         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2074 
   2075         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2076         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2077         REGEX_ASSERT(&m == &m.reset());
   2078         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2079 
   2080         utext_close(&testText);
   2081         utext_close(&testPattern);
   2082     }
   2083 
   2084     //
   2085     // hitEnd() and requireEnd()
   2086     //
   2087     {
                // Three patterns are run with lookingAt() against the same input "aabb";
                // testPattern is re-opened over each new pattern string before the next
                // matcher is constructed.  Expected (hitEnd, requireEnd):
                //   ".*"  -> (TRUE,  FALSE)
                //   "a*"  -> (FALSE, FALSE)
                //   ".*$" -> (TRUE,  TRUE)
                // The constructor status is only checked after the assertions of each case.
   2088         UErrorCode status = U_ZERO_ERROR;
   2089         UText testPattern = UTEXT_INITIALIZER;
   2090         UText testText    = UTEXT_INITIALIZER;
   2091         utext_openUTF8(&testPattern, ".*", -1, &status);
   2092         utext_openUTF8(&testText, "aabb", -1, &status);
   2093 
   2094         RegexMatcher m1(&testPattern, &testText,  0, status);
   2095         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2096         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2097         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2098         REGEX_CHECK_STATUS;
   2099 
   2100         status = U_ZERO_ERROR;
   2101         utext_openUTF8(&testPattern, "a*", -1, &status);
   2102         RegexMatcher m2(&testPattern, &testText, 0, status);
   2103         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2104         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2105         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2106         REGEX_CHECK_STATUS;
   2107 
   2108         status = U_ZERO_ERROR;
   2109         utext_openUTF8(&testPattern, ".*$", -1, &status);
   2110         RegexMatcher m3(&testPattern, &testText, 0, status);
   2111         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2112         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2113         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2114         REGEX_CHECK_STATUS;
   2115 
   2116         utext_close(&testText);
   2117         utext_close(&testPattern);
   2118     }
   2119 }
   2120 
   2121 
   2122 //---------------------------------------------------------------------------
   2123 //
   2124 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2125 //                         Replace family of functions.
   2126 //
   2127 //---------------------------------------------------------------------------
   2128 void RegexTest::API_Replace_UTF8() {
   2129     //
   2130     //  Replace
   2131     //
   2132     int32_t             flags=0;
   2133     UParseError         pe;
   2134     UErrorCode          status=U_ZERO_ERROR;
   2135 
   2136     UText               re=UTEXT_INITIALIZER;
   2137     utext_openUTF8(&re, "abc", -1, &status);
   2138     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2139     REGEX_CHECK_STATUS;
   2140 
   2141     char data[] = ".abc..abc...abc..";
   2142     //             012345678901234567
   2143     UText dataText = UTEXT_INITIALIZER;
   2144     utext_openUTF8(&dataText, data, -1, &status);
   2145     RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
   2146 
   2147     //
   2148     //  Plain vanilla matches.
   2149     //
   2150     UnicodeString  dest;
   2151     UText destText = UTEXT_INITIALIZER;
   2152     utext_openUnicodeString(&destText, &dest, &status);
   2153     UText *result;
   2154 
   2155     UText replText = UTEXT_INITIALIZER;
   2156 
   2157     utext_openUTF8(&replText, "yz", -1, &status);
   2158     result = matcher->replaceFirst(&replText, NULL, status);
   2159     REGEX_CHECK_STATUS;
   2160     REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
   2161     utext_close(result);
   2162     result = matcher->replaceFirst(&replText, &destText, status);
   2163     REGEX_CHECK_STATUS;
   2164     REGEX_ASSERT(result == &destText);
   2165     REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
   2166 
   2167     result = matcher->replaceAll(&replText, NULL, status);
   2168     REGEX_CHECK_STATUS;
   2169     REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
   2170     utext_close(result);
   2171 
   2172     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2173     result = matcher->replaceAll(&replText, &destText, status);
   2174     REGEX_CHECK_STATUS;
   2175     REGEX_ASSERT(result == &destText);
   2176     REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
   2177 
   2178     //
   2179     //  Plain vanilla non-matches.
   2180     //
   2181     utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status);
   2182     matcher->reset(&dataText);
   2183 
   2184     result = matcher->replaceFirst(&replText, NULL, status);
   2185     REGEX_CHECK_STATUS;
   2186     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
   2187     utext_close(result);
   2188     result = matcher->replaceFirst(&replText, &destText, status);
   2189     REGEX_CHECK_STATUS;
   2190     REGEX_ASSERT(result == &destText);
   2191     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
   2192 
   2193     result = matcher->replaceAll(&replText, NULL, status);
   2194     REGEX_CHECK_STATUS;
   2195     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
   2196     utext_close(result);
   2197     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2198     result = matcher->replaceAll(&replText, &destText, status);
   2199     REGEX_CHECK_STATUS;
   2200     REGEX_ASSERT(result == &destText);
   2201     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
   2202 
   2203     //
   2204     // Empty source string
   2205     //
   2206     utext_openUTF8(&dataText, NULL, 0, &status);
   2207     matcher->reset(&dataText);
   2208 
   2209     result = matcher->replaceFirst(&replText, NULL, status);
   2210     REGEX_CHECK_STATUS;
   2211     REGEX_ASSERT_UTEXT("", result);
   2212     utext_close(result);
   2213     result = matcher->replaceFirst(&replText, &destText, status);
   2214     REGEX_CHECK_STATUS;
   2215     REGEX_ASSERT(result == &destText);
   2216     REGEX_ASSERT_UTEXT("", result);
   2217 
   2218     result = matcher->replaceAll(&replText, NULL, status);
   2219     REGEX_CHECK_STATUS;
   2220     REGEX_ASSERT_UTEXT("", result);
   2221     utext_close(result);
   2222     result = matcher->replaceAll(&replText, &destText, status);
   2223     REGEX_CHECK_STATUS;
   2224     REGEX_ASSERT(result == &destText);
   2225     REGEX_ASSERT_UTEXT("", result);
   2226 
   2227     //
   2228     // Empty substitution string
   2229     //
   2230     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2231     matcher->reset(&dataText);
   2232 
   2233     utext_openUTF8(&replText, NULL, 0, &status);
   2234     result = matcher->replaceFirst(&replText, NULL, status);
   2235     REGEX_CHECK_STATUS;
   2236     REGEX_ASSERT_UTEXT("...abc...abc..", result);
   2237     utext_close(result);
   2238     result = matcher->replaceFirst(&replText, &destText, status);
   2239     REGEX_CHECK_STATUS;
   2240     REGEX_ASSERT(result == &destText);
   2241     REGEX_ASSERT_UTEXT("...abc...abc..", result);
   2242 
   2243     result = matcher->replaceAll(&replText, NULL, status);
   2244     REGEX_CHECK_STATUS;
   2245     REGEX_ASSERT_UTEXT("........", result);
   2246     utext_close(result);
   2247     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2248     result = matcher->replaceAll(&replText, &destText, status);
   2249     REGEX_CHECK_STATUS;
   2250     REGEX_ASSERT(result == &destText);
   2251     REGEX_ASSERT_UTEXT("........", result);
   2252 
   2253     //
   2254     // match whole string
   2255     //
   2256     utext_openUTF8(&dataText, "abc", -1, &status);
   2257     matcher->reset(&dataText);
   2258 
   2259     utext_openUTF8(&replText, "xyz", -1, &status);
   2260     result = matcher->replaceFirst(&replText, NULL, status);
   2261     REGEX_CHECK_STATUS;
   2262     REGEX_ASSERT_UTEXT("xyz", result);
   2263     utext_close(result);
   2264     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2265     result = matcher->replaceFirst(&replText, &destText, status);
   2266     REGEX_CHECK_STATUS;
   2267     REGEX_ASSERT(result == &destText);
   2268     REGEX_ASSERT_UTEXT("xyz", result);
   2269 
   2270     result = matcher->replaceAll(&replText, NULL, status);
   2271     REGEX_CHECK_STATUS;
   2272     REGEX_ASSERT_UTEXT("xyz", result);
   2273     utext_close(result);
   2274     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2275     result = matcher->replaceAll(&replText, &destText, status);
   2276     REGEX_CHECK_STATUS;
   2277     REGEX_ASSERT(result == &destText);
   2278     REGEX_ASSERT_UTEXT("xyz", result);
   2279 
   2280     //
   2281     // Capture Group, simple case
   2282     //
   2283     utext_openUTF8(&re, "a(..)", -1, &status);
   2284     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2285     REGEX_CHECK_STATUS;
   2286 
   2287     utext_openUTF8(&dataText, "abcdefg", -1, &status);
   2288     RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
   2289     REGEX_CHECK_STATUS;
   2290 
   2291     utext_openUTF8(&replText, "$1$1", -1, &status);
   2292     result = matcher2->replaceFirst(&replText, NULL, status);
   2293     REGEX_CHECK_STATUS;
   2294     REGEX_ASSERT_UTEXT("bcbcdefg", result);
   2295     utext_close(result);
   2296     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2297     result = matcher2->replaceFirst(&replText, &destText, status);
   2298     REGEX_CHECK_STATUS;
   2299     REGEX_ASSERT(result == &destText);
   2300     REGEX_ASSERT_UTEXT("bcbcdefg", result);
   2301 
   2302     utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status);
   2303     result = matcher2->replaceFirst(&replText, NULL, status);
   2304     REGEX_CHECK_STATUS;
   2305     REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
   2306     utext_close(result);
   2307     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2308     result = matcher2->replaceFirst(&replText, &destText, status);
   2309     REGEX_CHECK_STATUS;
   2310     REGEX_ASSERT(result == &destText);
   2311     REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
   2312 
   2313     utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status);
   2314     result = matcher2->replaceFirst(&replText, NULL, status);
   2315     REGEX_CHECK_STATUS;
   2316     REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
   2317     utext_close(result);
   2318     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2319     result = matcher2->replaceFirst(&replText, &destText, status);
   2320     REGEX_CHECK_STATUS;
   2321     REGEX_ASSERT(result == &destText);
   2322     REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
   2323 
   2324     unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2325     //                                 012345678901234567890123456
   2326     supplDigitChars[22] = 0xF0;
   2327     supplDigitChars[23] = 0x9D;
   2328     supplDigitChars[24] = 0x9F;
   2329     supplDigitChars[25] = 0x8F;
   2330     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2331 
   2332     result = matcher2->replaceFirst(&replText, NULL, status);
   2333     REGEX_CHECK_STATUS;
   2334     REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
   2335     utext_close(result);
   2336     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2337     result = matcher2->replaceFirst(&replText, &destText, status);
   2338     REGEX_CHECK_STATUS;
   2339     REGEX_ASSERT(result == &destText);
   2340     REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
   2341 
   2342     utext_openUTF8(&replText, "bad capture group number $5...", -1, &status);
   2343     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2344 //    REGEX_ASSERT_UTEXT("abcdefg", result);
   2345     utext_close(result);
   2346     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2347     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2348     REGEX_ASSERT(result == &destText);
   2349 //    REGEX_ASSERT_UTEXT("abcdefg", result);
   2350 
   2351     //
   2352     // Replacement String with \u hex escapes
   2353     //
   2354     {
   2355         utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status);
   2356         utext_openUTF8(&replText, "--\\u0043--", -1, &status);
   2357         matcher->reset(&dataText);
   2358 
   2359         result = matcher->replaceAll(&replText, NULL, status);
   2360         REGEX_CHECK_STATUS;
   2361         REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
   2362         utext_close(result);
   2363         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2364         result = matcher->replaceAll(&replText, &destText, status);
   2365         REGEX_CHECK_STATUS;
   2366         REGEX_ASSERT(result == &destText);
   2367         REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
   2368     }
   2369     {
   2370         utext_openUTF8(&dataText, "abc !", -1, &status);
   2371         utext_openUTF8(&replText, "--\\U00010000--", -1, &status);
   2372         matcher->reset(&dataText);
   2373 
   2374         unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A"
   2375         //                          0123456789
   2376         expected[2] = 0xF0;
   2377         expected[3] = 0x90;
   2378         expected[4] = 0x80;
   2379         expected[5] = 0x80;
   2380 
   2381         result = matcher->replaceAll(&replText, NULL, status);
   2382         REGEX_CHECK_STATUS;
   2383         REGEX_ASSERT_UTEXT((char *)expected, result);
   2384         utext_close(result);
   2385         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2386         result = matcher->replaceAll(&replText, &destText, status);
   2387         REGEX_CHECK_STATUS;
   2388         REGEX_ASSERT(result == &destText);
   2389         REGEX_ASSERT_UTEXT((char *)expected, result);
   2390     }
   2391     // TODO:  need more thorough testing of capture substitutions.
   2392 
   2393     // Bug 4057
   2394     //
   2395     {
   2396         status = U_ZERO_ERROR;
   2397         utext_openUTF8(&re, "ss(.*?)ee", -1, &status);
   2398         utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status);
   2399         utext_openUTF8(&replText, "ooh", -1, &status);
   2400 
   2401         RegexMatcher m(&re, 0, status);
   2402         REGEX_CHECK_STATUS;
   2403 
   2404         UnicodeString result;
   2405         UText resultText = UTEXT_INITIALIZER;
   2406         utext_openUnicodeString(&resultText, &result, &status);
   2407 
   2408         // Multiple finds do NOT bump up the previous appendReplacement position.
   2409         m.reset(&dataText);
   2410         m.find();
   2411         m.find();
   2412         m.appendReplacement(&resultText, &replText, status);
   2413         REGEX_CHECK_STATUS;
   2414         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
   2415 
   2416         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2417         status = U_ZERO_ERROR;
   2418         result.truncate(0);
   2419         utext_openUnicodeString(&resultText, &result, &status);
   2420         m.reset(10, status);
   2421         m.find();
   2422         m.find();
   2423         m.appendReplacement(&resultText, &replText, status);
   2424         REGEX_CHECK_STATUS;
   2425         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
   2426 
   2427         // find() at interior of string, appendReplacement still starts at beginning.
   2428         status = U_ZERO_ERROR;
   2429         result.truncate(0);
   2430         utext_openUnicodeString(&resultText, &result, &status);
   2431         m.reset();
   2432         m.find(10, status);
   2433         m.find();
   2434         m.appendReplacement(&resultText, &replText, status);
   2435         REGEX_CHECK_STATUS;
   2436         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
   2437 
   2438         m.appendTail(&resultText);
   2439         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText);
   2440 
   2441         utext_close(&resultText);
   2442     }
   2443 
   2444     delete matcher2;
   2445     delete pat2;
   2446     delete matcher;
   2447     delete pat;
   2448 
   2449     utext_close(&dataText);
   2450     utext_close(&replText);
   2451     utext_close(&destText);
   2452     utext_close(&re);
   2453 }
   2454 
   2455 
   2456 //---------------------------------------------------------------------------
   2457 //
   2458 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2459 //                        present and nominally working.
   2460 //
   2461 //---------------------------------------------------------------------------
   2462 void RegexTest::API_Pattern_UTF8() {
   2463     RegexPattern        pata;    // Test default constructor to not crash.
   2464     RegexPattern        patb;
   2465 
   2466     REGEX_ASSERT(pata == patb);
   2467     REGEX_ASSERT(pata == pata);
   2468 
   2469     UText         re1 = UTEXT_INITIALIZER;
   2470     UText         re2 = UTEXT_INITIALIZER;
   2471     UErrorCode    status = U_ZERO_ERROR;
   2472     UParseError   pe;
   2473 
   2474     utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status);
   2475     utext_openUTF8(&re2, "def", -1, &status);
   2476 
   2477     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2478     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2479     REGEX_CHECK_STATUS;
   2480     REGEX_ASSERT(*pat1 == *pat1);
   2481     REGEX_ASSERT(*pat1 != pata);
   2482 
   2483     // Assign
   2484     patb = *pat1;
   2485     REGEX_ASSERT(patb == *pat1);
   2486 
   2487     // Copy Construct
   2488     RegexPattern patc(*pat1);
   2489     REGEX_ASSERT(patc == *pat1);
   2490     REGEX_ASSERT(patb == patc);
   2491     REGEX_ASSERT(pat1 != pat2);
   2492     patb = *pat2;
   2493     REGEX_ASSERT(patb != patc);
   2494     REGEX_ASSERT(patb == *pat2);
   2495 
   2496     // Compile with no flags.
   2497     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2498     REGEX_ASSERT(*pat1a == *pat1);
   2499 
   2500     REGEX_ASSERT(pat1a->flags() == 0);
   2501 
   2502     // Compile with different flags should be not equal
   2503     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2504     REGEX_CHECK_STATUS;
   2505 
   2506     REGEX_ASSERT(*pat1b != *pat1a);
   2507     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2508     REGEX_ASSERT(pat1a->flags() == 0);
   2509     delete pat1b;
   2510 
   2511     // clone
   2512     RegexPattern *pat1c = pat1->clone();
   2513     REGEX_ASSERT(*pat1c == *pat1);
   2514     REGEX_ASSERT(*pat1c != *pat2);
   2515 
   2516     delete pat1c;
   2517     delete pat1a;
   2518     delete pat1;
   2519     delete pat2;
   2520 
   2521     utext_close(&re1);
   2522     utext_close(&re2);
   2523 
   2524 
   2525     //
   2526     //   Verify that a matcher created from a cloned pattern works.
   2527     //     (Jitterbug 3423)
   2528     //
   2529     {
   2530         UErrorCode     status     = U_ZERO_ERROR;
   2531         UText          pattern    = UTEXT_INITIALIZER;
   2532         utext_openUTF8(&pattern, "\\p{L}+", -1, &status);
   2533 
   2534         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2535         RegexPattern  *pClone     = pSource->clone();
   2536         delete         pSource;
   2537         RegexMatcher  *mFromClone = pClone->matcher(status);
   2538         REGEX_CHECK_STATUS;
   2539 
   2540         UText          input      = UTEXT_INITIALIZER;
   2541         utext_openUTF8(&input, "Hello World", -1, &status);
   2542         mFromClone->reset(&input);
   2543         REGEX_ASSERT(mFromClone->find() == TRUE);
   2544         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2545         REGEX_ASSERT(mFromClone->find() == TRUE);
   2546         REGEX_ASSERT(mFromClone->group(status) == "World");
   2547         REGEX_ASSERT(mFromClone->find() == FALSE);
   2548         delete mFromClone;
   2549         delete pClone;
   2550 
   2551         utext_close(&input);
   2552         utext_close(&pattern);
   2553     }
   2554 
   2555     //
   2556     //   matches convenience API
   2557     //
   2558     {
   2559         UErrorCode status  = U_ZERO_ERROR;
   2560         UText      pattern = UTEXT_INITIALIZER;
   2561         UText      input   = UTEXT_INITIALIZER;
   2562 
   2563         utext_openUTF8(&input, "random input", -1, &status);
   2564 
   2565         utext_openUTF8(&pattern, ".*", -1, &status);
   2566         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2567         REGEX_CHECK_STATUS;
   2568 
   2569         utext_openUTF8(&pattern, "abc", -1, &status);
   2570         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2571         REGEX_CHECK_STATUS;
   2572 
   2573         utext_openUTF8(&pattern, ".*nput", -1, &status);
   2574         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2575         REGEX_CHECK_STATUS;
   2576 
   2577         utext_openUTF8(&pattern, "random input", -1, &status);
   2578         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2579         REGEX_CHECK_STATUS;
   2580 
   2581         utext_openUTF8(&pattern, ".*u", -1, &status);
   2582         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2583         REGEX_CHECK_STATUS;
   2584 
   2585         utext_openUTF8(&input, "abc", -1, &status);
   2586         utext_openUTF8(&pattern, "abc", -1, &status);
   2587         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2588         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2589         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2590 
   2591         utext_close(&input);
   2592         utext_close(&pattern);
   2593     }
   2594 
   2595 
   2596     //
   2597     // Split()
   2598     //
   2599     status = U_ZERO_ERROR;
   2600     utext_openUTF8(&re1, " +", -1, &status);
   2601     pat1 = RegexPattern::compile(&re1, pe, status);
   2602     REGEX_CHECK_STATUS;
   2603     UnicodeString  fields[10];
   2604 
   2605     int32_t n;
   2606     n = pat1->split("Now is the time", fields, 10, status);
   2607     REGEX_CHECK_STATUS;
   2608     REGEX_ASSERT(n==4);
   2609     REGEX_ASSERT(fields[0]=="Now");
   2610     REGEX_ASSERT(fields[1]=="is");
   2611     REGEX_ASSERT(fields[2]=="the");
   2612     REGEX_ASSERT(fields[3]=="time");
   2613     REGEX_ASSERT(fields[4]=="");
   2614 
   2615     n = pat1->split("Now is the time", fields, 2, status);
   2616     REGEX_CHECK_STATUS;
   2617     REGEX_ASSERT(n==2);
   2618     REGEX_ASSERT(fields[0]=="Now");
   2619     REGEX_ASSERT(fields[1]=="is the time");
   2620     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2621 
   2622     fields[1] = "*";
   2623     status = U_ZERO_ERROR;
   2624     n = pat1->split("Now is the time", fields, 1, status);
   2625     REGEX_CHECK_STATUS;
   2626     REGEX_ASSERT(n==1);
   2627     REGEX_ASSERT(fields[0]=="Now is the time");
   2628     REGEX_ASSERT(fields[1]=="*");
   2629     status = U_ZERO_ERROR;
   2630 
   2631     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2632     REGEX_CHECK_STATUS;
   2633     REGEX_ASSERT(n==5);
   2634     REGEX_ASSERT(fields[0]=="");
   2635     REGEX_ASSERT(fields[1]=="Now");
   2636     REGEX_ASSERT(fields[2]=="is");
   2637     REGEX_ASSERT(fields[3]=="the");
   2638     REGEX_ASSERT(fields[4]=="time");
   2639     REGEX_ASSERT(fields[5]=="");
   2640 
   2641     n = pat1->split("     ", fields, 10, status);
   2642     REGEX_CHECK_STATUS;
   2643     REGEX_ASSERT(n==1);
   2644     REGEX_ASSERT(fields[0]=="");
   2645 
   2646     fields[0] = "foo";
   2647     n = pat1->split("", fields, 10, status);
   2648     REGEX_CHECK_STATUS;
   2649     REGEX_ASSERT(n==0);
   2650     REGEX_ASSERT(fields[0]=="foo");
   2651 
   2652     delete pat1;
   2653 
   2654     //  split, with a pattern with (capture)
   2655     utext_openUTF8(&re1, "<(\\w*)>", -1, &status);
   2656     pat1 = RegexPattern::compile(&re1,  pe, status);
   2657     REGEX_CHECK_STATUS;
   2658 
   2659     status = U_ZERO_ERROR;
   2660     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2661     REGEX_CHECK_STATUS;
   2662     REGEX_ASSERT(n==6);
   2663     REGEX_ASSERT(fields[0]=="");
   2664     REGEX_ASSERT(fields[1]=="a");
   2665     REGEX_ASSERT(fields[2]=="Now is ");
   2666     REGEX_ASSERT(fields[3]=="b");
   2667     REGEX_ASSERT(fields[4]=="the time");
   2668     REGEX_ASSERT(fields[5]=="c");
   2669     REGEX_ASSERT(fields[6]=="");
   2670     REGEX_ASSERT(status==U_ZERO_ERROR);
   2671 
   2672     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2673     REGEX_CHECK_STATUS;
   2674     REGEX_ASSERT(n==6);
   2675     REGEX_ASSERT(fields[0]=="  ");
   2676     REGEX_ASSERT(fields[1]=="a");
   2677     REGEX_ASSERT(fields[2]=="Now is ");
   2678     REGEX_ASSERT(fields[3]=="b");
   2679     REGEX_ASSERT(fields[4]=="the time");
   2680     REGEX_ASSERT(fields[5]=="c");
   2681     REGEX_ASSERT(fields[6]=="");
   2682 
   2683     status = U_ZERO_ERROR;
   2684     fields[6] = "foo";
   2685     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   2686     REGEX_CHECK_STATUS;
   2687     REGEX_ASSERT(n==6);
   2688     REGEX_ASSERT(fields[0]=="  ");
   2689     REGEX_ASSERT(fields[1]=="a");
   2690     REGEX_ASSERT(fields[2]=="Now is ");
   2691     REGEX_ASSERT(fields[3]=="b");
   2692     REGEX_ASSERT(fields[4]=="the time");
   2693     REGEX_ASSERT(fields[5]=="c");
   2694     REGEX_ASSERT(fields[6]=="foo");
   2695 
   2696     status = U_ZERO_ERROR;
   2697     fields[5] = "foo";
   2698     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   2699     REGEX_CHECK_STATUS;
   2700     REGEX_ASSERT(n==5);
   2701     REGEX_ASSERT(fields[0]=="  ");
   2702     REGEX_ASSERT(fields[1]=="a");
   2703     REGEX_ASSERT(fields[2]=="Now is ");
   2704     REGEX_ASSERT(fields[3]=="b");
   2705     REGEX_ASSERT(fields[4]=="the time<c>");
   2706     REGEX_ASSERT(fields[5]=="foo");
   2707 
   2708     status = U_ZERO_ERROR;
   2709     fields[5] = "foo";
   2710     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   2711     REGEX_CHECK_STATUS;
   2712     REGEX_ASSERT(n==5);
   2713     REGEX_ASSERT(fields[0]=="  ");
   2714     REGEX_ASSERT(fields[1]=="a");
   2715     REGEX_ASSERT(fields[2]=="Now is ");
   2716     REGEX_ASSERT(fields[3]=="b");
   2717     REGEX_ASSERT(fields[4]=="the time");
   2718     REGEX_ASSERT(fields[5]=="foo");
   2719 
   2720     status = U_ZERO_ERROR;
   2721     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   2722     REGEX_CHECK_STATUS;
   2723     REGEX_ASSERT(n==4);
   2724     REGEX_ASSERT(fields[0]=="  ");
   2725     REGEX_ASSERT(fields[1]=="a");
   2726     REGEX_ASSERT(fields[2]=="Now is ");
   2727     REGEX_ASSERT(fields[3]=="the time<c>");
   2728     status = U_ZERO_ERROR;
   2729     delete pat1;
   2730 
   2731     utext_openUTF8(&re1, "([-,])", -1, &status);
   2732     pat1 = RegexPattern::compile(&re1, pe, status);
   2733     REGEX_CHECK_STATUS;
   2734     n = pat1->split("1-10,20", fields, 10, status);
   2735     REGEX_CHECK_STATUS;
   2736     REGEX_ASSERT(n==5);
   2737     REGEX_ASSERT(fields[0]=="1");
   2738     REGEX_ASSERT(fields[1]=="-");
   2739     REGEX_ASSERT(fields[2]=="10");
   2740     REGEX_ASSERT(fields[3]==",");
   2741     REGEX_ASSERT(fields[4]=="20");
   2742     delete pat1;
   2743 
   2744 
   2745     //
   2746     // RegexPattern::pattern() and patternText()
   2747     //
   2748     pat1 = new RegexPattern();
   2749     REGEX_ASSERT(pat1->pattern() == "");
   2750     REGEX_ASSERT_UTEXT("", pat1->patternText());
   2751     delete pat1;
   2752 
   2753     utext_openUTF8(&re1, "(Hello, world)*", -1, &status);
   2754     pat1 = RegexPattern::compile(&re1, pe, status);
   2755     REGEX_CHECK_STATUS;
   2756     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   2757     REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText());
   2758     delete pat1;
   2759 
   2760     utext_close(&re1);
   2761 }
   2762 
   2763 
   2764 //---------------------------------------------------------------------------
   2765 //
   2766 //      Extended       A more thorough check for features of regex patterns
   2767 //                     The test cases are in a separate data file,
   2768 //                       source/test/testdata/regextst.txt
   2769 //                     A description of the test data format is included in that file.
   2770 //
   2771 //---------------------------------------------------------------------------
   2772 
   2773 const char *
   2774 RegexTest::getPath(char buffer[2048], const char *filename) {
   2775     UErrorCode status=U_ZERO_ERROR;
   2776     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2777     if (U_FAILURE(status)) {
   2778         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   2779         return NULL;
   2780     }
   2781 
   2782     strcpy(buffer, testDataDirectory);
   2783     strcat(buffer, filename);
   2784     return buffer;
   2785 }
   2786 
   2787 void RegexTest::Extended() {
   2788     char tdd[2048];
   2789     const char *srcPath;
   2790     UErrorCode  status  = U_ZERO_ERROR;
   2791     int32_t     lineNum = 0;
   2792 
   2793     //
   2794     //  Open and read the test data file.
   2795     //
   2796     srcPath=getPath(tdd, "regextst.txt");
   2797     if(srcPath==NULL) {
   2798         return; /* something went wrong, error already output */
   2799     }
   2800 
   2801     int32_t    len;
   2802     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   2803     if (U_FAILURE(status)) {
   2804         return; /* something went wrong, error already output */
   2805     }
   2806 
   2807     //
   2808     //  Put the test data into a UnicodeString
   2809     //
   2810     UnicodeString testString(FALSE, testData, len);
   2811 
   2812     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   2813     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   2814     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   2815 
   2816     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   2817     UnicodeString   testPattern;   // The pattern for test from the test file.
   2818     UnicodeString   testFlags;     // the flags   for a test.
   2819     UnicodeString   matchString;   // The marked up string to be used as input
   2820 
   2821     if (U_FAILURE(status)){
   2822         dataerrln("Construct RegexMatcher() error.");
   2823         delete [] testData;
   2824         return;
   2825     }
   2826 
   2827     //
   2828     //  Loop over the test data file, once per line.
   2829     //
   2830     while (lineMat.find()) {
   2831         lineNum++;
   2832         if (U_FAILURE(status)) {
   2833             errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   2834         }
   2835 
   2836         status = U_ZERO_ERROR;
   2837         UnicodeString testLine = lineMat.group(1, status);
   2838         if (testLine.length() == 0) {
   2839             continue;
   2840         }
   2841 
   2842         //
   2843         // Parse the test line.  Skip blank and comment only lines.
   2844         // Separate out the three main fields - pattern, flags, target.
   2845         //
   2846 
   2847         commentMat.reset(testLine);
   2848         if (commentMat.lookingAt(status)) {
   2849             // This line is a comment, or blank.
   2850             continue;
   2851         }
   2852 
   2853         //
   2854         //  Pull out the pattern field, remove it from the test file line.
   2855         //
   2856         quotedStuffMat.reset(testLine);
   2857         if (quotedStuffMat.lookingAt(status)) {
   2858             testPattern = quotedStuffMat.group(2, status);
   2859             testLine.remove(0, quotedStuffMat.end(0, status));
   2860         } else {
   2861             errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
   2862             continue;
   2863         }
   2864 
   2865 
   2866         //
   2867         //  Pull out the flags from the test file line.
   2868         //
   2869         flagsMat.reset(testLine);
   2870         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   2871         testFlags = flagsMat.group(1, status);
   2872         if (flagsMat.group(2, status).length() > 0) {
   2873             errln("Bad Match flag at line %d. Scanning %c\n",
   2874                 lineNum, flagsMat.group(2, status).charAt(0));
   2875             continue;
   2876         }
   2877         testLine.remove(0, flagsMat.end(0, status));
   2878 
   2879         //
   2880         //  Pull out the match string, as a whole.
   2881         //    We'll process the <tags> later.
   2882         //
   2883         quotedStuffMat.reset(testLine);
   2884         if (quotedStuffMat.lookingAt(status)) {
   2885             matchString = quotedStuffMat.group(2, status);
   2886             testLine.remove(0, quotedStuffMat.end(0, status));
   2887         } else {
   2888             errln("Bad match string at test file line %d", lineNum);
   2889             continue;
   2890         }
   2891 
   2892         //
   2893         //  The only thing left from the input line should be an optional trailing comment.
   2894         //
   2895         commentMat.reset(testLine);
   2896         if (commentMat.lookingAt(status) == FALSE) {
   2897             errln("Line %d: unexpected characters at end of test line.", lineNum);
   2898             continue;
   2899         }
   2900 
   2901         //
   2902         //  Run the test
   2903         //
   2904         regex_find(testPattern, testFlags, matchString, lineNum);
   2905     }
   2906 
   2907     delete [] testData;
   2908 
   2909 }
   2910 
   2911 
   2912 
   2913 //---------------------------------------------------------------------------
   2914 //
   2915 //    regex_find(pattern, flags, inputString, lineNumber)
   2916 //
   2917 //         Function to run a single test from the Extended (data driven) tests.
   2918 //         See file test/testdata/regextst.txt for a description of the
   2919 //         pattern and inputString fields, and the allowed flags.
   2920 //         lineNumber is the source line in regextst.txt of the test.
   2921 //
   2922 //---------------------------------------------------------------------------
   2923 
   2924 
   2925 //  Set a value into a UVector at position specified by a decimal number in
   2926 //   a UnicodeString.   This is a utility function needed by the actual test function,
   2927 //   which follows.
   2928 static void set(UVector &vec, int32_t val, UnicodeString index) {
   2929     UErrorCode  status=U_ZERO_ERROR;
   2930     int32_t  idx = 0;
   2931     for (int32_t i=0; i<index.length(); i++) {
   2932         int32_t d=u_charDigitValue(index.charAt(i));
   2933         if (d<0) {return;}
   2934         idx = idx*10 + d;
   2935     }
   2936     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   2937     vec.setElementAt(val, idx);
   2938 }
   2939 
   2940 void RegexTest::regex_find(const UnicodeString &pattern,
   2941                            const UnicodeString &flags,
   2942                            const UnicodeString &inputString,
   2943                            int32_t line) {
   2944     UnicodeString       unEscapedInput;
   2945     UnicodeString       deTaggedInput;
   2946 
   2947     int32_t             patternUTF8Length,      inputUTF8Length;
   2948     char                *patternChars  = NULL, *inputChars = NULL;
   2949     UText               patternText    = UTEXT_INITIALIZER;
   2950     UText               inputText      = UTEXT_INITIALIZER;
   2951     UConverter          *UTF8Converter = NULL;
   2952 
   2953     UErrorCode          status         = U_ZERO_ERROR;
   2954     UParseError         pe;
   2955     RegexPattern        *parsePat      = NULL;
   2956     RegexMatcher        *parseMatcher  = NULL;
   2957     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   2958     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   2959     UVector             groupStarts(status);
   2960     UVector             groupEnds(status);
   2961     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   2962     UBool               failed         = FALSE;
   2963     int32_t             numFinds;
   2964     int32_t             i;
   2965     UBool               useMatchesFunc   = FALSE;
   2966     UBool               useLookingAtFunc = FALSE;
   2967     int32_t             regionStart      = -1;
   2968     int32_t             regionEnd        = -1;
   2969 
   2970     //
   2971     //  Compile the caller's pattern
   2972     //
   2973     uint32_t bflags = 0;
   2974     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   2975         bflags |= UREGEX_CASE_INSENSITIVE;
   2976     }
   2977     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   2978         bflags |= UREGEX_COMMENTS;
   2979     }
   2980     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   2981         bflags |= UREGEX_DOTALL;
   2982     }
   2983     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   2984         bflags |= UREGEX_MULTILINE;
   2985     }
   2986 
   2987     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   2988         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   2989     }
   2990     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   2991         bflags |= UREGEX_UNIX_LINES;
   2992     }
   2993 
   2994 
   2995     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   2996     if (status != U_ZERO_ERROR) {
   2997         #if UCONFIG_NO_BREAK_ITERATION==1
   2998         // 'v' test flag means that the test pattern should not compile if ICU was configured
   2999         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3000         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3001             goto cleanupAndReturn;
   3002         }
   3003         #endif
   3004         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3005             // Expected pattern compilation error.
   3006             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3007                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3008             }
   3009             goto cleanupAndReturn;
   3010         } else {
   3011             // Unexpected pattern compilation error.
   3012             errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3013             goto cleanupAndReturn;
   3014         }
   3015     }
   3016 
   3017     UTF8Converter = ucnv_open("UTF8", &status);
   3018     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3019 
   3020     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3021     status = U_ZERO_ERROR; // buffer overflow
   3022     patternChars = new char[patternUTF8Length+1];
   3023     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3024     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3025 
   3026     if (status == U_ZERO_ERROR) {
   3027         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3028 
   3029         if (status != U_ZERO_ERROR) {
   3030 #if UCONFIG_NO_BREAK_ITERATION==1
   3031             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3032             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3033             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3034                 goto cleanupAndReturn;
   3035             }
   3036 #endif
   3037             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3038                 // Expected pattern compilation error.
   3039                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3040                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3041                 }
   3042                 goto cleanupAndReturn;
   3043             } else {
   3044                 // Unexpected pattern compilation error.
   3045                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3046                 goto cleanupAndReturn;
   3047             }
   3048         }
   3049     }
   3050 
   3051     if (UTF8Pattern == NULL) {
   3052         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3053         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line);
   3054         status = U_ZERO_ERROR;
   3055     }
   3056 
   3057     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3058         RegexPatternDump(callerPattern);
   3059     }
   3060 
   3061     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3062         errln("Expected, but did not get, a pattern compilation error.");
   3063         goto cleanupAndReturn;
   3064     }
   3065 
   3066 
   3067     //
   3068     // Number of times find() should be called on the test string, default to 1
   3069     //
   3070     numFinds = 1;
   3071     for (i=2; i<=9; i++) {
   3072         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3073             if (numFinds != 1) {
   3074                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3075                 goto cleanupAndReturn;
   3076             }
   3077             numFinds = i;
   3078         }
   3079     }
   3080 
   3081     // 'M' flag.  Use matches() instead of find()
   3082     if (flags.indexOf((UChar)0x4d) >= 0) {
   3083         useMatchesFunc = TRUE;
   3084     }
   3085     if (flags.indexOf((UChar)0x4c) >= 0) {
   3086         useLookingAtFunc = TRUE;
   3087     }
   3088 
   3089     //
   3090     //  Find the tags in the input data, remove them, and record the group boundary
   3091     //    positions.
   3092     //
   3093     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3094     REGEX_CHECK_STATUS_L(line);
   3095 
   3096     unEscapedInput = inputString.unescape();
   3097     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3098     REGEX_CHECK_STATUS_L(line);
   3099     while(parseMatcher->find()) {
   3100         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3101         REGEX_CHECK_STATUS;
   3102         UnicodeString groupNum = parseMatcher->group(2, status);
   3103         if (groupNum == "r") {
   3104             // <r> or </r>, a region specification within the string
   3105             if (parseMatcher->group(1, status) == "/") {
   3106                 regionEnd = deTaggedInput.length();
   3107             } else {
   3108                 regionStart = deTaggedInput.length();
   3109             }
   3110         } else {
   3111             // <digits> or </digits>, a group match boundary tag.
   3112             if (parseMatcher->group(1, status) == "/") {
   3113                 set(groupEnds, deTaggedInput.length(), groupNum);
   3114             } else {
   3115                 set(groupStarts, deTaggedInput.length(), groupNum);
   3116             }
   3117         }
   3118     }
   3119     parseMatcher->appendTail(deTaggedInput);
   3120     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3121     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3122       errln("mismatched <r> tags");
   3123       failed = TRUE;
   3124       goto cleanupAndReturn;
   3125     }
   3126 
   3127 
   3128     //
   3129     //  Configure the matcher according to the flags specified with this test.
   3130     //
   3131     matcher = callerPattern->matcher(deTaggedInput, status);
   3132     REGEX_CHECK_STATUS_L(line);
   3133     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3134         matcher->setTrace(TRUE);
   3135     }
   3136 
   3137     if (UTF8Pattern != NULL) {
   3138         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3139         status = U_ZERO_ERROR; // buffer overflow
   3140         inputChars = new char[inputUTF8Length+1];
   3141         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3142         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3143 
   3144         if (status == U_ZERO_ERROR) {
   3145             UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
   3146             REGEX_CHECK_STATUS_L(line);
   3147         }
   3148 
   3149         if (UTF8Matcher == NULL) {
   3150             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3151             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line);
   3152             status = U_ZERO_ERROR;
   3153         }
   3154     }
   3155 
   3156     if (regionStart>=0) {
   3157        matcher->region(regionStart, regionEnd, status);
   3158        REGEX_CHECK_STATUS_L(line);
   3159        if (UTF8Matcher != NULL) {
   3160            UTF8Matcher->region(regionStart, regionEnd, status);
   3161            REGEX_CHECK_STATUS_L(line);
   3162        }
   3163     }
   3164     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3165         matcher->useAnchoringBounds(FALSE);
   3166         if (UTF8Matcher != NULL) {
   3167             UTF8Matcher->useAnchoringBounds(FALSE);
   3168         }
   3169     }
   3170     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3171         matcher->useTransparentBounds(TRUE);
   3172         if (UTF8Matcher != NULL) {
   3173             UTF8Matcher->useTransparentBounds(TRUE);
   3174         }
   3175     }
   3176 
   3177 
   3178 
   3179     //
   3180     // Do a find on the de-tagged input using the caller's pattern
   3181     //     TODO: error on count>1 and not find().
   3182     //           error on both matches() and lookingAt().
   3183     //
   3184     for (i=0; i<numFinds; i++) {
   3185         if (useMatchesFunc) {
   3186             isMatch = matcher->matches(status);
   3187             if (UTF8Matcher != NULL) {
   3188                isUTF8Match = UTF8Matcher->matches(status);
   3189             }
   3190         } else  if (useLookingAtFunc) {
   3191             isMatch = matcher->lookingAt(status);
   3192             if (UTF8Matcher != NULL) {
   3193                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3194             }
   3195         } else {
   3196             isMatch = matcher->find();
   3197             if (UTF8Matcher != NULL) {
   3198                 isUTF8Match = UTF8Matcher->find();
   3199             }
   3200         }
   3201     }
   3202     matcher->setTrace(FALSE);
   3203 
   3204     //
   3205     // Match up the groups from the find() with the groups from the tags
   3206     //
   3207 
   3208     // number of tags should match number of groups from find operation.
   3209     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3210     //   G option in test means that capture group data is not available in the
   3211     //     expected results, so the check needs to be suppressed.
   3212     if (isMatch == FALSE && groupStarts.size() != 0) {
   3213         errln("Error at line %d:  Match expected, but none found.", line);
   3214         failed = TRUE;
   3215         goto cleanupAndReturn;
   3216     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3217         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3218         failed = TRUE;
   3219         goto cleanupAndReturn;
   3220     }
   3221 
   3222     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3223         // Only check for match / no match.  Don't check capture groups.
   3224         if (isMatch && groupStarts.size() == 0) {
   3225             errln("Error at line %d:  No match expected, but one found.", line);
   3226             failed = TRUE;
   3227         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3228             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3229             failed = TRUE;
   3230         }
   3231         goto cleanupAndReturn;
   3232     }
   3233 
   3234     REGEX_CHECK_STATUS_L(line);
   3235     for (i=0; i<=matcher->groupCount(); i++) {
   3236         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3237         if (matcher->start(i, status) != expectedStart) {
   3238             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3239                 line, i, expectedStart, matcher->start(i, status));
   3240             failed = TRUE;
   3241             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3242         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) {
   3243             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3244                   line, i, expectedStart, UTF8Matcher->start(i, status));
   3245             failed = TRUE;
   3246             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3247         }
   3248 
   3249         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3250         if (matcher->end(i, status) != expectedEnd) {
   3251             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3252                 line, i, expectedEnd, matcher->end(i, status));
   3253             failed = TRUE;
   3254             // Error on end position;  keep going; real error is probably yet to come as group
   3255             //   end positions work from end of the input data towards the front.
   3256         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) {
   3257             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3258                   line, i, expectedEnd, UTF8Matcher->end(i, status));
   3259             failed = TRUE;
   3260             // Error on end position;  keep going; real error is probably yet to come as group
   3261             //   end positions work from end of the input data towards the front.
   3262         }
   3263     }
   3264     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3265         errln("Error at line %d: Expected %d capture groups, found %d.",
   3266             line, groupStarts.size()-1, matcher->groupCount());
   3267         failed = TRUE;
   3268         }
   3269     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3270         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3271               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3272         failed = TRUE;
   3273     }
   3274 
   3275     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3276         matcher->requireEnd() == TRUE) {
   3277         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3278         failed = TRUE;
   3279     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3280         UTF8Matcher->requireEnd() == TRUE) {
   3281         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3282         failed = TRUE;
   3283     }
   3284 
   3285     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3286         matcher->requireEnd() == FALSE) {
   3287         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3288         failed = TRUE;
   3289     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3290         UTF8Matcher->requireEnd() == FALSE) {
   3291         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3292         failed = TRUE;
   3293     }
   3294 
   3295     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3296         matcher->hitEnd() == TRUE) {
   3297         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3298         failed = TRUE;
   3299     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3300                UTF8Matcher->hitEnd() == TRUE) {
   3301         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3302         failed = TRUE;
   3303     }
   3304 
   3305     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3306         matcher->hitEnd() == FALSE) {
   3307         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3308         failed = TRUE;
   3309     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3310                UTF8Matcher->hitEnd() == FALSE) {
   3311         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3312         failed = TRUE;
   3313     }
   3314 
   3315 
   3316 cleanupAndReturn:
   3317     if (failed) {
   3318         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3319             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3320         // callerPattern->dump();
   3321     }
   3322     delete parseMatcher;
   3323     delete parsePat;
   3324     delete UTF8Matcher;
   3325     delete UTF8Pattern;
   3326     delete matcher;
   3327     delete callerPattern;
   3328 
   3329     utext_close(&inputText);
   3330     delete[] inputChars;
   3331     utext_close(&patternText);
   3332     delete[] patternChars;
   3333     ucnv_close(UTF8Converter);
   3334 }
   3335 
   3336 
   3337 
   3338 
   3339 //---------------------------------------------------------------------------
   3340 //
   3341 //      Errors     Check for error handling in patterns.
   3342 //
   3343 //---------------------------------------------------------------------------
   3344 void RegexTest::Errors() {
            // Feeds a series of malformed patterns through the REGEX_ERR macro.  Each entry gives
            //   a pattern, two integers and a UErrorCode.
            // NOTE(review): the two integers look like the expected line and column of the reported
            //   error (the two-line pattern "abc<newline>def(*2)" below uses 2, 5) -- confirm against
            //   the REGEX_ERR definition, which is outside this section of the file.
   3345     // \escape sequences that aren't implemented yet.
   3346     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3347 
   3348     // Missing close parentheses
   3349     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3350     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3351     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3352 
   3353     // Extra close paren
            //   (the all-open-paren pattern in this group is the opposite case; it is expected
            //    to produce the same U_REGEX_MISMATCHED_PAREN code.)
   3354     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3355     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3356     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3357 
   3358     // Look-ahead, Look-behind
   3359     //  TODO:  add tests for unbounded length look-behinds.
   3360     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3361 
   3362     // Attempt to use non-default flags
            //   This case calls RegexPattern::compile() directly rather than going through REGEX_ERR,
            //   and asserts that compiling with this flag combination reports U_REGEX_UNIMPLEMENTED.
   3363     {
   3364         UParseError   pe;
   3365         UErrorCode    status = U_ZERO_ERROR;
   3366         int32_t       flags  = UREGEX_CANON_EQ |
   3367                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3368                                UREGEX_MULTILINE;
   3369         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3370         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
                // NOTE(review): pat1 is presumably NULL when compile() fails; the delete is
                //   unconditional, so it also covers an unexpectedly successful compile.
   3371         delete pat1;
   3372     }
   3373 
   3374 
   3375     // Quantifiers are allowed only after something that can be quantified.
   3376     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3377     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3378     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3379 
   3380     // Mal-formed {min,max} quantifiers
   3381     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3382     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3383     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3384     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3385     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3386     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3387     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3388     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3389     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3390 
   3391     // Ticket 5389
            //   A pattern that starts with a quantifier.
   3392     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3393 
   3394     // Invalid Back Reference \0
   3395     //    For ICU 3.8 and earlier
   3396     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3397     //
            //    The entry below expects U_REGEX_BAD_ESCAPE_SEQUENCE for a \0 that is not
            //    followed by any further digits.
   3398     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3399 
   3400 }
   3401 
   3402 
   3403 //-------------------------------------------------------------------------------
   3404 //
   3405 //  Read a text data file, convert it to UChars, and return the data
   3406 //    in one big UChar * buffer, which the caller must delete.
   3407 //
   3408 //--------------------------------------------------------------------------------
   3409 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3410                                      const char *defEncoding, UErrorCode &status) {
   3411     UChar       *retPtr  = NULL;
   3412     char        *fileBuf = NULL;
   3413     UConverter* conv     = NULL;
   3414     FILE        *f       = NULL;
   3415 
   3416     ulen = 0;
   3417     if (U_FAILURE(status)) {
   3418         return retPtr;
   3419     }
   3420 
   3421     //
   3422     //  Open the file.
   3423     //
   3424     f = fopen(fileName, "rb");
   3425     if (f == 0) {
   3426         dataerrln("Error opening test data file %s\n", fileName);
   3427         status = U_FILE_ACCESS_ERROR;
   3428         return NULL;
   3429     }
   3430     //
   3431     //  Read it in
   3432     //
   3433     int32_t            fileSize;
   3434     int32_t            amt_read;
   3435 
   3436     fseek( f, 0, SEEK_END);
   3437     fileSize = ftell(f);
   3438     fileBuf = new char[fileSize];
   3439     fseek(f, 0, SEEK_SET);
   3440     amt_read = fread(fileBuf, 1, fileSize, f);
   3441     if (amt_read != fileSize || fileSize <= 0) {
   3442         errln("Error reading test data file.");
   3443         goto cleanUpAndReturn;
   3444     }
   3445 
   3446     //
   3447     // Look for a Unicode Signature (BOM) on the data just read
   3448     //
   3449     int32_t        signatureLength;
   3450     const char *   fileBufC;
   3451     const char*    encoding;
   3452 
   3453     fileBufC = fileBuf;
   3454     encoding = ucnv_detectUnicodeSignature(
   3455         fileBuf, fileSize, &signatureLength, &status);
   3456     if(encoding!=NULL ){
   3457         fileBufC  += signatureLength;
   3458         fileSize  -= signatureLength;
   3459     } else {
   3460         encoding = defEncoding;
   3461         if (strcmp(encoding, "utf-8") == 0) {
   3462             errln("file %s is missing its BOM", fileName);
   3463         }
   3464     }
   3465 
   3466     //
   3467     // Open a converter to take the rule file to UTF-16
   3468     //
   3469     conv = ucnv_open(encoding, &status);
   3470     if (U_FAILURE(status)) {
   3471         goto cleanUpAndReturn;
   3472     }
   3473 
   3474     //
   3475     // Convert the rules to UChar.
   3476     //  Preflight first to determine required buffer size.
   3477     //
   3478     ulen = ucnv_toUChars(conv,
   3479         NULL,           //  dest,
   3480         0,              //  destCapacity,
   3481         fileBufC,
   3482         fileSize,
   3483         &status);
   3484     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3485         // Buffer Overflow is expected from the preflight operation.
   3486         status = U_ZERO_ERROR;
   3487 
   3488         retPtr = new UChar[ulen+1];
   3489         ucnv_toUChars(conv,
   3490             retPtr,       //  dest,
   3491             ulen+1,
   3492             fileBufC,
   3493             fileSize,
   3494             &status);
   3495     }
   3496 
   3497 cleanUpAndReturn:
   3498     fclose(f);
   3499     delete[] fileBuf;
   3500     ucnv_close(conv);
   3501     if (U_FAILURE(status)) {
   3502         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3503         delete retPtr;
   3504         retPtr = 0;
   3505         ulen   = 0;
   3506     };
   3507     return retPtr;
   3508 }
   3509 
   3510 
   3511 //-------------------------------------------------------------------------------
   3512 //
   3513 //   PerlTests  - Run Perl's regular expression tests
   3514 //                The input file for this test is re_tests, the standard regular
   3515 //                expression test data distributed with the Perl source code.
   3516 //
   3517 //                Here is Perl's description of the test data file:
   3518 //
   3519 //        # The tests are in a separate file 't/op/re_tests'.
   3520 //        # Each line in that file is a separate test.
   3521 //        # There are five columns, separated by tabs.
   3522 //        #
   3523 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3524 //        # Modifiers can be put after the closing C<'>.
   3525 //        #
   3526 //        # Column 2 contains the string to be matched.
   3527 //        #
   3528 //        # Column 3 contains the expected result:
   3529 //        #     y   expect a match
   3530 //        #     n   expect no match
   3531 //        #     c   expect an error
   3532 //        # B   test exposes a known bug in Perl, should be skipped
   3533 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3534 //        #
   3535 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3536 //        #
   3537 //        # Column 4 contains a string, usually C<$&>.
   3538 //        #
   3539 //        # Column 5 contains the expected result of double-quote
   3540 //        # interpolating that string after the match, or start of error message.
   3541 //        #
   3542 //        # Column 6, if present, contains a reason why the test is skipped.
   3543 //        # This is printed with "skipped", for harness to pick up.
   3544 //        #
   3545 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3546 //        #
   3547 //        # If you want to add a regular expression test that can't be expressed
   3548 //        # in this format, don't add it here: put it in op/pat.t instead.
   3549 //
   3550 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   3551 //        The test exposes some known incompatibility between ICU and Perl regexps.
   3552 //        (The i is in addition to whatever was there before.)
   3553 //
   3554 //-------------------------------------------------------------------------------
   3555 void RegexTest::PerlTests() {
   3556     char tdd[2048];
   3557     const char *srcPath;
   3558     UErrorCode  status = U_ZERO_ERROR;
   3559     UParseError pe;
   3560 
   3561     //
   3562     //  Open and read the test data file.
   3563     //
   3564     srcPath=getPath(tdd, "re_tests.txt");
   3565     if(srcPath==NULL) {
   3566         return; /* something went wrong, error already output */
   3567     }
   3568 
   3569     int32_t    len;
   3570     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3571     if (U_FAILURE(status)) {
   3572         return; /* something went wrong, error already output */
   3573     }
   3574 
   3575     //
   3576     //  Put the test data into a UnicodeString
   3577     //
   3578     UnicodeString testDataString(FALSE, testData, len);
   3579 
   3580     //
   3581     //  Regex to break the input file into lines, and strip the new lines.
   3582     //     One line per match, capture group one is the desired data.
   3583     //
   3584     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3585     if (U_FAILURE(status)) {
   3586         dataerrln("RegexPattern::compile() error");
   3587         return;
   3588     }
   3589     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3590 
   3591     //
   3592     //  Regex to split a test file line into fields.
   3593     //    There are six fields, separated by tabs.
   3594     //
   3595     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3596 
   3597     //
   3598     //  Regex to identify test patterns with flag settings, and to separate them.
   3599     //    Test patterns with flags look like 'pattern'i
   3600     //    Test patterns without flags are not quoted:   pattern
   3601     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3602     //
   3603     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3604     RegexMatcher* flagMat = flagPat->matcher(status);
   3605 
   3606     //
   3607     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3608     //   in the test data.  Not being perl, this must be done explicitly.  Here
   3609     //   are string constants and REs for these constructs.
   3610     //
   3611     UnicodeString nulnulSrc("${nulnul}");
   3612     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   3613     nulnul = nulnul.unescape();
   3614 
   3615     UnicodeString ffffSrc("${ffff}");
   3616     UnicodeString ffff("\\uffff", -1, US_INV);
   3617     ffff = ffff.unescape();
   3618 
   3619     //  regexp for $-[0], $+[2], etc.
   3620     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   3621     RegexMatcher *groupsMat = groupsPat->matcher(status);
   3622 
   3623     //  regexp for $0, $1, $2, etc.
   3624     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   3625     RegexMatcher *cgMat = cgPat->matcher(status);
   3626 
   3627 
   3628     //
   3629     // Main Loop for the Perl Tests, runs once per line from the
   3630     //   test data file.
   3631     //
   3632     int32_t  lineNum = 0;
   3633     int32_t  skippedUnimplementedCount = 0;
   3634     while (lineMat->find()) {
   3635         lineNum++;
   3636 
   3637         //
   3638         //  Get a line, break it into its fields, do the Perl
   3639         //    variable substitutions.
   3640         //
   3641         UnicodeString line = lineMat->group(1, status);
   3642         UnicodeString fields[7];
   3643         fieldPat->split(line, fields, 7, status);
   3644 
   3645         flagMat->reset(fields[0]);
   3646         flagMat->matches(status);
   3647         UnicodeString pattern  = flagMat->group(2, status);
   3648         pattern.findAndReplace("${bang}", "!");
   3649         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   3650         pattern.findAndReplace(ffffSrc, ffff);
   3651 
   3652         //
   3653         //  Identify patterns that include match flag settings,
   3654         //    split off the flags, remove the extra quotes.
   3655         //
   3656         UnicodeString flagStr = flagMat->group(3, status);
   3657         if (U_FAILURE(status)) {
   3658             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3659             return;
   3660         }
   3661         int32_t flags = 0;
   3662         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   3663         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   3664         const UChar UChar_m = 0x6d;
   3665         const UChar UChar_x = 0x78;
   3666         const UChar UChar_y = 0x79;
   3667         if (flagStr.indexOf(UChar_i) != -1) {
   3668             flags |= UREGEX_CASE_INSENSITIVE;
   3669         }
   3670         if (flagStr.indexOf(UChar_m) != -1) {
   3671             flags |= UREGEX_MULTILINE;
   3672         }
   3673         if (flagStr.indexOf(UChar_x) != -1) {
   3674             flags |= UREGEX_COMMENTS;
   3675         }
   3676 
   3677         //
   3678         // Compile the test pattern.
   3679         //
   3680         status = U_ZERO_ERROR;
   3681         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   3682         if (status == U_REGEX_UNIMPLEMENTED) {
   3683             //
   3684             // Test of a feature that is planned for ICU, but not yet implemented.
   3685             //   skip the test.
   3686             skippedUnimplementedCount++;
   3687             delete testPat;
   3688             status = U_ZERO_ERROR;
   3689             continue;
   3690         }
   3691 
   3692         if (U_FAILURE(status)) {
   3693             // Some tests are supposed to generate errors.
   3694             //   Only report an error for tests that are supposed to succeed.
   3695             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   3696                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   3697             {
   3698                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   3699             }
   3700             status = U_ZERO_ERROR;
   3701             delete testPat;
   3702             continue;
   3703         }
   3704 
   3705         if (fields[2].indexOf(UChar_i) >= 0) {
   3706             // ICU should skip this test.
   3707             delete testPat;
   3708             continue;
   3709         }
   3710 
   3711         if (fields[2].indexOf(UChar_c) >= 0) {
   3712             // This pattern should have caused a compilation error, but didn't/
   3713             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   3714             delete testPat;
   3715             continue;
   3716         }
   3717 
   3718         //
   3719         // replace the Perl variables that appear in some of the
   3720         //   match data strings.
   3721         //
   3722         UnicodeString matchString = fields[1];
   3723         matchString.findAndReplace(nulnulSrc, nulnul);
   3724         matchString.findAndReplace(ffffSrc,   ffff);
   3725 
   3726         // Replace any \n in the match string with an actual new-line char.
   3727         //  Don't do full unescape, as this unescapes more than Perl does, which
   3728         //  causes other spurious failures in the tests.
   3729         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   3730 
   3731 
   3732 
   3733         //
   3734         // Run the test, check for expected match/don't match result.
   3735         //
   3736         RegexMatcher *testMat = testPat->matcher(matchString, status);
   3737         UBool found = testMat->find();
   3738         UBool expected = FALSE;
   3739         if (fields[2].indexOf(UChar_y) >=0) {
   3740             expected = TRUE;
   3741         }
   3742         if (expected != found) {
   3743             errln("line %d: Expected %smatch, got %smatch",
   3744                 lineNum, expected?"":"no ", found?"":"no " );
   3745             continue;
   3746         }
   3747 
   3748         // Don't try to check expected results if there is no match.
   3749         //   (Some have stuff in the expected fields)
   3750         if (!found) {
   3751             delete testMat;
   3752             delete testPat;
   3753             continue;
   3754         }
   3755 
   3756         //
   3757         // Interpret the Perl expression from the fourth field of the data file,
   3758         // building up an ICU string from the results of the ICU match.
   3759         //   The Perl expression will contain references to the results of
   3760         //     a regex match, including the matched string, capture group strings,
   3761         //     group starting and ending indicies, etc.
   3762         //
   3763         UnicodeString resultString;
   3764         UnicodeString perlExpr = fields[3];
   3765 #if SUPPORT_MUTATING_INPUT_STRING
   3766         groupsMat->reset(perlExpr);
   3767         cgMat->reset(perlExpr);
   3768 #endif
   3769 
   3770         while (perlExpr.length() > 0) {
   3771 #if !SUPPORT_MUTATING_INPUT_STRING
   3772             //  Perferred usage.  Reset after any modification to input string.
   3773             groupsMat->reset(perlExpr);
   3774             cgMat->reset(perlExpr);
   3775 #endif
   3776 
   3777             if (perlExpr.startsWith("$&")) {
   3778                 resultString.append(testMat->group(status));
   3779                 perlExpr.remove(0, 2);
   3780             }
   3781 
   3782             else if (groupsMat->lookingAt(status)) {
   3783                 // $-[0]   $+[2]  etc.
   3784                 UnicodeString digitString = groupsMat->group(2, status);
   3785                 int32_t t = 0;
   3786                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   3787                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   3788                 int32_t matchPosition;
   3789                 if (plusOrMinus.compare("+") == 0) {
   3790                     matchPosition = testMat->end(groupNum, status);
   3791                 } else {
   3792                     matchPosition = testMat->start(groupNum, status);
   3793                 }
   3794                 if (matchPosition != -1) {
   3795                     ICU_Utility::appendNumber(resultString, matchPosition);
   3796                 }
   3797                 perlExpr.remove(0, groupsMat->end(status));
   3798             }
   3799 
   3800             else if (cgMat->lookingAt(status)) {
   3801                 // $1, $2, $3, etc.
   3802                 UnicodeString digitString = cgMat->group(1, status);
   3803                 int32_t t = 0;
   3804                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   3805                 if (U_SUCCESS(status)) {
   3806                     resultString.append(testMat->group(groupNum, status));
   3807                     status = U_ZERO_ERROR;
   3808                 }
   3809                 perlExpr.remove(0, cgMat->end(status));
   3810             }
   3811 
   3812             else if (perlExpr.startsWith("@-")) {
   3813                 int32_t i;
   3814                 for (i=0; i<=testMat->groupCount(); i++) {
   3815                     if (i>0) {
   3816                         resultString.append(" ");
   3817                     }
   3818                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   3819                 }
   3820                 perlExpr.remove(0, 2);
   3821             }
   3822 
   3823             else if (perlExpr.startsWith("@+")) {
   3824                 int32_t i;
   3825                 for (i=0; i<=testMat->groupCount(); i++) {
   3826                     if (i>0) {
   3827                         resultString.append(" ");
   3828                     }
   3829                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   3830                 }
   3831                 perlExpr.remove(0, 2);
   3832             }
   3833 
   3834             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   3835                                                      //           or as an escaped sequence (e.g. \n)
   3836                 if (perlExpr.length() > 1) {
   3837                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   3838                 }
   3839                 UChar c = perlExpr.charAt(0);
   3840                 switch (c) {
   3841                 case 'n':   c = '\n'; break;
   3842                 // add any other escape sequences that show up in the test expected results.
   3843                 }
   3844                 resultString.append(c);
   3845                 perlExpr.remove(0, 1);
   3846             }
   3847 
   3848             else  {
   3849                 // Any characters from the perl expression that we don't explicitly
   3850                 //  recognize before here are assumed to be literals and copied
   3851                 //  as-is to the expected results.
   3852                 resultString.append(perlExpr.charAt(0));
   3853                 perlExpr.remove(0, 1);
   3854             }
   3855 
   3856             if (U_FAILURE(status)) {
   3857                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   3858                 break;
   3859             }
   3860         }
   3861 
   3862         //
   3863         // Expected Results Compare
   3864         //
   3865         UnicodeString expectedS(fields[4]);
   3866         expectedS.findAndReplace(nulnulSrc, nulnul);
   3867         expectedS.findAndReplace(ffffSrc,   ffff);
   3868         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   3869 
   3870 
   3871         if (expectedS.compare(resultString) != 0) {
   3872             err("Line %d: Incorrect perl expression results.", lineNum);
   3873             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   3874         }
   3875 
   3876         delete testMat;
   3877         delete testPat;
   3878     }
   3879 
   3880     //
   3881     // All done.  Clean up allocated stuff.
   3882     //
   3883     delete cgMat;
   3884     delete cgPat;
   3885 
   3886     delete groupsMat;
   3887     delete groupsPat;
   3888 
   3889     delete flagMat;
   3890     delete flagPat;
   3891 
   3892     delete lineMat;
   3893     delete linePat;
   3894 
   3895     delete fieldPat;
   3896     delete [] testData;
   3897 
   3898 
   3899     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   3900 
   3901 }
   3902 
   3903 
   3904 //-------------------------------------------------------------------------------
   3905 //
   3906 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   3907 //                  (instead of using UnicodeStrings) to test the alternate engine.
   3908 //                  The input file for this test is re_tests, the standard regular
   3909 //                  expression test data distributed with the Perl source code.
   3910 //                  See PerlTests() for more information.
   3911 //
   3912 //-------------------------------------------------------------------------------
   3913 void RegexTest::PerlTestsUTF8() {
   3914     char tdd[2048];
   3915     const char *srcPath;
   3916     UErrorCode  status = U_ZERO_ERROR;
   3917     UParseError pe;
   3918     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   3919     UText       patternText = UTEXT_INITIALIZER;
   3920     char       *patternChars = NULL;
   3921     int32_t     patternLength;
   3922     int32_t     patternCapacity = 0;
   3923     UText       inputText = UTEXT_INITIALIZER;
   3924     char       *inputChars = NULL;
   3925     int32_t     inputLength;
   3926     int32_t     inputCapacity = 0;
   3927 
   3928     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3929 
   3930     //
   3931     //  Open and read the test data file.
   3932     //
   3933     srcPath=getPath(tdd, "re_tests.txt");
   3934     if(srcPath==NULL) {
   3935         return; /* something went wrong, error already output */
   3936     }
   3937 
   3938     int32_t    len;
   3939     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3940     if (U_FAILURE(status)) {
   3941         return; /* something went wrong, error already output */
   3942     }
   3943 
   3944     //
   3945     //  Put the test data into a UnicodeString
   3946     //
   3947     UnicodeString testDataString(FALSE, testData, len);
   3948 
   3949     //
   3950     //  Regex to break the input file into lines, and strip the new lines.
   3951     //     One line per match, capture group one is the desired data.
   3952     //
   3953     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   3954     if (U_FAILURE(status)) {
   3955         dataerrln("RegexPattern::compile() error");
   3956         return;
   3957     }
   3958     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   3959 
   3960     //
   3961     //  Regex to split a test file line into fields.
   3962     //    There are six fields, separated by tabs.
   3963     //
   3964     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   3965 
   3966     //
   3967     //  Regex to identify test patterns with flag settings, and to separate them.
   3968     //    Test patterns with flags look like 'pattern'i
   3969     //    Test patterns without flags are not quoted:   pattern
   3970     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   3971     //
   3972     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   3973     RegexMatcher* flagMat = flagPat->matcher(status);
   3974 
   3975     //
   3976     // The Perl tests reference several perl-isms, which are evaluated/substituted
   3977     //   in the test data.  Not being perl, this must be done explicitly.  Here
   3978     //   are string constants and REs for these constructs.
   3979     //
   3980     UnicodeString nulnulSrc("${nulnul}");
   3981     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   3982     nulnul = nulnul.unescape();
   3983 
   3984     UnicodeString ffffSrc("${ffff}");
   3985     UnicodeString ffff("\\uffff", -1, US_INV);
   3986     ffff = ffff.unescape();
   3987 
   3988     //  regexp for $-[0], $+[2], etc.
   3989     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   3990     RegexMatcher *groupsMat = groupsPat->matcher(status);
   3991 
   3992     //  regexp for $0, $1, $2, etc.
   3993     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   3994     RegexMatcher *cgMat = cgPat->matcher(status);
   3995 
   3996 
   3997     //
   3998     // Main Loop for the Perl Tests, runs once per line from the
   3999     //   test data file.
   4000     //
   4001     int32_t  lineNum = 0;
   4002     int32_t  skippedUnimplementedCount = 0;
   4003     while (lineMat->find()) {
   4004         lineNum++;
   4005 
   4006         //
   4007         //  Get a line, break it into its fields, do the Perl
   4008         //    variable substitutions.
   4009         //
   4010         UnicodeString line = lineMat->group(1, status);
   4011         UnicodeString fields[7];
   4012         fieldPat->split(line, fields, 7, status);
   4013 
   4014         flagMat->reset(fields[0]);
   4015         flagMat->matches(status);
   4016         UnicodeString pattern  = flagMat->group(2, status);
   4017         pattern.findAndReplace("${bang}", "!");
   4018         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4019         pattern.findAndReplace(ffffSrc, ffff);
   4020 
   4021         //
   4022         //  Identify patterns that include match flag settings,
   4023         //    split off the flags, remove the extra quotes.
   4024         //
   4025         UnicodeString flagStr = flagMat->group(3, status);
   4026         if (U_FAILURE(status)) {
   4027             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4028             return;
   4029         }
   4030         int32_t flags = 0;
   4031         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4032         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4033         const UChar UChar_m = 0x6d;
   4034         const UChar UChar_x = 0x78;
   4035         const UChar UChar_y = 0x79;
   4036         if (flagStr.indexOf(UChar_i) != -1) {
   4037             flags |= UREGEX_CASE_INSENSITIVE;
   4038         }
   4039         if (flagStr.indexOf(UChar_m) != -1) {
   4040             flags |= UREGEX_MULTILINE;
   4041         }
   4042         if (flagStr.indexOf(UChar_x) != -1) {
   4043             flags |= UREGEX_COMMENTS;
   4044         }
   4045 
   4046         //
   4047         // Put the pattern in a UTF-8 UText
   4048         //
   4049         status = U_ZERO_ERROR;
   4050         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4051         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4052             status = U_ZERO_ERROR;
   4053             delete[] patternChars;
   4054             patternCapacity = patternLength + 1;
   4055             patternChars = new char[patternCapacity];
   4056             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4057         }
   4058         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4059 
   4060         //
   4061         // Compile the test pattern.
   4062         //
   4063         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4064         if (status == U_REGEX_UNIMPLEMENTED) {
   4065             //
   4066             // Test of a feature that is planned for ICU, but not yet implemented.
   4067             //   skip the test.
   4068             skippedUnimplementedCount++;
   4069             delete testPat;
   4070             status = U_ZERO_ERROR;
   4071             continue;
   4072         }
   4073 
   4074         if (U_FAILURE(status)) {
   4075             // Some tests are supposed to generate errors.
   4076             //   Only report an error for tests that are supposed to succeed.
   4077             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4078                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4079             {
   4080                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4081             }
   4082             status = U_ZERO_ERROR;
   4083             delete testPat;
   4084             continue;
   4085         }
   4086 
   4087         if (fields[2].indexOf(UChar_i) >= 0) {
   4088             // ICU should skip this test.
   4089             delete testPat;
   4090             continue;
   4091         }
   4092 
   4093         if (fields[2].indexOf(UChar_c) >= 0) {
   4094             // This pattern should have caused a compilation error, but didn't/
   4095             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4096             delete testPat;
   4097             continue;
   4098         }
   4099 
   4100 
   4101         //
   4102         // replace the Perl variables that appear in some of the
   4103         //   match data strings.
   4104         //
   4105         UnicodeString matchString = fields[1];
   4106         matchString.findAndReplace(nulnulSrc, nulnul);
   4107         matchString.findAndReplace(ffffSrc,   ffff);
   4108 
   4109         // Replace any \n in the match string with an actual new-line char.
   4110         //  Don't do full unescape, as this unescapes more than Perl does, which
   4111         //  causes other spurious failures in the tests.
   4112         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4113 
   4114         //
   4115         // Put the input in a UTF-8 UText
   4116         //
   4117         status = U_ZERO_ERROR;
   4118         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4119         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4120             status = U_ZERO_ERROR;
   4121             delete[] inputChars;
   4122             inputCapacity = inputLength + 1;
   4123             inputChars = new char[inputCapacity];
   4124             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4125         }
   4126         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4127 
   4128         //
   4129         // Run the test, check for expected match/don't match result.
   4130         //
   4131         RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
   4132         UBool found = testMat->find();
   4133         UBool expected = FALSE;
   4134         if (fields[2].indexOf(UChar_y) >=0) {
   4135             expected = TRUE;
   4136         }
   4137         if (expected != found) {
   4138             errln("line %d: Expected %smatch, got %smatch",
   4139                 lineNum, expected?"":"no ", found?"":"no " );
   4140             continue;
   4141         }
   4142 
   4143         // Don't try to check expected results if there is no match.
   4144         //   (Some have stuff in the expected fields)
   4145         if (!found) {
   4146             delete testMat;
   4147             delete testPat;
   4148             continue;
   4149         }
   4150 
   4151         //
   4152         // Interpret the Perl expression from the fourth field of the data file,
   4153         // building up an ICU string from the results of the ICU match.
   4154         //   The Perl expression will contain references to the results of
   4155         //     a regex match, including the matched string, capture group strings,
   4156         //     group starting and ending indicies, etc.
   4157         //
   4158         UnicodeString resultString;
   4159         UnicodeString perlExpr = fields[3];
   4160 
   4161         while (perlExpr.length() > 0) {
   4162             groupsMat->reset(perlExpr);
   4163             cgMat->reset(perlExpr);
   4164 
   4165             if (perlExpr.startsWith("$&")) {
   4166                 resultString.append(testMat->group(status));
   4167                 perlExpr.remove(0, 2);
   4168             }
   4169 
   4170             else if (groupsMat->lookingAt(status)) {
   4171                 // $-[0]   $+[2]  etc.
   4172                 UnicodeString digitString = groupsMat->group(2, status);
   4173                 int32_t t = 0;
   4174                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4175                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4176                 int32_t matchPosition;
   4177                 if (plusOrMinus.compare("+") == 0) {
   4178                     matchPosition = testMat->end(groupNum, status);
   4179                 } else {
   4180                     matchPosition = testMat->start(groupNum, status);
   4181                 }
   4182                 if (matchPosition != -1) {
   4183                     ICU_Utility::appendNumber(resultString, matchPosition);
   4184                 }
   4185                 perlExpr.remove(0, groupsMat->end(status));
   4186             }
   4187 
   4188             else if (cgMat->lookingAt(status)) {
   4189                 // $1, $2, $3, etc.
   4190                 UnicodeString digitString = cgMat->group(1, status);
   4191                 int32_t t = 0;
   4192                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4193                 if (U_SUCCESS(status)) {
   4194                     resultString.append(testMat->group(groupNum, status));
   4195                     status = U_ZERO_ERROR;
   4196                 }
   4197                 perlExpr.remove(0, cgMat->end(status));
   4198             }
   4199 
   4200             else if (perlExpr.startsWith("@-")) {
   4201                 int32_t i;
   4202                 for (i=0; i<=testMat->groupCount(); i++) {
   4203                     if (i>0) {
   4204                         resultString.append(" ");
   4205                     }
   4206                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4207                 }
   4208                 perlExpr.remove(0, 2);
   4209             }
   4210 
   4211             else if (perlExpr.startsWith("@+")) {
   4212                 int32_t i;
   4213                 for (i=0; i<=testMat->groupCount(); i++) {
   4214                     if (i>0) {
   4215                         resultString.append(" ");
   4216                     }
   4217                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4218                 }
   4219                 perlExpr.remove(0, 2);
   4220             }
   4221 
   4222             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4223                                                      //           or as an escaped sequence (e.g. \n)
   4224                 if (perlExpr.length() > 1) {
   4225                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4226                 }
   4227                 UChar c = perlExpr.charAt(0);
   4228                 switch (c) {
   4229                 case 'n':   c = '\n'; break;
   4230                 // add any other escape sequences that show up in the test expected results.
   4231                 }
   4232                 resultString.append(c);
   4233                 perlExpr.remove(0, 1);
   4234             }
   4235 
   4236             else  {
   4237                 // Any characters from the perl expression that we don't explicitly
   4238                 //  recognize before here are assumed to be literals and copied
   4239                 //  as-is to the expected results.
   4240                 resultString.append(perlExpr.charAt(0));
   4241                 perlExpr.remove(0, 1);
   4242             }
   4243 
   4244             if (U_FAILURE(status)) {
   4245                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4246                 break;
   4247             }
   4248         }
   4249 
   4250         //
   4251         // Expected Results Compare
   4252         //
   4253         UnicodeString expectedS(fields[4]);
   4254         expectedS.findAndReplace(nulnulSrc, nulnul);
   4255         expectedS.findAndReplace(ffffSrc,   ffff);
   4256         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4257 
   4258 
   4259         if (expectedS.compare(resultString) != 0) {
   4260             err("Line %d: Incorrect perl expression results.", lineNum);
   4261             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4262         }
   4263 
   4264         delete testMat;
   4265         delete testPat;
   4266     }
   4267 
   4268     //
   4269     // All done.  Clean up allocated stuff.
   4270     //
   4271     delete cgMat;
   4272     delete cgPat;
   4273 
   4274     delete groupsMat;
   4275     delete groupsPat;
   4276 
   4277     delete flagMat;
   4278     delete flagPat;
   4279 
   4280     delete lineMat;
   4281     delete linePat;
   4282 
   4283     delete fieldPat;
   4284     delete [] testData;
   4285 
   4286     utext_close(&patternText);
   4287     utext_close(&inputText);
   4288 
   4289     delete [] patternChars;
   4290     delete [] inputChars;
   4291 
   4292 
   4293     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4294 
   4295 }
   4296 
   4297 
   4298 //--------------------------------------------------------------
   4299 //
   4300 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4301 //             Use this pattern,
   4302 //                 "(a?){1,}"
   4303 //             The zero-length match will repeat forever.
   4304 //                (That this goes into a loop is another bug)
   4305 //
   4306 //---------------------------------------------------------------
   4307 void RegexTest::Bug6149() {
   4308     UnicodeString pattern("(a?){1,}");
   4309     UnicodeString s("xyz");
   4310     uint32_t flags = 0;
   4311     UErrorCode status = U_ZERO_ERROR;
   4312 
   4313     RegexMatcher  matcher(pattern, s, flags, status);
   4314     UBool result = false;
   4315     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4316     REGEX_ASSERT(result == FALSE);
   4317  }
   4318 
   4319 
   4320 //
   4321 //   Callbacks()    Test the callback function.
   4322 //                  When set, callbacks occur periodically during matching operations,
   4323 //                  giving the application code the ability to abort the operation
   4324 //                  before it's normal completion.
   4325 //
   4326 
   4327 struct callBackContext {
   4328     RegexTest        *test;
   4329     int32_t          maxCalls;
   4330     int32_t          numCalls;
   4331     int32_t          lastSteps;
   4332     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4333 };
   4334 
   4335 U_CDECL_BEGIN
   4336 static UBool U_CALLCONV
   4337 testCallBackFn(const void *context, int32_t steps) {
   4338     callBackContext  *info = (callBackContext *)context;
   4339     if (info->lastSteps+1 != steps) {
   4340         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4341     }
   4342     info->lastSteps = steps;
   4343     info->numCalls++;
   4344     return (info->numCalls < info->maxCalls);
   4345 }
   4346 U_CDECL_END
   4347 
   4348 void RegexTest::Callbacks() {
   4349    {
   4350         // Getter returns NULLs if no callback has been set
   4351 
   4352         //   The variables that the getter will fill in.
   4353         //   Init to non-null values so that the action of the getter can be seen.
   4354         const void          *returnedContext = &returnedContext;
   4355         URegexMatchCallback *returnedFn = &testCallBackFn;
   4356 
   4357         UErrorCode status = U_ZERO_ERROR;
   4358         RegexMatcher matcher("x", 0, status);
   4359         REGEX_CHECK_STATUS;
   4360         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4361         REGEX_CHECK_STATUS;
   4362         REGEX_ASSERT(returnedFn == NULL);
   4363         REGEX_ASSERT(returnedContext == NULL);
   4364     }
   4365 
   4366    {
   4367         // Set and Get work
   4368         callBackContext cbInfo = {this, 0, 0, 0};
   4369         const void          *returnedContext;
   4370         URegexMatchCallback *returnedFn;
   4371         UErrorCode status = U_ZERO_ERROR;
   4372         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4373         REGEX_CHECK_STATUS;
   4374         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4375         REGEX_CHECK_STATUS;
   4376         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4377         REGEX_CHECK_STATUS;
   4378         REGEX_ASSERT(returnedFn == testCallBackFn);
   4379         REGEX_ASSERT(returnedContext == &cbInfo);
   4380 
   4381         // A short-running match shouldn't invoke the callback
   4382         status = U_ZERO_ERROR;
   4383         cbInfo.reset(1);
   4384         UnicodeString s = "xxx";
   4385         matcher.reset(s);
   4386         REGEX_ASSERT(matcher.matches(status));
   4387         REGEX_CHECK_STATUS;
   4388         REGEX_ASSERT(cbInfo.numCalls == 0);
   4389 
   4390         // A medium-length match that runs long enough to invoke the
   4391         //   callback, but not so long that the callback aborts it.
   4392         status = U_ZERO_ERROR;
   4393         cbInfo.reset(4);
   4394         s = "aaaaaaaaaaaaaaaaaaab";
   4395         matcher.reset(s);
   4396         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4397         REGEX_CHECK_STATUS;
   4398         REGEX_ASSERT(cbInfo.numCalls > 0);
   4399 
   4400         // A longer running match that the callback function will abort.
   4401         status = U_ZERO_ERROR;
   4402         cbInfo.reset(4);
   4403         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4404         matcher.reset(s);
   4405         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4406         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4407         REGEX_ASSERT(cbInfo.numCalls == 4);
   4408     }
   4409 
   4410 
   4411 }
   4412 
   4413 
   4414 //---------------------------------------------------------------------------
   4415 //
   4416 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4417 //                             UTexts. The pure-C implementation of UText
   4418 //                             has no mutable backing stores, but we can
   4419 //                             use UnicodeString here to test the functionality.
   4420 //
   4421 //---------------------------------------------------------------------------
   4422 void RegexTest::PreAllocatedUTextCAPI () {
   4423     UErrorCode           status = U_ZERO_ERROR;
   4424     URegularExpression  *re;
   4425     UText                patternText = UTEXT_INITIALIZER;
   4426     UnicodeString        buffer;
   4427     UText                bufferText = UTEXT_INITIALIZER;
   4428 
   4429     utext_openUnicodeString(&bufferText, &buffer, &status);
   4430 
   4431     /*
   4432      *  getText() and getUText()
   4433      */
   4434     {
   4435         UText  text1 = UTEXT_INITIALIZER;
   4436         UText  text2 = UTEXT_INITIALIZER;
   4437         UChar  text2Chars[20];
   4438         UText  *resultText;
   4439 
   4440         status = U_ZERO_ERROR;
   4441         utext_openUTF8(&text1, "abcccd", -1, &status);
   4442         utext_openUTF8(&text2, "abcccxd", -1, &status);
   4443         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4444         utext_openUChars(&text2, text2Chars, -1, &status);
   4445 
   4446         utext_openUTF8(&patternText, "abc*d", -1, &status);
   4447         re = uregex_openUText(&patternText, 0, NULL, &status);
   4448 
   4449         /* First set a UText */
   4450         uregex_setUText(re, &text1, &status);
   4451         resultText = uregex_getUText(re, &bufferText, &status);
   4452         REGEX_CHECK_STATUS;
   4453         REGEX_ASSERT(resultText == &bufferText);
   4454         utext_setNativeIndex(resultText, 0);
   4455         utext_setNativeIndex(&text1, 0);
   4456         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
   4457 
   4458         resultText = uregex_getUText(re, &bufferText, &status);
   4459         REGEX_CHECK_STATUS;
   4460         REGEX_ASSERT(resultText == &bufferText);
   4461         utext_setNativeIndex(resultText, 0);
   4462         utext_setNativeIndex(&text1, 0);
   4463         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
   4464 
   4465         /* Then set a UChar * */
   4466         uregex_setText(re, text2Chars, 7, &status);
   4467         resultText = uregex_getUText(re, &bufferText, &status);
   4468         REGEX_CHECK_STATUS;
   4469         REGEX_ASSERT(resultText == &bufferText);
   4470         utext_setNativeIndex(resultText, 0);
   4471         utext_setNativeIndex(&text2, 0);
   4472         REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
   4473 
   4474         uregex_close(re);
   4475         utext_close(&text1);
   4476         utext_close(&text2);
   4477     }
   4478 
   4479     /*
   4480      *  group()
   4481      */
   4482     {
   4483         UChar    text1[80];
   4484         UText   *actual;
   4485         UBool    result;
   4486         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
   4487 
   4488         status = U_ZERO_ERROR;
   4489         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   4490         REGEX_CHECK_STATUS;
   4491 
   4492         uregex_setText(re, text1, -1, &status);
   4493         result = uregex_find(re, 0, &status);
   4494         REGEX_ASSERT(result==TRUE);
   4495 
   4496         /*  Capture Group 0, the full match.  Should succeed.  */
   4497         status = U_ZERO_ERROR;
   4498         actual = uregex_groupUText(re, 0, &bufferText, &status);
   4499         REGEX_CHECK_STATUS;
   4500         REGEX_ASSERT(actual == &bufferText);
   4501         REGEX_ASSERT_UTEXT("abc interior def", actual);
   4502 
   4503         /*  Capture group #1.  Should succeed. */
   4504         status = U_ZERO_ERROR;
   4505         actual = uregex_groupUText(re, 1, &bufferText, &status);
   4506         REGEX_CHECK_STATUS;
   4507         REGEX_ASSERT(actual == &bufferText);
   4508         REGEX_ASSERT_UTEXT(" interior ", actual);
   4509 
   4510         /*  Capture group out of range.  Error. */
   4511         status = U_ZERO_ERROR;
   4512         actual = uregex_groupUText(re, 2, &bufferText, &status);
   4513         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   4514         REGEX_ASSERT(actual == &bufferText);
   4515 
   4516         uregex_close(re);
   4517 
   4518     }
   4519 
   4520     /*
   4521      *  replaceFirst()
   4522      */
   4523     {
   4524         UChar    text1[80];
   4525         UChar    text2[80];
   4526         UText    replText = UTEXT_INITIALIZER;
   4527         UText   *result;
   4528 
   4529         status = U_ZERO_ERROR;
   4530         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   4531         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   4532         utext_openUTF8(&replText, "<$1>", -1, &status);
   4533 
   4534         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   4535         REGEX_CHECK_STATUS;
   4536 
   4537         /*  Normal case, with match */
   4538         uregex_setText(re, text1, -1, &status);
   4539         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4540         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4541         REGEX_CHECK_STATUS;
   4542         REGEX_ASSERT(result == &bufferText);
   4543         REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
   4544 
   4545         /* No match.  Text should copy to output with no changes.  */
   4546         uregex_setText(re, text2, -1, &status);
   4547         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4548         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4549         REGEX_CHECK_STATUS;
   4550         REGEX_ASSERT(result == &bufferText);
   4551         REGEX_ASSERT_UTEXT("No match here.", result);
   4552 
   4553         /* Unicode escapes */
   4554         uregex_setText(re, text1, -1, &status);
   4555         utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
   4556         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4557         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   4558         REGEX_CHECK_STATUS;
   4559         REGEX_ASSERT(result == &bufferText);
   4560         REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
   4561 
   4562         uregex_close(re);
   4563         utext_close(&replText);
   4564     }
   4565 
   4566 
   4567     /*
   4568      *  replaceAll()
   4569      */
   4570     {
   4571         UChar    text1[80];
   4572         UChar    text2[80];
   4573         UText    replText = UTEXT_INITIALIZER;
   4574         UText   *result;
   4575 
   4576         status = U_ZERO_ERROR;
   4577         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   4578         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   4579         utext_openUTF8(&replText, "<$1>", -1, &status);
   4580 
   4581         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   4582         REGEX_CHECK_STATUS;
   4583 
   4584         /*  Normal case, with match */
   4585         uregex_setText(re, text1, -1, &status);
   4586         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4587         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   4588         REGEX_CHECK_STATUS;
   4589         REGEX_ASSERT(result == &bufferText);
   4590         REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
   4591 
   4592         /* No match.  Text should copy to output with no changes.  */
   4593         uregex_setText(re, text2, -1, &status);
   4594         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   4595         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   4596         REGEX_CHECK_STATUS;
   4597         REGEX_ASSERT(result == &bufferText);
   4598         REGEX_ASSERT_UTEXT("No match here.", result);
   4599 
   4600         uregex_close(re);
   4601         utext_close(&replText);
   4602     }
   4603 
   4604 
   4605     /*
   4606      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   4607      *   so we don't need to test it here.
   4608      */
   4609 
   4610     utext_close(&bufferText);
   4611     utext_close(&patternText);
   4612 }
   4613 
   4614 //--------------------------------------------------------------
   4615 //
   4616 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   4617 //
   4618 //---------------------------------------------------------------
   4619 void RegexTest::Bug7651() {
   4620     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   4621     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   4622     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   4623     UnicodeString s("#ff @abcd This is test");
   4624     RegexPattern  *REPattern = NULL;
   4625     RegexMatcher  *REMatcher = NULL;
   4626     UErrorCode status = U_ZERO_ERROR;
   4627     UParseError pe;
   4628 
   4629     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   4630     REGEX_CHECK_STATUS;
   4631     REMatcher = REPattern->matcher(s, status);
   4632     REGEX_CHECK_STATUS;
   4633     REGEX_ASSERT(REMatcher->find());
   4634     REGEX_ASSERT(REMatcher->start(status) == 0);
   4635     delete REPattern;
   4636     delete REMatcher;
   4637     status = U_ZERO_ERROR;
   4638 
   4639     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   4640     REGEX_CHECK_STATUS;
   4641     REMatcher = REPattern->matcher(s, status);
   4642     REGEX_CHECK_STATUS;
   4643     REGEX_ASSERT(REMatcher->find());
   4644     REGEX_ASSERT(REMatcher->start(status) == 0);
   4645     delete REPattern;
   4646     delete REMatcher;
   4647     status = U_ZERO_ERROR;
   4648  }
   4649 
   4650 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   4651 
   4652