Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2009, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 #include "intltest.h"
     14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     15 
     16 #include "unicode/regex.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/ucnv.h"
     19 #include "regextst.h"
     20 #include "uvector.h"
     21 #include "util.h"
     22 #include <stdlib.h>
     23 #include <string.h>
     24 #include <stdio.h>
     25 
     26 
     27 //---------------------------------------------------------------------------
     28 //
     29 //  Test class boilerplate
     30 //
     31 //---------------------------------------------------------------------------
     32 RegexTest::RegexTest()
     33 {
     34 }
     35 
     36 
     37 RegexTest::~RegexTest()
     38 {
     39 }
     40 
     41 
     42 
     43 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     44 {
     45     if (exec) logln("TestSuite RegexTest: ");
     46     switch (index) {
     47 
     48         case 0: name = "Basic";
     49             if (exec) Basic();
     50             break;
     51         case 1: name = "API_Match";
     52             if (exec) API_Match();
     53             break;
     54         case 2: name = "API_Replace";
     55             if (exec) API_Replace();
     56             break;
     57         case 3: name = "API_Pattern";
     58             if (exec) API_Pattern();
     59             break;
     60         case 4: name = "Extended";
     61             if (exec) Extended();
     62             break;
     63         case 5: name = "Errors";
     64             if (exec) Errors();
     65             break;
     66         case 6: name = "PerlTests";
     67             if (exec) PerlTests();
     68             break;
     69         case 7: name = "Callbacks";
     70             if (exec) Callbacks();
     71             break;
     72         case 8: name = "Bug 6149";
     73              if (exec) Bug6149();
     74              break;
     75 
     76         default: name = "";
     77             break; //needed to end loop
     78     }
     79 }
     80 
     81 
     82 //---------------------------------------------------------------------------
     83 //
     84 //   Error Checking / Reporting macros used in all of the tests.
     85 //
        //   Each macro expands to a brace-enclosed block, so it can be used wherever
        //   a statement is allowed.  The failure messages include __LINE__, i.e. the
        //   source line of the macro invocation.
        //
     86 //---------------------------------------------------------------------------
        // REGEX_CHECK_STATUS
        //   Requires a UErrorCode variable named 'status' in the enclosing scope.
        //   If it holds a failure code, reports it with dataerrln() and executes a
        //   bare 'return;' - so the enclosing function is left immediately and the
        //   macro is only usable in functions returning void.
     87 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("RegexTest failure at line %d.  status=%s", \
     88 __LINE__, u_errorName(status)); return;}}
     89 
        // REGEX_ASSERT(expr)
        //   Reports an error with errln() when (expr) compares equal to FALSE.
        //   Does not return; the test continues after a failed assertion.
     90 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
     91 
        // REGEX_ASSERT_FAIL(expr, errcode)
        //   Declares its own local 'status' (initialized to U_ZERO_ERROR), which hides
        //   any 'status' of the enclosing scope while (expr) is evaluated.  (expr) is
        //   expected to pass 'status' to the call under test.  Reports with dataerrln()
        //   when the resulting status differs from errcode.  Does not return.
     92 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
     93 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
     94     __LINE__, u_errorName(errcode), u_errorName(status));};}
     95 
        // REGEX_CHECK_STATUS_L(line)
        //   Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number,
        //   prints the status as a number rather than a name, reports with errln(),
        //   and does NOT return from the enclosing function.
     96 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
     97     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
     98 
        // REGEX_ASSERT_L(expr, line)
        //   Like REGEX_ASSERT, but also prints a caller-supplied line number and, unlike
        //   REGEX_ASSERT, executes a bare 'return;' after reporting the failure.
     99 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    100     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
    101 
    102 
    103 
    104 //---------------------------------------------------------------------------
    105 //
    106 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    107 //                       for the lookingAt() and matches() functions.
    108 //
    109 //       usage:
    110 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    111 //
    112 //          The expected results are UBool - TRUE or FALSE.
    113 //          The input text is unescaped.  The pattern is not.
    114 //
    115 //
    116 //---------------------------------------------------------------------------
    117 
    118 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
    119 
    120 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    121     const UnicodeString pattern(pat, -1, US_INV);
    122     const UnicodeString inputText(text, -1, US_INV);
    123     UErrorCode          status  = U_ZERO_ERROR;
    124     UParseError         pe;
    125     RegexPattern        *REPattern = NULL;
    126     RegexMatcher        *REMatcher = NULL;
    127     UBool               retVal     = TRUE;
    128 
    129     UnicodeString patString(pat, -1, US_INV);
    130     REPattern = RegexPattern::compile(patString, 0, pe, status);
    131     if (U_FAILURE(status)) {
    132         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    133             line, u_errorName(status));
    134         return FALSE;
    135     }
    136     if (line==376) { RegexPatternDump(REPattern);}
    137 
    138     UnicodeString inputString(inputText);
    139     UnicodeString unEscapedInput = inputString.unescape();
    140     REMatcher = REPattern->matcher(unEscapedInput, status);
    141     if (U_FAILURE(status)) {
    142         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    143             line, u_errorName(status));
    144         return FALSE;
    145     }
    146 
    147     UBool actualmatch;
    148     actualmatch = REMatcher->lookingAt(status);
    149     if (U_FAILURE(status)) {
    150         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    151             line, u_errorName(status));
    152         retVal =  FALSE;
    153     }
    154     if (actualmatch != looking) {
    155         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    156         retVal = FALSE;
    157     }
    158 
    159     status = U_ZERO_ERROR;
    160     actualmatch = REMatcher->matches(status);
    161     if (U_FAILURE(status)) {
    162         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    163             line, u_errorName(status));
    164         retVal = FALSE;
    165     }
    166     if (actualmatch != match) {
    167         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    168         retVal = FALSE;
    169     }
    170 
    171     if (retVal == FALSE) {
    172         RegexPatternDump(REPattern);
    173     }
    174 
    175     delete REPattern;
    176     delete REMatcher;
    177     return retVal;
    178 }
    179 
    180 
    181 
    182 
    183 
    184 //---------------------------------------------------------------------------
    185 //
    186 //    REGEX_ERR       Macro + invocation function to simplify writing
    187 //                       regex tests for incorrect patterns.
    188 //
    189 //       usage:
    190 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    191 //
    192 //---------------------------------------------------------------------------
    193 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    194 
    195 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    196                           UErrorCode expectedStatus, int32_t line) {
    197     UnicodeString       pattern(pat);
    198 
    199     UErrorCode          status         = U_ZERO_ERROR;
    200     UParseError         pe;
    201     RegexPattern        *callerPattern = NULL;
    202 
    203     //
    204     //  Compile the caller's pattern
    205     //
    206     UnicodeString patString(pat);
    207     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    208     if (status != expectedStatus) {
    209         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    210     } else {
    211         if (status != U_ZERO_ERROR) {
    212             if (pe.line != errLine || pe.offset != errCol) {
    213                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    214                     line, errLine, errCol, pe.line, pe.offset);
    215             }
    216         }
    217     }
    218 
    219     delete callerPattern;
    220 }
    221 
    222 
    223 
    224 //---------------------------------------------------------------------------
    225 //
    226 //      Basic      Check for basic functionality of regex pattern matching.
    227 //                 Avoid the use of REGEX_FIND test macro, which has
    228 //                 substantial dependencies on basic Regex functionality.
    229 //
    230 //---------------------------------------------------------------------------
    231 void RegexTest::Basic() {
            // Every check below is one REGEX_TESTLM(pattern, input, lookingAt, matches)
            // invocation: the pattern is compiled, lookingAt() and matches() are run
            // against the unescaped input, and an error is reported when either result
            // differs from the expected UBool.  The invocation's source line is passed
            // along (via __LINE__) and appears in the failure messages.
    232 
    233 
    234 //
    235 // Debug - slide failing test cases early
    236 //
        // Disabled scratch area.  If enabled, it runs the snippet and then terminates
        // the whole process with exit(1), so none of the checks below would run.
    237 #if 0
    238     {
    239         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    240         UParseError pe;
    241         UErrorCode  status = U_ZERO_ERROR;
    242         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
    243         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    244         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    245     }
    246     exit(1);
    247 #endif
    248 
    249 
    250     //
    251     // Pattern with parentheses
    252     //
    253     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    254     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    255     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    256 
    257     //
    258     // Patterns with *
    259     //
    260     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    261     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    262     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    263     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    264     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    265 
    266     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    267     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    268 
    269 
    270     //
    271     //  Patterns with "."
    272     //
    273     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    274     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    275     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    276     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    277     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    278     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    279     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    280     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    281 
    282     //
    283     //  Patterns with * applied to chars at end of literal string
    284     //
    285     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    286     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    287 
    288     //
    289     //  Supplemental chars match as single chars, not a pair of surrogates.
    290     //
    291     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    292     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    293     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    294 
    295 
    296     //
    297     //  UnicodeSets in the pattern
    298     //
    299     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    300     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    301     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    302     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
            // NOTE(review): the next line repeats the previous check verbatim.
    303     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    304     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    305 
    306     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    307     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    308     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    309     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    310     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    311 
    312     //
    313     //   OR operator in patterns
    314     //
    315     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    316     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    317     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    318     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    319 
    320     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    321     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    322     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    323     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    324     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    325     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    326 
    327     //
    328     //  +
    329     //
    330     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    331     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    332     REGEX_TESTLM("b+", "", FALSE, FALSE);
    333     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    334     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    335     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    336 
    337     //
    338     //   ?
    339     //
    340     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    341     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    342     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    343     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    344     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    345     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    346     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    347     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    348     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    349 
    350     //
    351     //  Escape sequences that become single literal chars, handled internally
    352     //   by ICU's Unescape.
    353     //
    354 
    355     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    356     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    357     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    358     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    359     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    360     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    361     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    362     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    363     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    364     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    365 
    366     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    367     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    368 
    369     // Escape of special chars in patterns
    370     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    371 
    372 
    373 }
    374 
    375 
    376 //---------------------------------------------------------------------------
    377 //
    378 //      API_Match   Test that the API for class RegexMatcher
    379 //                  is present and nominally working, but excluding functions
    380 //                  implementing replace operations.
    381 //
    382 //---------------------------------------------------------------------------
    383 void RegexTest::API_Match() {
    384     UParseError         pe;
    385     UErrorCode          status=U_ZERO_ERROR;
    386     int32_t             flags = 0;
    387 
    388     //
    389     // Debug - slide failing test cases early
    390     //
    391 #if 0
    392     {
    393     }
    394     return;
    395 #endif
    396 
    397     //
    398     // Simple pattern compilation
    399     //
    400     {
    401         UnicodeString       re("abc");
    402         RegexPattern        *pat2;
    403         pat2 = RegexPattern::compile(re, flags, pe, status);
    404         REGEX_CHECK_STATUS;
    405 
    406         UnicodeString inStr1 = "abcdef this is a test";
    407         UnicodeString instr2 = "not abc";
    408         UnicodeString empty  = "";
    409 
    410 
    411         //
    412         // Matcher creation and reset.
    413         //
    414         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    415         REGEX_CHECK_STATUS;
    416         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    417         REGEX_ASSERT(m1->input() == inStr1);
    418         m1->reset(instr2);
    419         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    420         REGEX_ASSERT(m1->input() == instr2);
    421         m1->reset(inStr1);
    422         REGEX_ASSERT(m1->input() == inStr1);
    423         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    424         m1->reset(empty);
    425         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    426         REGEX_ASSERT(m1->input() == empty);
    427         REGEX_ASSERT(&m1->pattern() == pat2);
    428 
    429         //
    430         //  reset(pos, status)
    431         //
    432         m1->reset(inStr1);
    433         m1->reset(4, status);
    434         REGEX_CHECK_STATUS;
    435         REGEX_ASSERT(m1->input() == inStr1);
    436         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    437 
    438         m1->reset(-1, status);
    439         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    440         status = U_ZERO_ERROR;
    441 
    442         m1->reset(0, status);
    443         REGEX_CHECK_STATUS;
    444         status = U_ZERO_ERROR;
    445 
    446         int32_t len = m1->input().length();
    447         m1->reset(len-1, status);
    448         REGEX_CHECK_STATUS;
    449         status = U_ZERO_ERROR;
    450 
    451         m1->reset(len, status);
    452         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    453         status = U_ZERO_ERROR;
    454 
    455         //
    456         // match(pos, status)
    457         //
    458         m1->reset(instr2);
    459         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    460         m1->reset();
    461         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    462         m1->reset();
    463         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    464         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    465         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    466         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    467 
    468         // Match() at end of string should fail, but should not
    469         //  be an error.
    470         status = U_ZERO_ERROR;
    471         len = m1->input().length();
    472         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    473         REGEX_CHECK_STATUS;
    474 
    475         // Match beyond end of string should fail with an error.
    476         status = U_ZERO_ERROR;
    477         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    478         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    479 
    480         // Successful match at end of string.
    481         {
    482             status = U_ZERO_ERROR;
    483             RegexMatcher m("A?", 0, status);  // will match zero length string.
    484             REGEX_CHECK_STATUS;
    485             m.reset(inStr1);
    486             len = inStr1.length();
    487             REGEX_ASSERT(m.matches(len, status) == TRUE);
    488             REGEX_CHECK_STATUS;
    489             m.reset(empty);
    490             REGEX_ASSERT(m.matches(0, status) == TRUE);
    491             REGEX_CHECK_STATUS;
    492         }
    493 
    494 
    495         //
    496         // lookingAt(pos, status)
    497         //
    498         status = U_ZERO_ERROR;
    499         m1->reset(instr2);  // "not abc"
    500         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    501         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    502         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    503         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    504         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    505         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    506         status = U_ZERO_ERROR;
    507         len = m1->input().length();
    508         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    509         REGEX_CHECK_STATUS;
    510         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    511         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    512 
    513         delete m1;
    514         delete pat2;
    515     }
    516 
    517 
    518     //
    519     // Capture Group.
    520     //     RegexMatcher::start();
    521     //     RegexMatcher::end();
    522     //     RegexMatcher::groupCount();
    523     //
    524     {
    525         int32_t             flags=0;
    526         UParseError         pe;
    527         UErrorCode          status=U_ZERO_ERROR;
    528 
    529         UnicodeString       re("01(23(45)67)(.*)");
    530         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    531         REGEX_CHECK_STATUS;
    532         UnicodeString data = "0123456789";
    533 
    534         RegexMatcher *matcher = pat->matcher(data, status);
    535         REGEX_CHECK_STATUS;
    536         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
    537         static const int32_t matchStarts[] = {0,  2, 4, 8};
    538         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    539         int32_t i;
    540         for (i=0; i<4; i++) {
    541             int32_t actualStart = matcher->start(i, status);
    542             REGEX_CHECK_STATUS;
    543             if (actualStart != matchStarts[i]) {
    544                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    545                     __LINE__, i, matchStarts[i], actualStart);
    546             }
    547             int32_t actualEnd = matcher->end(i, status);
    548             REGEX_CHECK_STATUS;
    549             if (actualEnd != matchEnds[i]) {
    550                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    551                     __LINE__, i, matchEnds[i], actualEnd);
    552             }
    553         }
    554 
    555         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    556         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    557 
    558         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    559         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    560         matcher->reset();
    561         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    562 
    563         matcher->lookingAt(status);
    564         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    565         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    566         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    567         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    568         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    569         REGEX_CHECK_STATUS;
    570         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    571         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    572         matcher->reset();
    573         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    574 
    575         delete matcher;
    576         delete pat;
    577 
    578     }
    579 
    580     //
    581     //  find
    582     //
    583     {
    584         int32_t             flags=0;
    585         UParseError         pe;
    586         UErrorCode          status=U_ZERO_ERROR;
    587 
    588         UnicodeString       re("abc");
    589         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    590         REGEX_CHECK_STATUS;
    591         UnicodeString data = ".abc..abc...abc..";
    592         //                    012345678901234567
    593 
    594         RegexMatcher *matcher = pat->matcher(data, status);
    595         REGEX_CHECK_STATUS;
    596         REGEX_ASSERT(matcher->find());
    597         REGEX_ASSERT(matcher->start(status) == 1);
    598         REGEX_ASSERT(matcher->find());
    599         REGEX_ASSERT(matcher->start(status) == 6);
    600         REGEX_ASSERT(matcher->find());
    601         REGEX_ASSERT(matcher->start(status) == 12);
    602         REGEX_ASSERT(matcher->find() == FALSE);
    603         REGEX_ASSERT(matcher->find() == FALSE);
    604 
    605         matcher->reset();
    606         REGEX_ASSERT(matcher->find());
    607         REGEX_ASSERT(matcher->start(status) == 1);
    608 
    609         REGEX_ASSERT(matcher->find(0, status));
    610         REGEX_ASSERT(matcher->start(status) == 1);
    611         REGEX_ASSERT(matcher->find(1, status));
    612         REGEX_ASSERT(matcher->start(status) == 1);
    613         REGEX_ASSERT(matcher->find(2, status));
    614         REGEX_ASSERT(matcher->start(status) == 6);
    615         REGEX_ASSERT(matcher->find(12, status));
    616         REGEX_ASSERT(matcher->start(status) == 12);
    617         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    618         REGEX_ASSERT(matcher->find(16, status) == FALSE);
    619         REGEX_ASSERT(matcher->find(17, status) == FALSE);
    620         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
    621 
    622         status = U_ZERO_ERROR;
    623         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    624         status = U_ZERO_ERROR;
    625         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
    626 
    627         REGEX_ASSERT(matcher->groupCount() == 0);
    628 
    629         delete matcher;
    630         delete pat;
    631     }
    632 
    633 
    634     //
    635     //  find, with \G in pattern (true if at the end of a previous match).
    636     //
    637     {
    638         int32_t             flags=0;
    639         UParseError         pe;
    640         UErrorCode          status=U_ZERO_ERROR;
    641 
    642         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
    643         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    644         REGEX_CHECK_STATUS;
    645         UnicodeString data = ".abcabc.abc..";
    646         //                    012345678901234567
    647 
    648         RegexMatcher *matcher = pat->matcher(data, status);
    649         REGEX_CHECK_STATUS;
    650         REGEX_ASSERT(matcher->find());
    651         REGEX_ASSERT(matcher->start(status) == 0);
    652         REGEX_ASSERT(matcher->start(1, status) == -1);
    653         REGEX_ASSERT(matcher->start(2, status) == 1);
    654 
    655         REGEX_ASSERT(matcher->find());
    656         REGEX_ASSERT(matcher->start(status) == 4);
    657         REGEX_ASSERT(matcher->start(1, status) == 4);
    658         REGEX_ASSERT(matcher->start(2, status) == -1);
    659         REGEX_CHECK_STATUS;
    660 
    661         delete matcher;
    662         delete pat;
    663     }
    664 
    665     //
    666     //   find with zero length matches, match position should bump ahead
    667     //     to prevent loops.
    668     //
    669     {
    670         int32_t                 i;
    671         UErrorCode          status=U_ZERO_ERROR;
    672         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
    673                                                       //   using an always-true look-ahead.
    674         REGEX_CHECK_STATUS;
    675         UnicodeString s("    ");
    676         m.reset(s);
    677         for (i=0; ; i++) {
    678             if (m.find() == FALSE) {
    679                 break;
    680             }
    681             REGEX_ASSERT(m.start(status) == i);
    682             REGEX_ASSERT(m.end(status) == i);
    683         }
    684         REGEX_ASSERT(i==5);
    685 
    686         // Check that the bump goes over surrogate pairs OK
    687         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
    688         s = s.unescape();
    689         m.reset(s);
    690         for (i=0; ; i+=2) {
    691             if (m.find() == FALSE) {
    692                 break;
    693             }
    694             REGEX_ASSERT(m.start(status) == i);
    695             REGEX_ASSERT(m.end(status) == i);
    696         }
    697         REGEX_ASSERT(i==10);
    698     }
    699     {
    700         // find() loop breaking test.
    701         //        with pattern of /.?/, should see a series of one char matches, then a single
    702         //        match of zero length at the end of the input string.
    703         int32_t                 i;
    704         UErrorCode          status=U_ZERO_ERROR;
    705         RegexMatcher        m(".?", 0, status);
    706         REGEX_CHECK_STATUS;
    707         UnicodeString s("    ");
    708         m.reset(s);
    709         for (i=0; ; i++) {
    710             if (m.find() == FALSE) {
    711                 break;
    712             }
    713             REGEX_ASSERT(m.start(status) == i);
    714             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
    715         }
    716         REGEX_ASSERT(i==5);
    717     }
    718 
    719 
    720     //
    721     // Matchers with no input string behave as if they had an empty input string.
    722     //
    723 
    724     {
    725         UErrorCode status = U_ZERO_ERROR;
    726         RegexMatcher  m(".?", 0, status);
    727         REGEX_CHECK_STATUS;
    728         REGEX_ASSERT(m.find());
    729         REGEX_ASSERT(m.start(status) == 0);
    730         REGEX_ASSERT(m.input() == "");
    731     }
    732     {
    733         UErrorCode status = U_ZERO_ERROR;
    734         RegexPattern  *p = RegexPattern::compile(".", 0, status);
    735         RegexMatcher  *m = p->matcher(status);
    736         REGEX_CHECK_STATUS;
    737 
    738         REGEX_ASSERT(m->find() == FALSE);
    739         REGEX_ASSERT(m->input() == "");
    740         delete m;
    741         delete p;
    742     }
    743 
    744     //
    745     // Regions
    746     //
    747     {
    748         UErrorCode status = U_ZERO_ERROR;
    749         UnicodeString testString("This is test data");
    750         RegexMatcher m(".*", testString,  0, status);
    751         REGEX_CHECK_STATUS;
    752         REGEX_ASSERT(m.regionStart() == 0);
    753         REGEX_ASSERT(m.regionEnd() == testString.length());
    754         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    755         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    756 
    757         m.region(2,4, status);
    758         REGEX_CHECK_STATUS;
    759         REGEX_ASSERT(m.matches(status));
    760         REGEX_ASSERT(m.start(status)==2);
    761         REGEX_ASSERT(m.end(status)==4);
    762         REGEX_CHECK_STATUS;
    763 
    764         m.reset();
    765         REGEX_ASSERT(m.regionStart() == 0);
    766         REGEX_ASSERT(m.regionEnd() == testString.length());
    767 
    768         UnicodeString shorterString("short");
    769         m.reset(shorterString);
    770         REGEX_ASSERT(m.regionStart() == 0);
    771         REGEX_ASSERT(m.regionEnd() == shorterString.length());
    772 
    773         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    774         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
    775         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
    776         REGEX_ASSERT(&m == &m.reset());
    777         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
    778 
    779         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
    780         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    781         REGEX_ASSERT(&m == &m.reset());
    782         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
    783 
    784         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    785         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
    786         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
    787         REGEX_ASSERT(&m == &m.reset());
    788         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
    789 
    790         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
    791         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    792         REGEX_ASSERT(&m == &m.reset());
    793         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
    794 
    795     }
    796 
    797     //
    798     // hitEnd() and requireEnd()
    799     //
    800     {
    801         UErrorCode status = U_ZERO_ERROR;
    802         UnicodeString testString("aabb");
    803         RegexMatcher m1(".*", testString,  0, status);
    804         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
    805         REGEX_ASSERT(m1.hitEnd() == TRUE);
    806         REGEX_ASSERT(m1.requireEnd() == FALSE);
    807         REGEX_CHECK_STATUS;
    808 
    809         status = U_ZERO_ERROR;
    810         RegexMatcher m2("a*", testString, 0, status);
    811         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
    812         REGEX_ASSERT(m2.hitEnd() == FALSE);
    813         REGEX_ASSERT(m2.requireEnd() == FALSE);
    814         REGEX_CHECK_STATUS;
    815 
    816         status = U_ZERO_ERROR;
    817         RegexMatcher m3(".*$", testString, 0, status);
    818         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
    819         REGEX_ASSERT(m3.hitEnd() == TRUE);
    820         REGEX_ASSERT(m3.requireEnd() == TRUE);
    821         REGEX_CHECK_STATUS;
    822     }
    823 
    824 
    825     //
    826     // Compilation error on reset with UChar *
    827     //   These were a hazard that people were stumbling over with runtime errors.
    828     //   Changed them to compiler errors by adding private methods that more closely
    829     //   matched the incorrect use of the functions.
    830     //
    831 #if 0
    832     {
    833         UErrorCode status = U_ZERO_ERROR;
    834         UChar ucharString[20];
    835         RegexMatcher m(".", 0, status);
    836         m.reset(ucharString);  // should not compile.
    837 
    838         RegexPattern *p = RegexPattern::compile(".", 0, status);
    839         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
    840 
    841         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
    842     }
    843 #endif
    844 
    845     //
    846     //  Time Outs.
    847     //       Note:  These tests will need to be changed when the regexp engine is
    848     //              able to detect and cut short the exponential time behavior on
    849     //              this type of match.
    850     //
    851     {
    852         UErrorCode status = U_ZERO_ERROR;
    853         //    Enough 'a's in the string to cause the match to time out.
    854         //       (Each additional 'a' doubles the time)
    855         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
    856         RegexMatcher matcher("(a+)+b", testString, 0, status);
    857         REGEX_CHECK_STATUS;
    858         REGEX_ASSERT(matcher.getTimeLimit() == 0);
    859         matcher.setTimeLimit(100, status);
    860         REGEX_ASSERT(matcher.getTimeLimit() == 100);
    861         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
    862         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
    863     }
    864     {
    865         UErrorCode status = U_ZERO_ERROR;
    866         //   Few enough 'a's to slip in under the time limit.
    867         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
    868         RegexMatcher matcher("(a+)+b", testString, 0, status);
    869         REGEX_CHECK_STATUS;
    870         matcher.setTimeLimit(100, status);
    871         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
    872         REGEX_CHECK_STATUS;
    873     }
    874 
    875     //
    876     //  Stack Limits
    877     //
    878     {
    879         UErrorCode status = U_ZERO_ERROR;
    880         UnicodeString testString(600000, 0x41, 600000);  // Length 600,000, filled with 'A'
    881 
    882         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
    883         //   of the '+', and makes the stack frames larger.
    884         RegexMatcher matcher("(A)+A$", testString, 0, status);
    885 
    886         // With the default stack, this match should fail to run
    887         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
    888         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
    889 
    890         // With unlimited stack, it should run
    891         status = U_ZERO_ERROR;
    892         matcher.setStackLimit(0, status);
    893         REGEX_CHECK_STATUS;
    894         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
    895         REGEX_CHECK_STATUS;
    896         REGEX_ASSERT(matcher.getStackLimit() == 0);
    897 
    898         // With a limited stack, the match should fail
    899         status = U_ZERO_ERROR;
    900         matcher.setStackLimit(10000, status);
    901         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
    902         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
    903         REGEX_ASSERT(matcher.getStackLimit() == 10000);
    904     }
    905 
    906         // A pattern that doesn't save state should work with
    907         //   a minimal sized stack
    908     {
    909         UErrorCode status = U_ZERO_ERROR;
    910         UnicodeString testString = "abc";
    911         RegexMatcher matcher("abc", testString, 0, status);
    912         REGEX_CHECK_STATUS;
    913         matcher.setStackLimit(30, status);
    914         REGEX_CHECK_STATUS;
    915         REGEX_ASSERT(matcher.matches(status) == TRUE);
    916         REGEX_CHECK_STATUS;
    917         REGEX_ASSERT(matcher.getStackLimit() == 30);
    918 
    919         // Negative stack sizes should fail
    920         status = U_ZERO_ERROR;
    921         matcher.setStackLimit(1000, status);
    922         REGEX_CHECK_STATUS;
    923         matcher.setStackLimit(-1, status);
    924         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
    925         REGEX_ASSERT(matcher.getStackLimit() == 1000);
    926     }
    927 
    928 
    929 }
    930 
    931 
    932 
    933 
    934 
    935 
    936 //---------------------------------------------------------------------------
    937 //
    938 //      API_Replace        API test for class RegexMatcher, testing the
    939 //                         Replace family of functions.
    940 //
    941 //---------------------------------------------------------------------------
    942 void RegexTest::API_Replace() {
    943     //
    944     //  Replace
    945     //
    946     int32_t             flags=0;
    947     UParseError         pe;
    948     UErrorCode          status=U_ZERO_ERROR;
    949 
    950     UnicodeString       re("abc");
    951     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    952     REGEX_CHECK_STATUS;
    953     UnicodeString data = ".abc..abc...abc..";
    954     //                    012345678901234567
    955     RegexMatcher *matcher = pat->matcher(data, status);
    956 
    957     //
    958     //  Plain vanilla matches.
    959     //
    960     UnicodeString  dest;
    961     dest = matcher->replaceFirst("yz", status);
    962     REGEX_CHECK_STATUS;
    963     REGEX_ASSERT(dest == ".yz..abc...abc..");
    964 
    965     dest = matcher->replaceAll("yz", status);
    966     REGEX_CHECK_STATUS;
    967     REGEX_ASSERT(dest == ".yz..yz...yz..");
    968 
    969     //
    970     //  Plain vanilla non-matches.
    971     //
    972     UnicodeString d2 = ".abx..abx...abx..";
    973     matcher->reset(d2);
    974     dest = matcher->replaceFirst("yz", status);
    975     REGEX_CHECK_STATUS;
    976     REGEX_ASSERT(dest == ".abx..abx...abx..");
    977 
    978     dest = matcher->replaceAll("yz", status);
    979     REGEX_CHECK_STATUS;
    980     REGEX_ASSERT(dest == ".abx..abx...abx..");
    981 
    982     //
    983     // Empty source string
    984     //
    985     UnicodeString d3 = "";
    986     matcher->reset(d3);
    987     dest = matcher->replaceFirst("yz", status);
    988     REGEX_CHECK_STATUS;
    989     REGEX_ASSERT(dest == "");
    990 
    991     dest = matcher->replaceAll("yz", status);
    992     REGEX_CHECK_STATUS;
    993     REGEX_ASSERT(dest == "");
    994 
    995     //
    996     // Empty substitution string
    997     //
    998     matcher->reset(data);              // ".abc..abc...abc.."
    999     dest = matcher->replaceFirst("", status);
   1000     REGEX_CHECK_STATUS;
   1001     REGEX_ASSERT(dest == "...abc...abc..");
   1002 
   1003     dest = matcher->replaceAll("", status);
   1004     REGEX_CHECK_STATUS;
   1005     REGEX_ASSERT(dest == "........");
   1006 
   1007     //
   1008     // match whole string
   1009     //
   1010     UnicodeString d4 = "abc";
   1011     matcher->reset(d4);
   1012     dest = matcher->replaceFirst("xyz", status);
   1013     REGEX_CHECK_STATUS;
   1014     REGEX_ASSERT(dest == "xyz");
   1015 
   1016     dest = matcher->replaceAll("xyz", status);
   1017     REGEX_CHECK_STATUS;
   1018     REGEX_ASSERT(dest == "xyz");
   1019 
   1020     //
   1021     // Capture Group, simple case
   1022     //
   1023     UnicodeString       re2("a(..)");
   1024     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1025     REGEX_CHECK_STATUS;
   1026     UnicodeString d5 = "abcdefg";
   1027     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1028     REGEX_CHECK_STATUS;
   1029     dest = matcher2->replaceFirst("$1$1", status);
   1030     REGEX_CHECK_STATUS;
   1031     REGEX_ASSERT(dest == "bcbcdefg");
   1032 
   1033     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1034     REGEX_CHECK_STATUS;
   1035     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1036 
   1037     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1038     REGEX_CHECK_STATUS;
   1039     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
   1040 
   1041     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1042     replacement = replacement.unescape();
   1043     dest = matcher2->replaceFirst(replacement, status);
   1044     REGEX_CHECK_STATUS;
   1045     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1046 
   1047     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1048 
   1049 
   1050     //
   1051     // Replacement String with \u hex escapes
   1052     //
   1053     {
   1054         UnicodeString  src = "abc 1 abc 2 abc 3";
   1055         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1056         matcher->reset(src);
   1057         UnicodeString  result = matcher->replaceAll(substitute, status);
   1058         REGEX_CHECK_STATUS;
   1059         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1060     }
   1061     {
   1062         UnicodeString  src = "abc !";
   1063         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1064         matcher->reset(src);
   1065         UnicodeString  result = matcher->replaceAll(substitute, status);
   1066         REGEX_CHECK_STATUS;
   1067         UnicodeString expected = UnicodeString("--");
   1068         expected.append((UChar32)0x10000);
   1069         expected.append("-- !");
   1070         REGEX_ASSERT(result == expected);
   1071     }
   1072     // TODO:  need more through testing of capture substitutions.
   1073 
   1074     // Bug 4057
   1075     //
   1076     {
   1077         status = U_ZERO_ERROR;
   1078         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1079         RegexMatcher m("ss(.*?)ee", 0, status);
   1080         REGEX_CHECK_STATUS;
   1081         UnicodeString result;
   1082 
   1083         // Multiple finds do NOT bump up the previous appendReplacement postion.
   1084         m.reset(s);
   1085         m.find();
   1086         m.find();
   1087         m.appendReplacement(result, "ooh", status);
   1088         REGEX_CHECK_STATUS;
   1089         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1090 
   1091         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
   1092         status = U_ZERO_ERROR;
   1093         result.truncate(0);
   1094         m.reset(10, status);
   1095         m.find();
   1096         m.find();
   1097         m.appendReplacement(result, "ooh", status);
   1098         REGEX_CHECK_STATUS;
   1099         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1100 
   1101         // find() at interior of string, appendReplacemnt still starts at beginning.
   1102         status = U_ZERO_ERROR;
   1103         result.truncate(0);
   1104         m.reset();
   1105         m.find(10, status);
   1106         m.find();
   1107         m.appendReplacement(result, "ooh", status);
   1108         REGEX_CHECK_STATUS;
   1109         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1110 
   1111         m.appendTail(result);
   1112         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1113 
   1114     }
   1115 
   1116     delete matcher2;
   1117     delete pat2;
   1118     delete matcher;
   1119     delete pat;
   1120 }
   1121 
   1122 
   1123 //---------------------------------------------------------------------------
   1124 //
   1125 //      API_Pattern       Test that the API for class RegexPattern is
   1126 //                        present and nominally working.
   1127 //
   1128 //---------------------------------------------------------------------------
   1129 void RegexTest::API_Pattern() {
   1130     RegexPattern        pata;    // Test default constructor to not crash.
   1131     RegexPattern        patb;
   1132 
   1133     REGEX_ASSERT(pata == patb);
   1134     REGEX_ASSERT(pata == pata);
   1135 
   1136     UnicodeString re1("abc[a-l][m-z]");
   1137     UnicodeString re2("def");
   1138     UErrorCode    status = U_ZERO_ERROR;
   1139     UParseError   pe;
   1140 
   1141     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1142     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1143     REGEX_CHECK_STATUS;
   1144     REGEX_ASSERT(*pat1 == *pat1);
   1145     REGEX_ASSERT(*pat1 != pata);
   1146 
   1147     // Assign
   1148     patb = *pat1;
   1149     REGEX_ASSERT(patb == *pat1);
   1150 
   1151     // Copy Construct
   1152     RegexPattern patc(*pat1);
   1153     REGEX_ASSERT(patc == *pat1);
   1154     REGEX_ASSERT(patb == patc);
   1155     REGEX_ASSERT(pat1 != pat2);
   1156     patb = *pat2;
   1157     REGEX_ASSERT(patb != patc);
   1158     REGEX_ASSERT(patb == *pat2);
   1159 
   1160     // Compile with no flags.
   1161     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1162     REGEX_ASSERT(*pat1a == *pat1);
   1163 
   1164     REGEX_ASSERT(pat1a->flags() == 0);
   1165 
   1166     // Compile with different flags should be not equal
   1167     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1168     REGEX_CHECK_STATUS;
   1169 
   1170     REGEX_ASSERT(*pat1b != *pat1a);
   1171     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1172     REGEX_ASSERT(pat1a->flags() == 0);
   1173     delete pat1b;
   1174 
   1175     // clone
   1176     RegexPattern *pat1c = pat1->clone();
   1177     REGEX_ASSERT(*pat1c == *pat1);
   1178     REGEX_ASSERT(*pat1c != *pat2);
   1179 
   1180     delete pat1c;
   1181     delete pat1a;
   1182     delete pat1;
   1183     delete pat2;
   1184 
   1185 
   1186     //
   1187     //   Verify that a matcher created from a cloned pattern works.
   1188     //     (Jitterbug 3423)
   1189     //
   1190     {
   1191         UErrorCode     status     = U_ZERO_ERROR;
   1192         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1193         RegexPattern  *pClone     = pSource->clone();
   1194         delete         pSource;
   1195         RegexMatcher  *mFromClone = pClone->matcher(status);
   1196         REGEX_CHECK_STATUS;
   1197         UnicodeString s = "Hello World";
   1198         mFromClone->reset(s);
   1199         REGEX_ASSERT(mFromClone->find() == TRUE);
   1200         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1201         REGEX_ASSERT(mFromClone->find() == TRUE);
   1202         REGEX_ASSERT(mFromClone->group(status) == "World");
   1203         REGEX_ASSERT(mFromClone->find() == FALSE);
   1204         delete mFromClone;
   1205         delete pClone;
   1206     }
   1207 
   1208     //
   1209     //   matches convenience API
   1210     //
   1211     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1212     REGEX_CHECK_STATUS;
   1213     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1214     REGEX_CHECK_STATUS;
   1215     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1216     REGEX_CHECK_STATUS;
   1217     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1218     REGEX_CHECK_STATUS;
   1219     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1220     REGEX_CHECK_STATUS;
   1221     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1222     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1223     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1224 
   1225 
   1226     //
   1227     // Split()
   1228     //
   1229     status = U_ZERO_ERROR;
   1230     pat1 = RegexPattern::compile(" +",  pe, status);
   1231     REGEX_CHECK_STATUS;
   1232     UnicodeString  fields[10];
   1233 
   1234     int32_t n;
   1235     n = pat1->split("Now is the time", fields, 10, status);
   1236     REGEX_CHECK_STATUS;
   1237     REGEX_ASSERT(n==4);
   1238     REGEX_ASSERT(fields[0]=="Now");
   1239     REGEX_ASSERT(fields[1]=="is");
   1240     REGEX_ASSERT(fields[2]=="the");
   1241     REGEX_ASSERT(fields[3]=="time");
   1242     REGEX_ASSERT(fields[4]=="");
   1243 
   1244     n = pat1->split("Now is the time", fields, 2, status);
   1245     REGEX_CHECK_STATUS;
   1246     REGEX_ASSERT(n==2);
   1247     REGEX_ASSERT(fields[0]=="Now");
   1248     REGEX_ASSERT(fields[1]=="is the time");
   1249     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1250 
   1251     fields[1] = "*";
   1252     status = U_ZERO_ERROR;
   1253     n = pat1->split("Now is the time", fields, 1, status);
   1254     REGEX_CHECK_STATUS;
   1255     REGEX_ASSERT(n==1);
   1256     REGEX_ASSERT(fields[0]=="Now is the time");
   1257     REGEX_ASSERT(fields[1]=="*");
   1258     status = U_ZERO_ERROR;
   1259 
   1260     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1261     REGEX_CHECK_STATUS;
   1262     REGEX_ASSERT(n==5);
   1263     REGEX_ASSERT(fields[0]=="");
   1264     REGEX_ASSERT(fields[1]=="Now");
   1265     REGEX_ASSERT(fields[2]=="is");
   1266     REGEX_ASSERT(fields[3]=="the");
   1267     REGEX_ASSERT(fields[4]=="time");
   1268     REGEX_ASSERT(fields[5]=="");
   1269 
   1270     n = pat1->split("     ", fields, 10, status);
   1271     REGEX_CHECK_STATUS;
   1272     REGEX_ASSERT(n==1);
   1273     REGEX_ASSERT(fields[0]=="");
   1274 
   1275     fields[0] = "foo";
   1276     n = pat1->split("", fields, 10, status);
   1277     REGEX_CHECK_STATUS;
   1278     REGEX_ASSERT(n==0);
   1279     REGEX_ASSERT(fields[0]=="foo");
   1280 
   1281     delete pat1;
   1282 
   1283     //  split, with a pattern with (capture)
   1284     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1285     REGEX_CHECK_STATUS;
   1286 
   1287     status = U_ZERO_ERROR;
   1288     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1289     REGEX_CHECK_STATUS;
   1290     REGEX_ASSERT(n==6);
   1291     REGEX_ASSERT(fields[0]=="");
   1292     REGEX_ASSERT(fields[1]=="a");
   1293     REGEX_ASSERT(fields[2]=="Now is ");
   1294     REGEX_ASSERT(fields[3]=="b");
   1295     REGEX_ASSERT(fields[4]=="the time");
   1296     REGEX_ASSERT(fields[5]=="c");
   1297     REGEX_ASSERT(fields[6]=="");
   1298     REGEX_ASSERT(status==U_ZERO_ERROR);
   1299 
   1300     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1301     REGEX_CHECK_STATUS;
   1302     REGEX_ASSERT(n==6);
   1303     REGEX_ASSERT(fields[0]=="  ");
   1304     REGEX_ASSERT(fields[1]=="a");
   1305     REGEX_ASSERT(fields[2]=="Now is ");
   1306     REGEX_ASSERT(fields[3]=="b");
   1307     REGEX_ASSERT(fields[4]=="the time");
   1308     REGEX_ASSERT(fields[5]=="c");
   1309     REGEX_ASSERT(fields[6]=="");
   1310 
   1311     status = U_ZERO_ERROR;
   1312     fields[6] = "foo";
   1313     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1314     REGEX_CHECK_STATUS;
   1315     REGEX_ASSERT(n==6);
   1316     REGEX_ASSERT(fields[0]=="  ");
   1317     REGEX_ASSERT(fields[1]=="a");
   1318     REGEX_ASSERT(fields[2]=="Now is ");
   1319     REGEX_ASSERT(fields[3]=="b");
   1320     REGEX_ASSERT(fields[4]=="the time");
   1321     REGEX_ASSERT(fields[5]=="c");
   1322     REGEX_ASSERT(fields[6]=="foo");
   1323 
   1324     status = U_ZERO_ERROR;
   1325     fields[5] = "foo";
   1326     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1327     REGEX_CHECK_STATUS;
   1328     REGEX_ASSERT(n==5);
   1329     REGEX_ASSERT(fields[0]=="  ");
   1330     REGEX_ASSERT(fields[1]=="a");
   1331     REGEX_ASSERT(fields[2]=="Now is ");
   1332     REGEX_ASSERT(fields[3]=="b");
   1333     REGEX_ASSERT(fields[4]=="the time<c>");
   1334     REGEX_ASSERT(fields[5]=="foo");
   1335 
   1336     status = U_ZERO_ERROR;
   1337     fields[5] = "foo";
   1338     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1339     REGEX_CHECK_STATUS;
   1340     REGEX_ASSERT(n==5);
   1341     REGEX_ASSERT(fields[0]=="  ");
   1342     REGEX_ASSERT(fields[1]=="a");
   1343     REGEX_ASSERT(fields[2]=="Now is ");
   1344     REGEX_ASSERT(fields[3]=="b");
   1345     REGEX_ASSERT(fields[4]=="the time");
   1346     REGEX_ASSERT(fields[5]=="foo");
   1347 
   1348     status = U_ZERO_ERROR;
   1349     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1350     REGEX_CHECK_STATUS;
   1351     REGEX_ASSERT(n==4);
   1352     REGEX_ASSERT(fields[0]=="  ");
   1353     REGEX_ASSERT(fields[1]=="a");
   1354     REGEX_ASSERT(fields[2]=="Now is ");
   1355     REGEX_ASSERT(fields[3]=="the time<c>");
   1356     status = U_ZERO_ERROR;
   1357     delete pat1;
   1358 
   1359     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1360     REGEX_CHECK_STATUS;
   1361     n = pat1->split("1-10,20", fields, 10, status);
   1362     REGEX_CHECK_STATUS;
   1363     REGEX_ASSERT(n==5);
   1364     REGEX_ASSERT(fields[0]=="1");
   1365     REGEX_ASSERT(fields[1]=="-");
   1366     REGEX_ASSERT(fields[2]=="10");
   1367     REGEX_ASSERT(fields[3]==",");
   1368     REGEX_ASSERT(fields[4]=="20");
   1369     delete pat1;
   1370 
   1371 
   1372     //
   1373     // RegexPattern::pattern()
   1374     //
   1375     pat1 = new RegexPattern();
   1376     REGEX_ASSERT(pat1->pattern() == "");
   1377     delete pat1;
   1378 
   1379     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1380     REGEX_CHECK_STATUS;
   1381     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1382     delete pat1;
   1383 
   1384 
   1385     //
   1386     // classID functions
   1387     //
   1388     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1389     REGEX_CHECK_STATUS;
   1390     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1391     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1392     UnicodeString Hello("Hello, world.");
   1393     RegexMatcher *m = pat1->matcher(Hello, status);
   1394     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1395     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1396     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1397     delete m;
   1398     delete pat1;
   1399 
   1400 }
   1401 
   1402 //---------------------------------------------------------------------------
   1403 //
   1404 //      Extended       A more thorough check for features of regex patterns
   1405 //                     The test cases are in a separate data file,
   1406 //                       source/test/testdata/regextst.txt
   1407 //                     A description of the test data format is included in that file.
   1408 //
   1409 //---------------------------------------------------------------------------
   1410 
   1411 const char *
   1412 RegexTest::getPath(char buffer[2048], const char *filename) {
   1413     UErrorCode status=U_ZERO_ERROR;
   1414     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1415     if (U_FAILURE(status)) {
   1416         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   1417         return NULL;
   1418     }
   1419 
   1420     strcpy(buffer, testDataDirectory);
   1421     strcat(buffer, filename);
   1422     return buffer;
   1423 }
   1424 
   1425 void RegexTest::Extended() {
   1426     char tdd[2048];
   1427     const char *srcPath;
   1428     UErrorCode  status  = U_ZERO_ERROR;
   1429     int32_t     lineNum = 0;
   1430 
   1431     //
   1432     //  Open and read the test data file.
   1433     //
   1434     srcPath=getPath(tdd, "regextst.txt");
   1435     if(srcPath==NULL) {
   1436         return; /* something went wrong, error already output */
   1437     }
   1438 
   1439     int32_t    len;
   1440     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   1441     if (U_FAILURE(status)) {
   1442         return; /* something went wrong, error already output */
   1443     }
   1444 
   1445     //
   1446     //  Put the test data into a UnicodeString
   1447     //
   1448     UnicodeString testString(FALSE, testData, len);
   1449 
   1450     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   1451     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   1452     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   1453 
   1454     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   1455     UnicodeString   testPattern;   // The pattern for test from the test file.
   1456     UnicodeString   testFlags;     // the flags   for a test.
   1457     UnicodeString   matchString;   // The marked up string to be used as input
   1458 
   1459     if (U_FAILURE(status)){
   1460         dataerrln("Construct RegexMatcher() error.");
   1461         delete [] testData;
   1462         return;
   1463     }
   1464 
   1465     //
   1466     //  Loop over the test data file, once per line.
   1467     //
   1468     while (lineMat.find()) {
   1469         lineNum++;
   1470         if (U_FAILURE(status)) {
   1471             errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   1472         }
   1473 
   1474         status = U_ZERO_ERROR;
   1475         UnicodeString testLine = lineMat.group(1, status);
   1476         if (testLine.length() == 0) {
   1477             continue;
   1478         }
   1479 
   1480         //
   1481         // Parse the test line.  Skip blank and comment only lines.
   1482         // Separate out the three main fields - pattern, flags, target.
   1483         //
   1484 
   1485         commentMat.reset(testLine);
   1486         if (commentMat.lookingAt(status)) {
   1487             // This line is a comment, or blank.
   1488             continue;
   1489         }
   1490 
   1491         //
   1492         //  Pull out the pattern field, remove it from the test file line.
   1493         //
   1494         quotedStuffMat.reset(testLine);
   1495         if (quotedStuffMat.lookingAt(status)) {
   1496             testPattern = quotedStuffMat.group(2, status);
   1497             testLine.remove(0, quotedStuffMat.end(0, status));
   1498         } else {
   1499             errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
   1500             continue;
   1501         }
   1502 
   1503 
   1504         //
   1505         //  Pull out the flags from the test file line.
   1506         //
   1507         flagsMat.reset(testLine);
   1508         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   1509         testFlags = flagsMat.group(1, status);
   1510         if (flagsMat.group(2, status).length() > 0) {
   1511             errln("Bad Match flag at line %d. Scanning %c\n",
   1512                 lineNum, flagsMat.group(2, status).charAt(0));
   1513             continue;
   1514         }
   1515         testLine.remove(0, flagsMat.end(0, status));
   1516 
   1517         //
   1518         //  Pull out the match string, as a whole.
   1519         //    We'll process the <tags> later.
   1520         //
   1521         quotedStuffMat.reset(testLine);
   1522         if (quotedStuffMat.lookingAt(status)) {
   1523             matchString = quotedStuffMat.group(2, status);
   1524             testLine.remove(0, quotedStuffMat.end(0, status));
   1525         } else {
   1526             errln("Bad match string at test file line %d", lineNum);
   1527             continue;
   1528         }
   1529 
   1530         //
   1531         //  The only thing left from the input line should be an optional trailing comment.
   1532         //
   1533         commentMat.reset(testLine);
   1534         if (commentMat.lookingAt(status) == FALSE) {
   1535             errln("Line %d: unexpected characters at end of test line.", lineNum);
   1536             continue;
   1537         }
   1538 
   1539         //
   1540         //  Run the test
   1541         //
   1542         regex_find(testPattern, testFlags, matchString, lineNum);
   1543     }
   1544 
   1545     delete [] testData;
   1546 
   1547 }
   1548 
   1549 
   1550 
   1551 //---------------------------------------------------------------------------
   1552 //
   1553 //    regex_find(pattern, flags, inputString, lineNumber)
   1554 //
   1555 //         Function to run a single test from the Extended (data driven) tests.
   1556 //         See file test/testdata/regextst.txt for a description of the
   1557 //         pattern and inputString fields, and the allowed flags.
   1558 //         lineNumber is the source line in regextst.txt of the test.
   1559 //
   1560 //---------------------------------------------------------------------------
   1561 
   1562 
   1563 //  Set a value into a UVector at position specified by a decimal number in
   1564 //   a UnicodeString.   This is a utility function needed by the actual test function,
   1565 //   which follows.
   1566 static void set(UVector &vec, int32_t val, UnicodeString index) {
   1567     UErrorCode  status=U_ZERO_ERROR;
   1568     int32_t  idx = 0;
   1569     for (int32_t i=0; i<index.length(); i++) {
   1570         int32_t d=u_charDigitValue(index.charAt(i));
   1571         if (d<0) {return;}
   1572         idx = idx*10 + d;
   1573     }
   1574     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   1575     vec.setElementAt(val, idx);
   1576 }
   1577 
   1578 void RegexTest::regex_find(const UnicodeString &pattern,
   1579                            const UnicodeString &flags,
   1580                            const UnicodeString &inputString,
   1581                            int32_t line) {
   1582     UnicodeString       unEscapedInput;
   1583     UnicodeString       deTaggedInput;
   1584 
   1585     UErrorCode          status         = U_ZERO_ERROR;
   1586     UParseError         pe;
   1587     RegexPattern        *parsePat      = NULL;
   1588     RegexMatcher        *parseMatcher  = NULL;
   1589     RegexPattern        *callerPattern = NULL;
   1590     RegexMatcher        *matcher       = NULL;
   1591     UVector             groupStarts(status);
   1592     UVector             groupEnds(status);
   1593     UBool               isMatch        = FALSE;
   1594     UBool               failed         = FALSE;
   1595     int32_t             numFinds;
   1596     int32_t             i;
   1597     UBool               useMatchesFunc   = FALSE;
   1598     UBool               useLookingAtFunc = FALSE;
   1599     int32_t             regionStart      = -1;
   1600     int32_t             regionEnd        = -1;
   1601 
   1602     //
   1603     //  Compile the caller's pattern
   1604     //
   1605     uint32_t bflags = 0;
   1606     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   1607         bflags |= UREGEX_CASE_INSENSITIVE;
   1608     }
   1609     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   1610         bflags |= UREGEX_COMMENTS;
   1611     }
   1612     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   1613         bflags |= UREGEX_DOTALL;
   1614     }
   1615     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   1616         bflags |= UREGEX_MULTILINE;
   1617     }
   1618 
   1619     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   1620         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   1621     }
   1622     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   1623         bflags |= UREGEX_UNIX_LINES;
   1624     }
   1625 
   1626 
   1627     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   1628     if (status != U_ZERO_ERROR) {
   1629         #if UCONFIG_NO_BREAK_ITERATION==1
   1630         // 'v' test flag means that the test pattern should not compile if ICU was configured
   1631         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   1632         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   1633             goto cleanupAndReturn;
   1634         }
   1635         #endif
   1636         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   1637             // Expected pattern compilation error.
   1638             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   1639                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   1640             }
   1641             goto cleanupAndReturn;
   1642         } else {
   1643             // Unexpected pattern compilation error.
   1644             errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   1645             goto cleanupAndReturn;
   1646         }
   1647     }
   1648 
   1649     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   1650         RegexPatternDump(callerPattern);
   1651     }
   1652 
   1653     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   1654         errln("Expected, but did not get, a pattern compilation error.");
   1655         goto cleanupAndReturn;
   1656     }
   1657 
   1658 
   1659     //
   1660     // Number of times find() should be called on the test string, default to 1
   1661     //
   1662     numFinds = 1;
   1663     for (i=2; i<=9; i++) {
   1664         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   1665             if (numFinds != 1) {
   1666                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   1667                 goto cleanupAndReturn;
   1668             }
   1669             numFinds = i;
   1670         }
   1671     }
   1672 
   1673     // 'M' flag.  Use matches() instead of find()
   1674     if (flags.indexOf((UChar)0x4d) >= 0) {
   1675         useMatchesFunc = TRUE;
   1676     }
   1677     if (flags.indexOf((UChar)0x4c) >= 0) {
   1678         useLookingAtFunc = TRUE;
   1679     }
   1680 
   1681     //
   1682     //  Find the tags in the input data, remove them, and record the group boundary
   1683     //    positions.
   1684     //
   1685     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   1686     REGEX_CHECK_STATUS_L(line);
   1687 
   1688     unEscapedInput = inputString.unescape();
   1689     parseMatcher = parsePat->matcher(unEscapedInput, status);
   1690     REGEX_CHECK_STATUS_L(line);
   1691     while(parseMatcher->find()) {
   1692         parseMatcher->appendReplacement(deTaggedInput, "", status);
   1693         REGEX_CHECK_STATUS;
   1694         UnicodeString groupNum = parseMatcher->group(2, status);
   1695         if (groupNum == "r") {
   1696             // <r> or </r>, a region specification within the string
   1697             if (parseMatcher->group(1, status) == "/") {
   1698                 regionEnd = deTaggedInput.length();
   1699             } else {
   1700                 regionStart = deTaggedInput.length();
   1701             }
   1702         } else {
   1703             // <digits> or </digits>, a group match boundary tag.
   1704             if (parseMatcher->group(1, status) == "/") {
   1705                 set(groupEnds, deTaggedInput.length(), groupNum);
   1706             } else {
   1707                 set(groupStarts, deTaggedInput.length(), groupNum);
   1708             }
   1709         }
   1710     }
   1711     parseMatcher->appendTail(deTaggedInput);
   1712     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   1713     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   1714       errln("mismatched <r> tags");
   1715       failed = TRUE;
   1716       goto cleanupAndReturn;
   1717     }
   1718 
   1719 
   1720     //
   1721     //  Configure the matcher according to the flags specified with this test.
   1722     //
   1723     matcher = callerPattern->matcher(deTaggedInput, status);
   1724     REGEX_CHECK_STATUS_L(line);
   1725     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   1726         matcher->setTrace(TRUE);
   1727     }
   1728     if (regionStart>=0) {
   1729        matcher->region(regionStart, regionEnd, status);
   1730        REGEX_CHECK_STATUS_L(line);
   1731     }
   1732     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   1733         matcher->useAnchoringBounds(FALSE);
   1734     }
   1735     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   1736         matcher->useTransparentBounds(TRUE);
   1737     }
   1738 
   1739 
   1740 
   1741     //
   1742     // Do a find on the de-tagged input using the caller's pattern
   1743     //     TODO: error on count>1 and not find().
   1744     //           error on both matches() and lookingAt().
   1745     //
   1746     for (i=0; i<numFinds; i++) {
   1747         if (useMatchesFunc) {
   1748             isMatch = matcher->matches(status);
   1749         } else  if (useLookingAtFunc) {
   1750             isMatch = matcher->lookingAt(status);
   1751         } else {
   1752             isMatch = matcher->find();
   1753         }
   1754     }
   1755     matcher->setTrace(FALSE);
   1756 
   1757     //
   1758     // Match up the groups from the find() with the groups from the tags
   1759     //
   1760 
   1761     // number of tags should match number of groups from find operation.
   1762     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   1763     //   G option in test means that capture group data is not available in the
   1764     //     expected results, so the check needs to be suppressed.
   1765     if (isMatch == FALSE && groupStarts.size() != 0) {
   1766         errln("Error at line %d:  Match expected, but none found.\n", line);
   1767         failed = TRUE;
   1768         goto cleanupAndReturn;
   1769     }
   1770 
   1771     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   1772         // Only check for match / no match.  Don't check capture groups.
   1773         if (isMatch && groupStarts.size() == 0) {
   1774             errln("Error at line %d:  No match expected, but one found.\n", line);
   1775             failed = TRUE;
   1776         }
   1777         goto cleanupAndReturn;
   1778     }
   1779 
   1780     for (i=0; i<=matcher->groupCount(); i++) {
   1781         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   1782         if (matcher->start(i, status) != expectedStart) {
   1783             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   1784                 line, i, expectedStart, matcher->start(i, status));
   1785             failed = TRUE;
   1786             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   1787         }
   1788         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   1789         if (matcher->end(i, status) != expectedEnd) {
   1790             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   1791                 line, i, expectedEnd, matcher->end(i, status));
   1792             failed = TRUE;
   1793             // Error on end position;  keep going; real error is probably yet to come as group
   1794             //   end positions work from end of the input data towards the front.
   1795         }
   1796     }
   1797     if ( matcher->groupCount()+1 < groupStarts.size()) {
   1798         errln("Error at line %d: Expected %d capture groups, found %d.",
   1799             line, groupStarts.size()-1, matcher->groupCount());
   1800         failed = TRUE;
   1801         }
   1802 
   1803     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   1804         matcher->requireEnd() == TRUE) {
   1805         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   1806         failed = TRUE;
   1807     }
   1808     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   1809         matcher->requireEnd() == FALSE) {
   1810         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   1811         failed = TRUE;
   1812     }
   1813     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   1814         matcher->hitEnd() == TRUE) {
   1815         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   1816         failed = TRUE;
   1817     }
   1818     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   1819         matcher->hitEnd() == FALSE) {
   1820         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   1821         failed = TRUE;
   1822     }
   1823 
   1824 
   1825 cleanupAndReturn:
   1826     if (failed) {
   1827         errln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   1828             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   1829         // callerPattern->dump();
   1830     }
   1831     delete parseMatcher;
   1832     delete parsePat;
   1833     delete matcher;
   1834     delete callerPattern;
   1835 }
   1836 
   1837 
   1838 
   1839 
   1840 //---------------------------------------------------------------------------
   1841 //
   1842 //      Errors     Check for error handling in patterns.
   1843 //
   1844 //---------------------------------------------------------------------------
   1845 void RegexTest::Errors() {
   1846     // \escape sequences that aren't implemented yet.
   1847     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   1848 
   1849     // Missing close parentheses
   1850     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   1851     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   1852     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   1853 
   1854     // Extra close paren
   1855     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   1856     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   1857     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   1858 
   1859     // Look-ahead, Look-behind
   1860     //  TODO:  add tests for unbounded length look-behinds.
   1861     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   1862 
   1863     // Attempt to use non-default flags
   1864     {
   1865         UParseError   pe;
   1866         UErrorCode    status = U_ZERO_ERROR;
   1867         int32_t       flags  = UREGEX_CANON_EQ |
   1868                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   1869                                UREGEX_MULTILINE;
   1870         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   1871         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   1872         delete pat1;
   1873     }
   1874 
   1875 
   1876     // Quantifiers are allowed only after something that can be quantified.
   1877     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   1878     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   1879     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   1880 
   1881     // Mal-formed {min,max} quantifiers
   1882     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   1883     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   1884     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   1885     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   1886     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   1887     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   1888     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   1889     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   1890     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   1891 
   1892     // Ticket 5389
   1893     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   1894 
   1895     // Invalid Back Reference \0
   1896     //    For ICU 3.8 and earlier
   1897     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   1898     //
   1899     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   1900 
   1901 }
   1902 
   1903 
   1904 //-------------------------------------------------------------------------------
   1905 //
   1906 //  Read a text data file, convert it to UChars, and return the data
   1907 //    in one big UChar * buffer, which the caller must delete.
   1908 //
   1909 //--------------------------------------------------------------------------------
   1910 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   1911                                      const char *defEncoding, UErrorCode &status) {
   1912     UChar       *retPtr  = NULL;
   1913     char        *fileBuf = NULL;
   1914     UConverter* conv     = NULL;
   1915     FILE        *f       = NULL;
   1916 
   1917     ulen = 0;
   1918     if (U_FAILURE(status)) {
   1919         return retPtr;
   1920     }
   1921 
   1922     //
   1923     //  Open the file.
   1924     //
   1925     f = fopen(fileName, "rb");
   1926     if (f == 0) {
   1927         dataerrln("Error opening test data file %s\n", fileName);
   1928         status = U_FILE_ACCESS_ERROR;
   1929         return NULL;
   1930     }
   1931     //
   1932     //  Read it in
   1933     //
   1934     int32_t            fileSize;
   1935     int32_t            amt_read;
   1936 
   1937     fseek( f, 0, SEEK_END);
   1938     fileSize = ftell(f);
   1939     fileBuf = new char[fileSize];
   1940     fseek(f, 0, SEEK_SET);
   1941     amt_read = fread(fileBuf, 1, fileSize, f);
   1942     if (amt_read != fileSize || fileSize <= 0) {
   1943         errln("Error reading test data file.");
   1944         goto cleanUpAndReturn;
   1945     }
   1946 
   1947     //
   1948     // Look for a Unicode Signature (BOM) on the data just read
   1949     //
   1950     int32_t        signatureLength;
   1951     const char *   fileBufC;
   1952     const char*    encoding;
   1953 
   1954     fileBufC = fileBuf;
   1955     encoding = ucnv_detectUnicodeSignature(
   1956         fileBuf, fileSize, &signatureLength, &status);
   1957     if(encoding!=NULL ){
   1958         fileBufC  += signatureLength;
   1959         fileSize  -= signatureLength;
   1960     } else {
   1961         encoding = defEncoding;
   1962         if (strcmp(encoding, "utf-8") == 0) {
   1963             errln("file %s is missing its BOM", fileName);
   1964         }
   1965     }
   1966 
   1967     //
   1968     // Open a converter to take the rule file to UTF-16
   1969     //
   1970     conv = ucnv_open(encoding, &status);
   1971     if (U_FAILURE(status)) {
   1972         goto cleanUpAndReturn;
   1973     }
   1974 
   1975     //
   1976     // Convert the rules to UChar.
   1977     //  Preflight first to determine required buffer size.
   1978     //
   1979     ulen = ucnv_toUChars(conv,
   1980         NULL,           //  dest,
   1981         0,              //  destCapacity,
   1982         fileBufC,
   1983         fileSize,
   1984         &status);
   1985     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1986         // Buffer Overflow is expected from the preflight operation.
   1987         status = U_ZERO_ERROR;
   1988 
   1989         retPtr = new UChar[ulen+1];
   1990         ucnv_toUChars(conv,
   1991             retPtr,       //  dest,
   1992             ulen+1,
   1993             fileBufC,
   1994             fileSize,
   1995             &status);
   1996     }
   1997 
   1998 cleanUpAndReturn:
   1999     fclose(f);
   2000     delete[] fileBuf;
   2001     ucnv_close(conv);
   2002     if (U_FAILURE(status)) {
   2003         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2004         delete retPtr;
   2005         retPtr = 0;
   2006         ulen   = 0;
   2007     };
   2008     return retPtr;
   2009 }
   2010 
   2011 
   2012 //-------------------------------------------------------------------------------
   2013 //
   2014 //   PerlTests  - Run Perl's regular expression tests
   2015 //                The input file for this test is re_tests, the standard regular
   2016 //                expression test data distributed with the Perl source code.
   2017 //
   2018 //                Here is Perl's description of the test data file:
   2019 //
   2020 //        # The tests are in a separate file 't/op/re_tests'.
   2021 //        # Each line in that file is a separate test.
   2022 //        # There are five columns, separated by tabs.
   2023 //        #
   2024 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   2025 //        # Modifiers can be put after the closing C<'>.
   2026 //        #
   2027 //        # Column 2 contains the string to be matched.
   2028 //        #
   2029 //        # Column 3 contains the expected result:
   2030 //        #     y   expect a match
   2031 //        #     n   expect no match
   2032 //        #     c   expect an error
   2033 //        # B   test exposes a known bug in Perl, should be skipped
   2034 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   2035 //        #
   2036 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   2037 //        #
   2038 //        # Column 4 contains a string, usually C<$&>.
   2039 //        #
   2040 //        # Column 5 contains the expected result of double-quote
   2041 //        # interpolating that string after the match, or start of error message.
   2042 //        #
   2043 //        # Column 6, if present, contains a reason why the test is skipped.
   2044 //        # This is printed with "skipped", for harness to pick up.
   2045 //        #
   2046 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   2047 //        #
   2048 //        # If you want to add a regular expression test that can't be expressed
   2049 //        # in this format, don't add it here: put it in op/pat.t instead.
   2050 //
   2051 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   2052 //        The test exposes some known incompatibility between ICU and Perl regexps.
   2053 //        (The i is in addition to whatever was there before.)
   2054 //
   2055 //-------------------------------------------------------------------------------
   2056 void RegexTest::PerlTests() {
   2057     char tdd[2048];
   2058     const char *srcPath;
   2059     UErrorCode  status = U_ZERO_ERROR;
   2060     UParseError pe;
   2061 
   2062     //
   2063     //  Open and read the test data file.
   2064     //
   2065     srcPath=getPath(tdd, "re_tests.txt");
   2066     if(srcPath==NULL) {
   2067         return; /* something went wrong, error already output */
   2068     }
   2069 
   2070     int32_t    len;
   2071     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   2072     if (U_FAILURE(status)) {
   2073         return; /* something went wrong, error already output */
   2074     }
   2075 
   2076     //
   2077     //  Put the test data into a UnicodeString
   2078     //
   2079     UnicodeString testDataString(FALSE, testData, len);
   2080 
   2081     //
   2082     //  Regex to break the input file into lines, and strip the new lines.
   2083     //     One line per match, capture group one is the desired data.
   2084     //
   2085     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   2086     if (U_FAILURE(status)) {
   2087         dataerrln("RegexPattern::compile() error");
   2088         return;
   2089     }
   2090     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   2091 
   2092     //
   2093     //  Regex to split a test file line into fields.
   2094     //    There are six fields, separated by tabs.
   2095     //
   2096     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   2097 
   2098     //
   2099     //  Regex to identify test patterns with flag settings, and to separate them.
   2100     //    Test patterns with flags look like 'pattern'i
   2101     //    Test patterns without flags are not quoted:   pattern
   2102     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   2103     //
   2104     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   2105     RegexMatcher* flagMat = flagPat->matcher(status);
   2106 
   2107     //
   2108     // The Perl tests reference several perl-isms, which are evaluated/substituted
   2109     //   in the test data.  Not being perl, this must be done explicitly.  Here
   2110     //   are string constants and REs for these constructs.
   2111     //
   2112     UnicodeString nulnulSrc("${nulnul}");
   2113     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   2114     nulnul = nulnul.unescape();
   2115 
   2116     UnicodeString ffffSrc("${ffff}");
   2117     UnicodeString ffff("\\uffff", -1, US_INV);
   2118     ffff = ffff.unescape();
   2119 
   2120     //  regexp for $-[0], $+[2], etc.
   2121     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   2122     RegexMatcher *groupsMat = groupsPat->matcher(status);
   2123 
   2124     //  regexp for $0, $1, $2, etc.
   2125     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   2126     RegexMatcher *cgMat = cgPat->matcher(status);
   2127 
   2128 
   2129     //
   2130     // Main Loop for the Perl Tests, runs once per line from the
   2131     //   test data file.
   2132     //
   2133     int32_t  lineNum = 0;
   2134     int32_t  skippedUnimplementedCount = 0;
   2135     while (lineMat->find()) {
   2136         lineNum++;
   2137 
   2138         //
   2139         //  Get a line, break it into its fields, do the Perl
   2140         //    variable substitutions.
   2141         //
   2142         UnicodeString line = lineMat->group(1, status);
   2143         UnicodeString fields[7];
   2144         fieldPat->split(line, fields, 7, status);
   2145 
   2146         flagMat->reset(fields[0]);
   2147         flagMat->matches(status);
   2148         UnicodeString pattern  = flagMat->group(2, status);
   2149         pattern.findAndReplace("${bang}", "!");
   2150         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   2151         pattern.findAndReplace(ffffSrc, ffff);
   2152 
   2153         //
   2154         //  Identify patterns that include match flag settings,
   2155         //    split off the flags, remove the extra quotes.
   2156         //
   2157         UnicodeString flagStr = flagMat->group(3, status);
   2158         if (U_FAILURE(status)) {
   2159             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2160             return;
   2161         }
   2162         int32_t flags = 0;
   2163         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   2164         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   2165         const UChar UChar_m = 0x6d;
   2166         const UChar UChar_x = 0x78;
   2167         const UChar UChar_y = 0x79;
   2168         if (flagStr.indexOf(UChar_i) != -1) {
   2169             flags |= UREGEX_CASE_INSENSITIVE;
   2170         }
   2171         if (flagStr.indexOf(UChar_m) != -1) {
   2172             flags |= UREGEX_MULTILINE;
   2173         }
   2174         if (flagStr.indexOf(UChar_x) != -1) {
   2175             flags |= UREGEX_COMMENTS;
   2176         }
   2177 
   2178         //
   2179         // Compile the test pattern.
   2180         //
   2181         status = U_ZERO_ERROR;
   2182         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   2183         if (status == U_REGEX_UNIMPLEMENTED) {
   2184             //
   2185             // Test of a feature that is planned for ICU, but not yet implemented.
   2186             //   skip the test.
   2187             skippedUnimplementedCount++;
   2188             delete testPat;
   2189             status = U_ZERO_ERROR;
   2190             continue;
   2191         }
   2192 
   2193         if (U_FAILURE(status)) {
   2194             // Some tests are supposed to generate errors.
   2195             //   Only report an error for tests that are supposed to succeed.
   2196             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   2197                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   2198             {
   2199                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   2200             }
   2201             status = U_ZERO_ERROR;
   2202             delete testPat;
   2203             continue;
   2204         }
   2205 
   2206         if (fields[2].indexOf(UChar_i) >= 0) {
   2207             // ICU should skip this test.
   2208             delete testPat;
   2209             continue;
   2210         }
   2211 
   2212         if (fields[2].indexOf(UChar_c) >= 0) {
   2213             // This pattern should have caused a compilation error, but didn't/
   2214             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   2215             delete testPat;
   2216             continue;
   2217         }
   2218 
   2219         //
   2220         // replace the Perl variables that appear in some of the
   2221         //   match data strings.
   2222         //
   2223         UnicodeString matchString = fields[1];
   2224         matchString.findAndReplace(nulnulSrc, nulnul);
   2225         matchString.findAndReplace(ffffSrc,   ffff);
   2226 
   2227         // Replace any \n in the match string with an actual new-line char.
   2228         //  Don't do full unescape, as this unescapes more than Perl does, which
   2229         //  causes other spurious failures in the tests.
   2230         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   2231 
   2232 
   2233 
   2234         //
   2235         // Run the test, check for expected match/don't match result.
   2236         //
   2237         RegexMatcher *testMat = testPat->matcher(matchString, status);
   2238         UBool found = testMat->find();
   2239         UBool expected = FALSE;
   2240         if (fields[2].indexOf(UChar_y) >=0) {
   2241             expected = TRUE;
   2242         }
   2243         if (expected != found) {
   2244             errln("line %d: Expected %smatch, got %smatch",
   2245                 lineNum, expected?"":"no ", found?"":"no " );
   2246             continue;
   2247         }
   2248 
   2249         // Don't try to check expected results if there is no match.
   2250         //   (Some have stuff in the expected fields)
   2251         if (!found) {
   2252             delete testMat;
   2253             delete testPat;
   2254             continue;
   2255         }
   2256 
   2257         //
   2258         // Interpret the Perl expression from the fourth field of the data file,
   2259         // building up an ICU string from the results of the ICU match.
   2260         //   The Perl expression will contain references to the results of
   2261         //     a regex match, including the matched string, capture group strings,
   2262         //     group starting and ending indicies, etc.
   2263         //
   2264         UnicodeString resultString;
   2265         UnicodeString perlExpr = fields[3];
   2266         groupsMat->reset(perlExpr);
   2267         cgMat->reset(perlExpr);
   2268 
   2269         while (perlExpr.length() > 0) {
   2270             if (perlExpr.startsWith("$&")) {
   2271                 resultString.append(testMat->group(status));
   2272                 perlExpr.remove(0, 2);
   2273             }
   2274 
   2275             else if (groupsMat->lookingAt(status)) {
   2276                 // $-[0]   $+[2]  etc.
   2277                 UnicodeString digitString = groupsMat->group(2, status);
   2278                 int32_t t = 0;
   2279                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   2280                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   2281                 int32_t matchPosition;
   2282                 if (plusOrMinus.compare("+") == 0) {
   2283                     matchPosition = testMat->end(groupNum, status);
   2284                 } else {
   2285                     matchPosition = testMat->start(groupNum, status);
   2286                 }
   2287                 if (matchPosition != -1) {
   2288                     ICU_Utility::appendNumber(resultString, matchPosition);
   2289                 }
   2290                 perlExpr.remove(0, groupsMat->end(status));
   2291             }
   2292 
   2293             else if (cgMat->lookingAt(status)) {
   2294                 // $1, $2, $3, etc.
   2295                 UnicodeString digitString = cgMat->group(1, status);
   2296                 int32_t t = 0;
   2297                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   2298                 if (U_SUCCESS(status)) {
   2299                     resultString.append(testMat->group(groupNum, status));
   2300                     status = U_ZERO_ERROR;
   2301                 }
   2302                 perlExpr.remove(0, cgMat->end(status));
   2303             }
   2304 
   2305             else if (perlExpr.startsWith("@-")) {
   2306                 int32_t i;
   2307                 for (i=0; i<=testMat->groupCount(); i++) {
   2308                     if (i>0) {
   2309                         resultString.append(" ");
   2310                     }
   2311                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   2312                 }
   2313                 perlExpr.remove(0, 2);
   2314             }
   2315 
   2316             else if (perlExpr.startsWith("@+")) {
   2317                 int32_t i;
   2318                 for (i=0; i<=testMat->groupCount(); i++) {
   2319                     if (i>0) {
   2320                         resultString.append(" ");
   2321                     }
   2322                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   2323                 }
   2324                 perlExpr.remove(0, 2);
   2325             }
   2326 
   2327             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   2328                                                      //           or as an escaped sequence (e.g. \n)
   2329                 if (perlExpr.length() > 1) {
   2330                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   2331                 }
   2332                 UChar c = perlExpr.charAt(0);
   2333                 switch (c) {
   2334                 case 'n':   c = '\n'; break;
   2335                 // add any other escape sequences that show up in the test expected results.
   2336                 }
   2337                 resultString.append(c);
   2338                 perlExpr.remove(0, 1);
   2339             }
   2340 
   2341             else  {
   2342                 // Any characters from the perl expression that we don't explicitly
   2343                 //  recognize before here are assumed to be literals and copied
   2344                 //  as-is to the expected results.
   2345                 resultString.append(perlExpr.charAt(0));
   2346                 perlExpr.remove(0, 1);
   2347             }
   2348 
   2349             if (U_FAILURE(status)) {
   2350                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   2351                 break;
   2352             }
   2353         }
   2354 
   2355         //
   2356         // Expected Results Compare
   2357         //
   2358         UnicodeString expectedS(fields[4]);
   2359         expectedS.findAndReplace(nulnulSrc, nulnul);
   2360         expectedS.findAndReplace(ffffSrc,   ffff);
   2361         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   2362 
   2363 
   2364         if (expectedS.compare(resultString) != 0) {
   2365             err("Line %d: Incorrect perl expression results.", lineNum);
   2366             errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   2367         }
   2368 
   2369         delete testMat;
   2370         delete testPat;
   2371     }
   2372 
   2373     //
   2374     // All done.  Clean up allocated stuff.
   2375     //
   2376     delete cgMat;
   2377     delete cgPat;
   2378 
   2379     delete groupsMat;
   2380     delete groupsPat;
   2381 
   2382     delete flagMat;
   2383     delete flagPat;
   2384 
   2385     delete lineMat;
   2386     delete linePat;
   2387 
   2388     delete fieldPat;
   2389     delete [] testData;
   2390 
   2391 
   2392     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   2393 
   2394 }
   2395 
   2396 
   2397 //--------------------------------------------------------------
   2398 //
   2399 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   2400 //             Use this pattern,
   2401 //                 "(a?){1,}"
   2402 //             The zero-length match will repeat forever.
   2403 //                (That this goes into a loop is another bug)
   2404 //
   2405 //---------------------------------------------------------------
   2406 void RegexTest::Bug6149() {
   2407     UnicodeString pattern("(a?){1,}");
   2408     UnicodeString s("xyz");
   2409     uint32_t flags = 0;
   2410     UErrorCode status = U_ZERO_ERROR;
   2411 
   2412     RegexMatcher  matcher(pattern, s, flags, status);
   2413     UBool result = false;
   2414     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   2415     REGEX_ASSERT(result == FALSE);
   2416  }
   2417 
   2418 
   2419 //
   2420 //   Callbacks()    Test the callback function.
   2421 //                  When set, callbacks occur periodically during matching operations,
   2422 //                  giving the application code the ability to abort the operation
   2423 //                  before its normal completion.
   2424 //
   2425 
   2426 struct callBackContext {
   2427     RegexTest        *test;
   2428     int32_t          maxCalls;
   2429     int32_t          numCalls;
   2430     int32_t          lastSteps;
   2431     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   2432 };
   2433 
   2434 U_CDECL_BEGIN
   2435 static UBool U_CALLCONV
   2436 testCallBackFn(const void *context, int32_t steps) {
   2437     callBackContext  *info = (callBackContext *)context;
   2438     if (info->lastSteps+1 != steps) {
   2439         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   2440     }
   2441     info->lastSteps = steps;
   2442     info->numCalls++;
   2443     return (info->numCalls < info->maxCalls);
   2444 }
   2445 U_CDECL_END
   2446 
   2447 void RegexTest::Callbacks() {
   2448    {
   2449         // Getter returns NULLs if no callback has been set
   2450 
   2451         //   The variables that the getter will fill in.
   2452         //   Init to non-null values so that the action of the getter can be seen.
   2453         const void          *returnedContext = &returnedContext;
   2454         URegexMatchCallback *returnedFn = &testCallBackFn;
   2455 
   2456         UErrorCode status = U_ZERO_ERROR;
   2457         RegexMatcher matcher("x", 0, status);
   2458         REGEX_CHECK_STATUS;
   2459         matcher.getMatchCallback(returnedFn, returnedContext, status);
   2460         REGEX_CHECK_STATUS;
   2461         REGEX_ASSERT(returnedFn == NULL);
   2462         REGEX_ASSERT(returnedContext == NULL);
   2463     }
   2464 
   2465    {
   2466         // Set and Get work
   2467         callBackContext cbInfo = {this, 0, 0, 0};
   2468         const void          *returnedContext;
   2469         URegexMatchCallback *returnedFn;
   2470         UErrorCode status = U_ZERO_ERROR;
   2471         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   2472         REGEX_CHECK_STATUS;
   2473         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   2474         REGEX_CHECK_STATUS;
   2475         matcher.getMatchCallback(returnedFn, returnedContext, status);
   2476         REGEX_CHECK_STATUS;
   2477         REGEX_ASSERT(returnedFn == testCallBackFn);
   2478         REGEX_ASSERT(returnedContext == &cbInfo);
   2479 
   2480         // A short-running match shouldn't invoke the callback
   2481         status = U_ZERO_ERROR;
   2482         cbInfo.reset(1);
   2483         UnicodeString s = "xxx";
   2484         matcher.reset(s);
   2485         REGEX_ASSERT(matcher.matches(status));
   2486         REGEX_CHECK_STATUS;
   2487         REGEX_ASSERT(cbInfo.numCalls == 0);
   2488 
   2489         // A medium-length match that runs long enough to invoke the
   2490         //   callback, but not so long that the callback aborts it.
   2491         status = U_ZERO_ERROR;
   2492         cbInfo.reset(4);
   2493         s = "aaaaaaaaaaaaaaaaaaab";
   2494         matcher.reset(s);
   2495         REGEX_ASSERT(matcher.matches(status)==FALSE);
   2496         REGEX_CHECK_STATUS;
   2497         REGEX_ASSERT(cbInfo.numCalls > 0);
   2498 
   2499         // A longer running match that the callback function will abort.
   2500         status = U_ZERO_ERROR;
   2501         cbInfo.reset(4);
   2502         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   2503         matcher.reset(s);
   2504         REGEX_ASSERT(matcher.matches(status)==FALSE);
   2505         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   2506         REGEX_ASSERT(cbInfo.numCalls == 4);
   2507     }
   2508 
   2509 
   2510 }
   2511 
   2512 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   2513 
   2514