1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 // 8 // regextst.cpp 9 // 10 // ICU Regular Expressions test, part of intltest. 11 // 12 13 #include "intltest.h" 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16 #include "unicode/regex.h" 17 #include "unicode/uchar.h" 18 #include "unicode/ucnv.h" 19 #include "regextst.h" 20 #include "uvector.h" 21 #include "util.h" 22 #include <stdlib.h> 23 #include <string.h> 24 #include <stdio.h> 25 26 27 //--------------------------------------------------------------------------- 28 // 29 // Test class boilerplate 30 // 31 //--------------------------------------------------------------------------- 32 RegexTest::RegexTest() 33 { 34 } 35 36 37 RegexTest::~RegexTest() 38 { 39 } 40 41 42 43 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 44 { 45 if (exec) logln("TestSuite RegexTest: "); 46 switch (index) { 47 48 case 0: name = "Basic"; 49 if (exec) Basic(); 50 break; 51 case 1: name = "API_Match"; 52 if (exec) API_Match(); 53 break; 54 case 2: name = "API_Replace"; 55 if (exec) API_Replace(); 56 break; 57 case 3: name = "API_Pattern"; 58 if (exec) API_Pattern(); 59 break; 60 case 4: name = "Extended"; 61 if (exec) Extended(); 62 break; 63 case 5: name = "Errors"; 64 if (exec) Errors(); 65 break; 66 case 6: name = "PerlTests"; 67 if (exec) PerlTests(); 68 break; 69 case 7: name = "Callbacks"; 70 if (exec) Callbacks(); 71 break; 72 case 8: name = "Bug 6149"; 73 if (exec) Bug6149(); 74 break; 75 76 default: name = ""; 77 break; //needed to end loop 78 } 79 } 80 81 82 //--------------------------------------------------------------------------- 83 // 84 // Error Checking / Reporting macros used in all of the tests. 85 // 86 //--------------------------------------------------------------------------- 87 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("RegexTest failure at line %d. status=%s", \ 88 __LINE__, u_errorName(status)); return;}} 89 90 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};} 91 92 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 93 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 94 __LINE__, u_errorName(errcode), u_errorName(status));};} 95 96 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 97 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} 98 99 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 100 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 101 102 103 104 //--------------------------------------------------------------------------- 105 // 106 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 107 // for the LookingAt() and Match() functions. 108 // 109 // usage: 110 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 111 // 112 // The expected results are UBool - TRUE or FALSE. 113 // The input text is unescaped. The pattern is not. 114 // 115 // 116 //--------------------------------------------------------------------------- 117 118 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__); 119 120 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 121 const UnicodeString pattern(pat, -1, US_INV); 122 const UnicodeString inputText(text, -1, US_INV); 123 UErrorCode status = U_ZERO_ERROR; 124 UParseError pe; 125 RegexPattern *REPattern = NULL; 126 RegexMatcher *REMatcher = NULL; 127 UBool retVal = TRUE; 128 129 UnicodeString patString(pat, -1, US_INV); 130 REPattern = RegexPattern::compile(patString, 0, pe, status); 131 if (U_FAILURE(status)) { 132 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 133 line, u_errorName(status)); 134 return FALSE; 135 } 136 if (line==376) { RegexPatternDump(REPattern);} 137 138 UnicodeString inputString(inputText); 139 UnicodeString unEscapedInput = inputString.unescape(); 140 REMatcher = REPattern->matcher(unEscapedInput, status); 141 if (U_FAILURE(status)) { 142 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 143 line, u_errorName(status)); 144 return FALSE; 145 } 146 147 UBool actualmatch; 148 actualmatch = REMatcher->lookingAt(status); 149 if (U_FAILURE(status)) { 150 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 151 line, u_errorName(status)); 152 retVal = FALSE; 153 } 154 if (actualmatch != looking) { 155 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 156 retVal = FALSE; 157 } 158 159 status = U_ZERO_ERROR; 160 actualmatch = REMatcher->matches(status); 161 if (U_FAILURE(status)) { 162 errln("RegexTest failure in matches() at line %d. Status = %s\n", 163 line, u_errorName(status)); 164 retVal = FALSE; 165 } 166 if (actualmatch != match) { 167 errln("RegexTest: wrong return from matches() at line %d.\n", line); 168 retVal = FALSE; 169 } 170 171 if (retVal == FALSE) { 172 RegexPatternDump(REPattern); 173 } 174 175 delete REPattern; 176 delete REMatcher; 177 return retVal; 178 } 179 180 181 182 183 184 //--------------------------------------------------------------------------- 185 // 186 // REGEX_ERR Macro + invocation function to simplify writing tests 187 // regex tests for incorrect patterns 188 // 189 // usage: 190 // REGEX_ERR("pattern", expected error line, column, expected status); 191 // 192 //--------------------------------------------------------------------------- 193 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 194 195 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 196 UErrorCode expectedStatus, int32_t line) { 197 UnicodeString pattern(pat); 198 199 UErrorCode status = U_ZERO_ERROR; 200 UParseError pe; 201 RegexPattern *callerPattern = NULL; 202 203 // 204 // Compile the caller's pattern 205 // 206 UnicodeString patString(pat); 207 callerPattern = RegexPattern::compile(patString, 0, pe, status); 208 if (status != expectedStatus) { 209 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 210 } else { 211 if (status != U_ZERO_ERROR) { 212 if (pe.line != errLine || pe.offset != errCol) { 213 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 214 line, errLine, errCol, pe.line, pe.offset); 215 } 216 } 217 } 218 219 delete callerPattern; 220 } 221 222 223 224 //--------------------------------------------------------------------------- 225 // 226 // Basic Check for basic functionality of regex pattern matching. 227 // Avoid the use of REGEX_FIND test macro, which has 228 // substantial dependencies on basic Regex functionality. 229 // 230 //--------------------------------------------------------------------------- 231 void RegexTest::Basic() { 232 233 234 // 235 // Debug - slide failing test cases early 236 // 237 #if 0 238 { 239 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE); 240 UParseError pe; 241 UErrorCode status = U_ZERO_ERROR; 242 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status); 243 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd"); 244 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); 245 } 246 exit(1); 247 #endif 248 249 250 // 251 // Pattern with parentheses 252 // 253 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE); 254 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE); 255 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE); 256 257 // 258 // Patterns with * 259 // 260 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE); 261 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE); 262 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE); 263 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE); 264 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE); 265 266 REGEX_TESTLM("a*", "", TRUE, TRUE); 267 REGEX_TESTLM("a*", "b", TRUE, FALSE); 268 269 270 // 271 // Patterns with "." 272 // 273 REGEX_TESTLM(".", "abc", TRUE, FALSE); 274 REGEX_TESTLM("...", "abc", TRUE, TRUE); 275 REGEX_TESTLM("....", "abc", FALSE, FALSE); 276 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE); 277 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE); 278 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE); 279 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE); 280 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE); 281 282 // 283 // Patterns with * applied to chars at end of literal string 284 // 285 REGEX_TESTLM("abc*", "ab", TRUE, TRUE); 286 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE); 287 288 // 289 // Supplemental chars match as single chars, not a pair of surrogates. 290 // 291 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE); 292 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE); 293 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE); 294 295 296 // 297 // UnicodeSets in the pattern 298 // 299 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE); 300 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE); 301 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE); 302 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 303 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 304 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE); 305 306 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE); 307 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE); 308 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE); 309 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. 310 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); 311 312 // 313 // OR operator in patterns 314 // 315 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE); 316 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE); 317 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE); 318 REGEX_TESTLM("a|b", "b", TRUE, TRUE); 319 320 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE); 321 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE); 322 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE); 323 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE); 324 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE); 325 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE); 326 327 // 328 // + 329 // 330 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE); 331 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE); 332 REGEX_TESTLM("b+", "", FALSE, FALSE); 333 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE); 334 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE); 335 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE); 336 337 // 338 // ? 339 // 340 REGEX_TESTLM("ab?", "ab", TRUE, TRUE); 341 REGEX_TESTLM("ab?", "a", TRUE, TRUE); 342 REGEX_TESTLM("ab?", "ac", TRUE, FALSE); 343 REGEX_TESTLM("ab?", "abb", TRUE, FALSE); 344 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE); 345 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE); 346 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE); 347 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); 348 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); 349 350 // 351 // Escape sequences that become single literal chars, handled internally 352 // by ICU's Unescape. 353 // 354 355 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. 356 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL 357 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L 358 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape 359 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed 360 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line 361 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR 362 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab 363 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); 364 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); 365 366 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input 367 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input 368 369 // Escape of special chars in patterns 370 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); 371 372 373 } 374 375 376 //--------------------------------------------------------------------------- 377 // 378 // API_Match Test that the API for class RegexMatcher 379 // is present and nominally working, but excluding functions 380 // implementing replace operations. 381 // 382 //--------------------------------------------------------------------------- 383 void RegexTest::API_Match() { 384 UParseError pe; 385 UErrorCode status=U_ZERO_ERROR; 386 int32_t flags = 0; 387 388 // 389 // Debug - slide failing test cases early 390 // 391 #if 0 392 { 393 } 394 return; 395 #endif 396 397 // 398 // Simple pattern compilation 399 // 400 { 401 UnicodeString re("abc"); 402 RegexPattern *pat2; 403 pat2 = RegexPattern::compile(re, flags, pe, status); 404 REGEX_CHECK_STATUS; 405 406 UnicodeString inStr1 = "abcdef this is a test"; 407 UnicodeString instr2 = "not abc"; 408 UnicodeString empty = ""; 409 410 411 // 412 // Matcher creation and reset. 413 // 414 RegexMatcher *m1 = pat2->matcher(inStr1, status); 415 REGEX_CHECK_STATUS; 416 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 417 REGEX_ASSERT(m1->input() == inStr1); 418 m1->reset(instr2); 419 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 420 REGEX_ASSERT(m1->input() == instr2); 421 m1->reset(inStr1); 422 REGEX_ASSERT(m1->input() == inStr1); 423 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 424 m1->reset(empty); 425 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 426 REGEX_ASSERT(m1->input() == empty); 427 REGEX_ASSERT(&m1->pattern() == pat2); 428 429 // 430 // reset(pos, status) 431 // 432 m1->reset(inStr1); 433 m1->reset(4, status); 434 REGEX_CHECK_STATUS; 435 REGEX_ASSERT(m1->input() == inStr1); 436 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 437 438 m1->reset(-1, status); 439 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 440 status = U_ZERO_ERROR; 441 442 m1->reset(0, status); 443 REGEX_CHECK_STATUS; 444 status = U_ZERO_ERROR; 445 446 int32_t len = m1->input().length(); 447 m1->reset(len-1, status); 448 REGEX_CHECK_STATUS; 449 status = U_ZERO_ERROR; 450 451 m1->reset(len, status); 452 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 453 status = U_ZERO_ERROR; 454 455 // 456 // match(pos, status) 457 // 458 m1->reset(instr2); 459 REGEX_ASSERT(m1->matches(4, status) == TRUE); 460 m1->reset(); 461 REGEX_ASSERT(m1->matches(3, status) == FALSE); 462 m1->reset(); 463 REGEX_ASSERT(m1->matches(5, status) == FALSE); 464 REGEX_ASSERT(m1->matches(4, status) == TRUE); 465 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 466 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 467 468 // Match() at end of string should fail, but should not 469 // be an error. 470 status = U_ZERO_ERROR; 471 len = m1->input().length(); 472 REGEX_ASSERT(m1->matches(len, status) == FALSE); 473 REGEX_CHECK_STATUS; 474 475 // Match beyond end of string should fail with an error. 476 status = U_ZERO_ERROR; 477 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 478 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 479 480 // Successful match at end of string. 481 { 482 status = U_ZERO_ERROR; 483 RegexMatcher m("A?", 0, status); // will match zero length string. 484 REGEX_CHECK_STATUS; 485 m.reset(inStr1); 486 len = inStr1.length(); 487 REGEX_ASSERT(m.matches(len, status) == TRUE); 488 REGEX_CHECK_STATUS; 489 m.reset(empty); 490 REGEX_ASSERT(m.matches(0, status) == TRUE); 491 REGEX_CHECK_STATUS; 492 } 493 494 495 // 496 // lookingAt(pos, status) 497 // 498 status = U_ZERO_ERROR; 499 m1->reset(instr2); // "not abc" 500 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 501 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 502 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 503 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 504 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 505 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 506 status = U_ZERO_ERROR; 507 len = m1->input().length(); 508 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 509 REGEX_CHECK_STATUS; 510 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 511 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 512 513 delete m1; 514 delete pat2; 515 } 516 517 518 // 519 // Capture Group. 520 // RegexMatcher::start(); 521 // RegexMatcher::end(); 522 // RegexMatcher::groupCount(); 523 // 524 { 525 int32_t flags=0; 526 UParseError pe; 527 UErrorCode status=U_ZERO_ERROR; 528 529 UnicodeString re("01(23(45)67)(.*)"); 530 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 531 REGEX_CHECK_STATUS; 532 UnicodeString data = "0123456789"; 533 534 RegexMatcher *matcher = pat->matcher(data, status); 535 REGEX_CHECK_STATUS; 536 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 537 static const int32_t matchStarts[] = {0, 2, 4, 8}; 538 static const int32_t matchEnds[] = {10, 8, 6, 10}; 539 int32_t i; 540 for (i=0; i<4; i++) { 541 int32_t actualStart = matcher->start(i, status); 542 REGEX_CHECK_STATUS; 543 if (actualStart != matchStarts[i]) { 544 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 545 __LINE__, i, matchStarts[i], actualStart); 546 } 547 int32_t actualEnd = matcher->end(i, status); 548 REGEX_CHECK_STATUS; 549 if (actualEnd != matchEnds[i]) { 550 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n", 551 __LINE__, i, matchEnds[i], actualEnd); 552 } 553 } 554 555 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 556 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 557 558 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 559 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 560 matcher->reset(); 561 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 562 563 matcher->lookingAt(status); 564 REGEX_ASSERT(matcher->group(status) == "0123456789"); 565 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 566 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 567 REGEX_ASSERT(matcher->group(2, status) == "45" ); 568 REGEX_ASSERT(matcher->group(3, status) == "89" ); 569 REGEX_CHECK_STATUS; 570 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 571 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 572 matcher->reset(); 573 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 574 575 delete matcher; 576 delete pat; 577 578 } 579 580 // 581 // find 582 // 583 { 584 int32_t flags=0; 585 UParseError pe; 586 UErrorCode status=U_ZERO_ERROR; 587 588 UnicodeString re("abc"); 589 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 590 REGEX_CHECK_STATUS; 591 UnicodeString data = ".abc..abc...abc.."; 592 // 012345678901234567 593 594 RegexMatcher *matcher = pat->matcher(data, status); 595 REGEX_CHECK_STATUS; 596 REGEX_ASSERT(matcher->find()); 597 REGEX_ASSERT(matcher->start(status) == 1); 598 REGEX_ASSERT(matcher->find()); 599 REGEX_ASSERT(matcher->start(status) == 6); 600 REGEX_ASSERT(matcher->find()); 601 REGEX_ASSERT(matcher->start(status) == 12); 602 REGEX_ASSERT(matcher->find() == FALSE); 603 REGEX_ASSERT(matcher->find() == FALSE); 604 605 matcher->reset(); 606 REGEX_ASSERT(matcher->find()); 607 REGEX_ASSERT(matcher->start(status) == 1); 608 609 REGEX_ASSERT(matcher->find(0, status)); 610 REGEX_ASSERT(matcher->start(status) == 1); 611 REGEX_ASSERT(matcher->find(1, status)); 612 REGEX_ASSERT(matcher->start(status) == 1); 613 REGEX_ASSERT(matcher->find(2, status)); 614 REGEX_ASSERT(matcher->start(status) == 6); 615 REGEX_ASSERT(matcher->find(12, status)); 616 REGEX_ASSERT(matcher->start(status) == 12); 617 REGEX_ASSERT(matcher->find(13, status) == FALSE); 618 REGEX_ASSERT(matcher->find(16, status) == FALSE); 619 REGEX_ASSERT(matcher->find(17, status) == FALSE); 620 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 621 622 status = U_ZERO_ERROR; 623 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 624 status = U_ZERO_ERROR; 625 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 626 627 REGEX_ASSERT(matcher->groupCount() == 0); 628 629 delete matcher; 630 delete pat; 631 } 632 633 634 // 635 // find, with \G in pattern (true if at the end of a previous match). 636 // 637 { 638 int32_t flags=0; 639 UParseError pe; 640 UErrorCode status=U_ZERO_ERROR; 641 642 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 643 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 644 REGEX_CHECK_STATUS; 645 UnicodeString data = ".abcabc.abc.."; 646 // 012345678901234567 647 648 RegexMatcher *matcher = pat->matcher(data, status); 649 REGEX_CHECK_STATUS; 650 REGEX_ASSERT(matcher->find()); 651 REGEX_ASSERT(matcher->start(status) == 0); 652 REGEX_ASSERT(matcher->start(1, status) == -1); 653 REGEX_ASSERT(matcher->start(2, status) == 1); 654 655 REGEX_ASSERT(matcher->find()); 656 REGEX_ASSERT(matcher->start(status) == 4); 657 REGEX_ASSERT(matcher->start(1, status) == 4); 658 REGEX_ASSERT(matcher->start(2, status) == -1); 659 REGEX_CHECK_STATUS; 660 661 delete matcher; 662 delete pat; 663 } 664 665 // 666 // find with zero length matches, match position should bump ahead 667 // to prevent loops. 668 // 669 { 670 int32_t i; 671 UErrorCode status=U_ZERO_ERROR; 672 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 673 // using an always-true look-ahead. 674 REGEX_CHECK_STATUS; 675 UnicodeString s(" "); 676 m.reset(s); 677 for (i=0; ; i++) { 678 if (m.find() == FALSE) { 679 break; 680 } 681 REGEX_ASSERT(m.start(status) == i); 682 REGEX_ASSERT(m.end(status) == i); 683 } 684 REGEX_ASSERT(i==5); 685 686 // Check that the bump goes over surrogate pairs OK 687 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 688 s = s.unescape(); 689 m.reset(s); 690 for (i=0; ; i+=2) { 691 if (m.find() == FALSE) { 692 break; 693 } 694 REGEX_ASSERT(m.start(status) == i); 695 REGEX_ASSERT(m.end(status) == i); 696 } 697 REGEX_ASSERT(i==10); 698 } 699 { 700 // find() loop breaking test. 701 // with pattern of /.?/, should see a series of one char matches, then a single 702 // match of zero length at the end of the input string. 703 int32_t i; 704 UErrorCode status=U_ZERO_ERROR; 705 RegexMatcher m(".?", 0, status); 706 REGEX_CHECK_STATUS; 707 UnicodeString s(" "); 708 m.reset(s); 709 for (i=0; ; i++) { 710 if (m.find() == FALSE) { 711 break; 712 } 713 REGEX_ASSERT(m.start(status) == i); 714 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 715 } 716 REGEX_ASSERT(i==5); 717 } 718 719 720 // 721 // Matchers with no input string behave as if they had an empty input string. 722 // 723 724 { 725 UErrorCode status = U_ZERO_ERROR; 726 RegexMatcher m(".?", 0, status); 727 REGEX_CHECK_STATUS; 728 REGEX_ASSERT(m.find()); 729 REGEX_ASSERT(m.start(status) == 0); 730 REGEX_ASSERT(m.input() == ""); 731 } 732 { 733 UErrorCode status = U_ZERO_ERROR; 734 RegexPattern *p = RegexPattern::compile(".", 0, status); 735 RegexMatcher *m = p->matcher(status); 736 REGEX_CHECK_STATUS; 737 738 REGEX_ASSERT(m->find() == FALSE); 739 REGEX_ASSERT(m->input() == ""); 740 delete m; 741 delete p; 742 } 743 744 // 745 // Regions 746 // 747 { 748 UErrorCode status = U_ZERO_ERROR; 749 UnicodeString testString("This is test data"); 750 RegexMatcher m(".*", testString, 0, status); 751 REGEX_CHECK_STATUS; 752 REGEX_ASSERT(m.regionStart() == 0); 753 REGEX_ASSERT(m.regionEnd() == testString.length()); 754 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 755 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 756 757 m.region(2,4, status); 758 REGEX_CHECK_STATUS; 759 REGEX_ASSERT(m.matches(status)); 760 REGEX_ASSERT(m.start(status)==2); 761 REGEX_ASSERT(m.end(status)==4); 762 REGEX_CHECK_STATUS; 763 764 m.reset(); 765 REGEX_ASSERT(m.regionStart() == 0); 766 REGEX_ASSERT(m.regionEnd() == testString.length()); 767 768 UnicodeString shorterString("short"); 769 m.reset(shorterString); 770 REGEX_ASSERT(m.regionStart() == 0); 771 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 772 773 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 774 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 775 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 776 REGEX_ASSERT(&m == &m.reset()); 777 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 778 779 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 780 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 781 REGEX_ASSERT(&m == &m.reset()); 782 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 783 784 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 785 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 786 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 787 REGEX_ASSERT(&m == &m.reset()); 788 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 789 790 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 791 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 792 REGEX_ASSERT(&m == &m.reset()); 793 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 794 795 } 796 797 // 798 // hitEnd() and requireEnd() 799 // 800 { 801 UErrorCode status = U_ZERO_ERROR; 802 UnicodeString testString("aabb"); 803 RegexMatcher m1(".*", testString, 0, status); 804 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 805 REGEX_ASSERT(m1.hitEnd() == TRUE); 806 REGEX_ASSERT(m1.requireEnd() == FALSE); 807 REGEX_CHECK_STATUS; 808 809 status = U_ZERO_ERROR; 810 RegexMatcher m2("a*", testString, 0, status); 811 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 812 REGEX_ASSERT(m2.hitEnd() == FALSE); 813 REGEX_ASSERT(m2.requireEnd() == FALSE); 814 REGEX_CHECK_STATUS; 815 816 status = U_ZERO_ERROR; 817 RegexMatcher m3(".*$", testString, 0, status); 818 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 819 REGEX_ASSERT(m3.hitEnd() == TRUE); 820 REGEX_ASSERT(m3.requireEnd() == TRUE); 821 REGEX_CHECK_STATUS; 822 } 823 824 825 // 826 // Compilation error on reset with UChar * 827 // These were a hazard that people were stumbling over with runtime errors. 828 // Changed them to compiler errors by adding private methods that more closely 829 // matched the incorrect use of the functions. 830 // 831 #if 0 832 { 833 UErrorCode status = U_ZERO_ERROR; 834 UChar ucharString[20]; 835 RegexMatcher m(".", 0, status); 836 m.reset(ucharString); // should not compile. 837 838 RegexPattern *p = RegexPattern::compile(".", 0, status); 839 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 840 841 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 842 } 843 #endif 844 845 // 846 // Time Outs. 847 // Note: These tests will need to be changed when the regexp engine is 848 // able to detect and cut short the exponential time behavior on 849 // this type of match. 850 // 851 { 852 UErrorCode status = U_ZERO_ERROR; 853 // Enough 'a's in the string to cause the match to time out. 854 // (Each on additonal 'a' doubles the time) 855 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 856 RegexMatcher matcher("(a+)+b", testString, 0, status); 857 REGEX_CHECK_STATUS; 858 REGEX_ASSERT(matcher.getTimeLimit() == 0); 859 matcher.setTimeLimit(100, status); 860 REGEX_ASSERT(matcher.getTimeLimit() == 100); 861 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 862 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 863 } 864 { 865 UErrorCode status = U_ZERO_ERROR; 866 // Few enough 'a's to slip in under the time limit. 867 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 868 RegexMatcher matcher("(a+)+b", testString, 0, status); 869 REGEX_CHECK_STATUS; 870 matcher.setTimeLimit(100, status); 871 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 872 REGEX_CHECK_STATUS; 873 } 874 875 // 876 // Stack Limits 877 // 878 { 879 UErrorCode status = U_ZERO_ERROR; 880 UnicodeString testString(600000, 0x41, 600000); // Length 600,000, filled with 'A' 881 882 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 883 // of the '+', and makes the stack frames larger. 884 RegexMatcher matcher("(A)+A$", testString, 0, status); 885 886 // With the default stack, this match should fail to run 887 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 888 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 889 890 // With unlimited stack, it should run 891 status = U_ZERO_ERROR; 892 matcher.setStackLimit(0, status); 893 REGEX_CHECK_STATUS; 894 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 895 REGEX_CHECK_STATUS; 896 REGEX_ASSERT(matcher.getStackLimit() == 0); 897 898 // With a limited stack, it the match should fail 899 status = U_ZERO_ERROR; 900 matcher.setStackLimit(10000, status); 901 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 902 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 903 REGEX_ASSERT(matcher.getStackLimit() == 10000); 904 } 905 906 // A pattern that doesn't save state should work with 907 // a minimal sized stack 908 { 909 UErrorCode status = U_ZERO_ERROR; 910 UnicodeString testString = "abc"; 911 RegexMatcher matcher("abc", testString, 0, status); 912 REGEX_CHECK_STATUS; 913 matcher.setStackLimit(30, status); 914 REGEX_CHECK_STATUS; 915 REGEX_ASSERT(matcher.matches(status) == TRUE); 916 REGEX_CHECK_STATUS; 917 REGEX_ASSERT(matcher.getStackLimit() == 30); 918 919 // Negative stack sizes should fail 920 status = U_ZERO_ERROR; 921 matcher.setStackLimit(1000, status); 922 REGEX_CHECK_STATUS; 923 matcher.setStackLimit(-1, status); 924 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 925 REGEX_ASSERT(matcher.getStackLimit() == 1000); 926 } 927 928 929 } 930 931 932 933 934 935 936 //--------------------------------------------------------------------------- 937 // 938 // API_Replace API test for class RegexMatcher, testing the 939 // Replace family of functions. 940 // 941 //--------------------------------------------------------------------------- 942 void RegexTest::API_Replace() { 943 // 944 // Replace 945 // 946 int32_t flags=0; 947 UParseError pe; 948 UErrorCode status=U_ZERO_ERROR; 949 950 UnicodeString re("abc"); 951 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 952 REGEX_CHECK_STATUS; 953 UnicodeString data = ".abc..abc...abc.."; 954 // 012345678901234567 955 RegexMatcher *matcher = pat->matcher(data, status); 956 957 // 958 // Plain vanilla matches. 959 // 960 UnicodeString dest; 961 dest = matcher->replaceFirst("yz", status); 962 REGEX_CHECK_STATUS; 963 REGEX_ASSERT(dest == ".yz..abc...abc.."); 964 965 dest = matcher->replaceAll("yz", status); 966 REGEX_CHECK_STATUS; 967 REGEX_ASSERT(dest == ".yz..yz...yz.."); 968 969 // 970 // Plain vanilla non-matches. 971 // 972 UnicodeString d2 = ".abx..abx...abx.."; 973 matcher->reset(d2); 974 dest = matcher->replaceFirst("yz", status); 975 REGEX_CHECK_STATUS; 976 REGEX_ASSERT(dest == ".abx..abx...abx.."); 977 978 dest = matcher->replaceAll("yz", status); 979 REGEX_CHECK_STATUS; 980 REGEX_ASSERT(dest == ".abx..abx...abx.."); 981 982 // 983 // Empty source string 984 // 985 UnicodeString d3 = ""; 986 matcher->reset(d3); 987 dest = matcher->replaceFirst("yz", status); 988 REGEX_CHECK_STATUS; 989 REGEX_ASSERT(dest == ""); 990 991 dest = matcher->replaceAll("yz", status); 992 REGEX_CHECK_STATUS; 993 REGEX_ASSERT(dest == ""); 994 995 // 996 // Empty substitution string 997 // 998 matcher->reset(data); // ".abc..abc...abc.." 999 dest = matcher->replaceFirst("", status); 1000 REGEX_CHECK_STATUS; 1001 REGEX_ASSERT(dest == "...abc...abc.."); 1002 1003 dest = matcher->replaceAll("", status); 1004 REGEX_CHECK_STATUS; 1005 REGEX_ASSERT(dest == "........"); 1006 1007 // 1008 // match whole string 1009 // 1010 UnicodeString d4 = "abc"; 1011 matcher->reset(d4); 1012 dest = matcher->replaceFirst("xyz", status); 1013 REGEX_CHECK_STATUS; 1014 REGEX_ASSERT(dest == "xyz"); 1015 1016 dest = matcher->replaceAll("xyz", status); 1017 REGEX_CHECK_STATUS; 1018 REGEX_ASSERT(dest == "xyz"); 1019 1020 // 1021 // Capture Group, simple case 1022 // 1023 UnicodeString re2("a(..)"); 1024 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1025 REGEX_CHECK_STATUS; 1026 UnicodeString d5 = "abcdefg"; 1027 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1028 REGEX_CHECK_STATUS; 1029 dest = matcher2->replaceFirst("$1$1", status); 1030 REGEX_CHECK_STATUS; 1031 REGEX_ASSERT(dest == "bcbcdefg"); 1032 1033 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1034 REGEX_CHECK_STATUS; 1035 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1036 1037 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1038 REGEX_CHECK_STATUS; 1039 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1040 1041 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1042 replacement = replacement.unescape(); 1043 dest = matcher2->replaceFirst(replacement, status); 1044 REGEX_CHECK_STATUS; 1045 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1046 1047 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1048 1049 1050 // 1051 // Replacement String with \u hex escapes 1052 // 1053 { 1054 UnicodeString src = "abc 1 abc 2 abc 3"; 1055 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1056 matcher->reset(src); 1057 UnicodeString result = matcher->replaceAll(substitute, status); 1058 REGEX_CHECK_STATUS; 1059 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1060 } 1061 { 1062 UnicodeString src = "abc !"; 1063 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1064 matcher->reset(src); 1065 UnicodeString result = matcher->replaceAll(substitute, status); 1066 REGEX_CHECK_STATUS; 1067 UnicodeString expected = UnicodeString("--"); 1068 expected.append((UChar32)0x10000); 1069 expected.append("-- !"); 1070 REGEX_ASSERT(result == expected); 1071 } 1072 // TODO: need more through testing of capture substitutions. 1073 1074 // Bug 4057 1075 // 1076 { 1077 status = U_ZERO_ERROR; 1078 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1079 RegexMatcher m("ss(.*?)ee", 0, status); 1080 REGEX_CHECK_STATUS; 1081 UnicodeString result; 1082 1083 // Multiple finds do NOT bump up the previous appendReplacement postion. 1084 m.reset(s); 1085 m.find(); 1086 m.find(); 1087 m.appendReplacement(result, "ooh", status); 1088 REGEX_CHECK_STATUS; 1089 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1090 1091 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1092 status = U_ZERO_ERROR; 1093 result.truncate(0); 1094 m.reset(10, status); 1095 m.find(); 1096 m.find(); 1097 m.appendReplacement(result, "ooh", status); 1098 REGEX_CHECK_STATUS; 1099 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1100 1101 // find() at interior of string, appendReplacemnt still starts at beginning. 1102 status = U_ZERO_ERROR; 1103 result.truncate(0); 1104 m.reset(); 1105 m.find(10, status); 1106 m.find(); 1107 m.appendReplacement(result, "ooh", status); 1108 REGEX_CHECK_STATUS; 1109 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1110 1111 m.appendTail(result); 1112 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1113 1114 } 1115 1116 delete matcher2; 1117 delete pat2; 1118 delete matcher; 1119 delete pat; 1120 } 1121 1122 1123 //--------------------------------------------------------------------------- 1124 // 1125 // API_Pattern Test that the API for class RegexPattern is 1126 // present and nominally working. 1127 // 1128 //--------------------------------------------------------------------------- 1129 void RegexTest::API_Pattern() { 1130 RegexPattern pata; // Test default constructor to not crash. 1131 RegexPattern patb; 1132 1133 REGEX_ASSERT(pata == patb); 1134 REGEX_ASSERT(pata == pata); 1135 1136 UnicodeString re1("abc[a-l][m-z]"); 1137 UnicodeString re2("def"); 1138 UErrorCode status = U_ZERO_ERROR; 1139 UParseError pe; 1140 1141 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1142 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1143 REGEX_CHECK_STATUS; 1144 REGEX_ASSERT(*pat1 == *pat1); 1145 REGEX_ASSERT(*pat1 != pata); 1146 1147 // Assign 1148 patb = *pat1; 1149 REGEX_ASSERT(patb == *pat1); 1150 1151 // Copy Construct 1152 RegexPattern patc(*pat1); 1153 REGEX_ASSERT(patc == *pat1); 1154 REGEX_ASSERT(patb == patc); 1155 REGEX_ASSERT(pat1 != pat2); 1156 patb = *pat2; 1157 REGEX_ASSERT(patb != patc); 1158 REGEX_ASSERT(patb == *pat2); 1159 1160 // Compile with no flags. 1161 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1162 REGEX_ASSERT(*pat1a == *pat1); 1163 1164 REGEX_ASSERT(pat1a->flags() == 0); 1165 1166 // Compile with different flags should be not equal 1167 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1168 REGEX_CHECK_STATUS; 1169 1170 REGEX_ASSERT(*pat1b != *pat1a); 1171 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1172 REGEX_ASSERT(pat1a->flags() == 0); 1173 delete pat1b; 1174 1175 // clone 1176 RegexPattern *pat1c = pat1->clone(); 1177 REGEX_ASSERT(*pat1c == *pat1); 1178 REGEX_ASSERT(*pat1c != *pat2); 1179 1180 delete pat1c; 1181 delete pat1a; 1182 delete pat1; 1183 delete pat2; 1184 1185 1186 // 1187 // Verify that a matcher created from a cloned pattern works. 1188 // (Jitterbug 3423) 1189 // 1190 { 1191 UErrorCode status = U_ZERO_ERROR; 1192 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1193 RegexPattern *pClone = pSource->clone(); 1194 delete pSource; 1195 RegexMatcher *mFromClone = pClone->matcher(status); 1196 REGEX_CHECK_STATUS; 1197 UnicodeString s = "Hello World"; 1198 mFromClone->reset(s); 1199 REGEX_ASSERT(mFromClone->find() == TRUE); 1200 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1201 REGEX_ASSERT(mFromClone->find() == TRUE); 1202 REGEX_ASSERT(mFromClone->group(status) == "World"); 1203 REGEX_ASSERT(mFromClone->find() == FALSE); 1204 delete mFromClone; 1205 delete pClone; 1206 } 1207 1208 // 1209 // matches convenience API 1210 // 1211 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1212 REGEX_CHECK_STATUS; 1213 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1214 REGEX_CHECK_STATUS; 1215 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1216 REGEX_CHECK_STATUS; 1217 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1218 REGEX_CHECK_STATUS; 1219 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1220 REGEX_CHECK_STATUS; 1221 status = U_INDEX_OUTOFBOUNDS_ERROR; 1222 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1223 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1224 1225 1226 // 1227 // Split() 1228 // 1229 status = U_ZERO_ERROR; 1230 pat1 = RegexPattern::compile(" +", pe, status); 1231 REGEX_CHECK_STATUS; 1232 UnicodeString fields[10]; 1233 1234 int32_t n; 1235 n = pat1->split("Now is the time", fields, 10, status); 1236 REGEX_CHECK_STATUS; 1237 REGEX_ASSERT(n==4); 1238 REGEX_ASSERT(fields[0]=="Now"); 1239 REGEX_ASSERT(fields[1]=="is"); 1240 REGEX_ASSERT(fields[2]=="the"); 1241 REGEX_ASSERT(fields[3]=="time"); 1242 REGEX_ASSERT(fields[4]==""); 1243 1244 n = pat1->split("Now is the time", fields, 2, status); 1245 REGEX_CHECK_STATUS; 1246 REGEX_ASSERT(n==2); 1247 REGEX_ASSERT(fields[0]=="Now"); 1248 REGEX_ASSERT(fields[1]=="is the time"); 1249 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1250 1251 fields[1] = "*"; 1252 status = U_ZERO_ERROR; 1253 n = pat1->split("Now is the time", fields, 1, status); 1254 REGEX_CHECK_STATUS; 1255 REGEX_ASSERT(n==1); 1256 REGEX_ASSERT(fields[0]=="Now is the time"); 1257 REGEX_ASSERT(fields[1]=="*"); 1258 status = U_ZERO_ERROR; 1259 1260 n = pat1->split(" Now is the time ", fields, 10, status); 1261 REGEX_CHECK_STATUS; 1262 REGEX_ASSERT(n==5); 1263 REGEX_ASSERT(fields[0]==""); 1264 REGEX_ASSERT(fields[1]=="Now"); 1265 REGEX_ASSERT(fields[2]=="is"); 1266 REGEX_ASSERT(fields[3]=="the"); 1267 REGEX_ASSERT(fields[4]=="time"); 1268 REGEX_ASSERT(fields[5]==""); 1269 1270 n = pat1->split(" ", fields, 10, status); 1271 REGEX_CHECK_STATUS; 1272 REGEX_ASSERT(n==1); 1273 REGEX_ASSERT(fields[0]==""); 1274 1275 fields[0] = "foo"; 1276 n = pat1->split("", fields, 10, status); 1277 REGEX_CHECK_STATUS; 1278 REGEX_ASSERT(n==0); 1279 REGEX_ASSERT(fields[0]=="foo"); 1280 1281 delete pat1; 1282 1283 // split, with a pattern with (capture) 1284 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1285 REGEX_CHECK_STATUS; 1286 1287 status = U_ZERO_ERROR; 1288 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1289 REGEX_CHECK_STATUS; 1290 REGEX_ASSERT(n==6); 1291 REGEX_ASSERT(fields[0]==""); 1292 REGEX_ASSERT(fields[1]=="a"); 1293 REGEX_ASSERT(fields[2]=="Now is "); 1294 REGEX_ASSERT(fields[3]=="b"); 1295 REGEX_ASSERT(fields[4]=="the time"); 1296 REGEX_ASSERT(fields[5]=="c"); 1297 REGEX_ASSERT(fields[6]==""); 1298 REGEX_ASSERT(status==U_ZERO_ERROR); 1299 1300 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1301 REGEX_CHECK_STATUS; 1302 REGEX_ASSERT(n==6); 1303 REGEX_ASSERT(fields[0]==" "); 1304 REGEX_ASSERT(fields[1]=="a"); 1305 REGEX_ASSERT(fields[2]=="Now is "); 1306 REGEX_ASSERT(fields[3]=="b"); 1307 REGEX_ASSERT(fields[4]=="the time"); 1308 REGEX_ASSERT(fields[5]=="c"); 1309 REGEX_ASSERT(fields[6]==""); 1310 1311 status = U_ZERO_ERROR; 1312 fields[6] = "foo"; 1313 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1314 REGEX_CHECK_STATUS; 1315 REGEX_ASSERT(n==6); 1316 REGEX_ASSERT(fields[0]==" "); 1317 REGEX_ASSERT(fields[1]=="a"); 1318 REGEX_ASSERT(fields[2]=="Now is "); 1319 REGEX_ASSERT(fields[3]=="b"); 1320 REGEX_ASSERT(fields[4]=="the time"); 1321 REGEX_ASSERT(fields[5]=="c"); 1322 REGEX_ASSERT(fields[6]=="foo"); 1323 1324 status = U_ZERO_ERROR; 1325 fields[5] = "foo"; 1326 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1327 REGEX_CHECK_STATUS; 1328 REGEX_ASSERT(n==5); 1329 REGEX_ASSERT(fields[0]==" "); 1330 REGEX_ASSERT(fields[1]=="a"); 1331 REGEX_ASSERT(fields[2]=="Now is "); 1332 REGEX_ASSERT(fields[3]=="b"); 1333 REGEX_ASSERT(fields[4]=="the time<c>"); 1334 REGEX_ASSERT(fields[5]=="foo"); 1335 1336 status = U_ZERO_ERROR; 1337 fields[5] = "foo"; 1338 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1339 REGEX_CHECK_STATUS; 1340 REGEX_ASSERT(n==5); 1341 REGEX_ASSERT(fields[0]==" "); 1342 REGEX_ASSERT(fields[1]=="a"); 1343 REGEX_ASSERT(fields[2]=="Now is "); 1344 REGEX_ASSERT(fields[3]=="b"); 1345 REGEX_ASSERT(fields[4]=="the time"); 1346 REGEX_ASSERT(fields[5]=="foo"); 1347 1348 status = U_ZERO_ERROR; 1349 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1350 REGEX_CHECK_STATUS; 1351 REGEX_ASSERT(n==4); 1352 REGEX_ASSERT(fields[0]==" "); 1353 REGEX_ASSERT(fields[1]=="a"); 1354 REGEX_ASSERT(fields[2]=="Now is "); 1355 REGEX_ASSERT(fields[3]=="the time<c>"); 1356 status = U_ZERO_ERROR; 1357 delete pat1; 1358 1359 pat1 = RegexPattern::compile("([-,])", pe, status); 1360 REGEX_CHECK_STATUS; 1361 n = pat1->split("1-10,20", fields, 10, status); 1362 REGEX_CHECK_STATUS; 1363 REGEX_ASSERT(n==5); 1364 REGEX_ASSERT(fields[0]=="1"); 1365 REGEX_ASSERT(fields[1]=="-"); 1366 REGEX_ASSERT(fields[2]=="10"); 1367 REGEX_ASSERT(fields[3]==","); 1368 REGEX_ASSERT(fields[4]=="20"); 1369 delete pat1; 1370 1371 1372 // 1373 // RegexPattern::pattern() 1374 // 1375 pat1 = new RegexPattern(); 1376 REGEX_ASSERT(pat1->pattern() == ""); 1377 delete pat1; 1378 1379 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1380 REGEX_CHECK_STATUS; 1381 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1382 delete pat1; 1383 1384 1385 // 1386 // classID functions 1387 // 1388 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1389 REGEX_CHECK_STATUS; 1390 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1391 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1392 UnicodeString Hello("Hello, world."); 1393 RegexMatcher *m = pat1->matcher(Hello, status); 1394 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1395 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1396 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1397 delete m; 1398 delete pat1; 1399 1400 } 1401 1402 //--------------------------------------------------------------------------- 1403 // 1404 // Extended A more thorough check for features of regex patterns 1405 // The test cases are in a separate data file, 1406 // source/tests/testdata/regextst.txt 1407 // A description of the test data format is included in that file. 1408 // 1409 //--------------------------------------------------------------------------- 1410 1411 const char * 1412 RegexTest::getPath(char buffer[2048], const char *filename) { 1413 UErrorCode status=U_ZERO_ERROR; 1414 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1415 if (U_FAILURE(status)) { 1416 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 1417 return NULL; 1418 } 1419 1420 strcpy(buffer, testDataDirectory); 1421 strcat(buffer, filename); 1422 return buffer; 1423 } 1424 1425 void RegexTest::Extended() { 1426 char tdd[2048]; 1427 const char *srcPath; 1428 UErrorCode status = U_ZERO_ERROR; 1429 int32_t lineNum = 0; 1430 1431 // 1432 // Open and read the test data file. 1433 // 1434 srcPath=getPath(tdd, "regextst.txt"); 1435 if(srcPath==NULL) { 1436 return; /* something went wrong, error already output */ 1437 } 1438 1439 int32_t len; 1440 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 1441 if (U_FAILURE(status)) { 1442 return; /* something went wrong, error already output */ 1443 } 1444 1445 // 1446 // Put the test data into a UnicodeString 1447 // 1448 UnicodeString testString(FALSE, testData, len); 1449 1450 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 1451 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 1452 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 1453 1454 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 1455 UnicodeString testPattern; // The pattern for test from the test file. 1456 UnicodeString testFlags; // the flags for a test. 1457 UnicodeString matchString; // The marked up string to be used as input 1458 1459 if (U_FAILURE(status)){ 1460 dataerrln("Construct RegexMatcher() error."); 1461 delete [] testData; 1462 return; 1463 } 1464 1465 // 1466 // Loop over the test data file, once per line. 1467 // 1468 while (lineMat.find()) { 1469 lineNum++; 1470 if (U_FAILURE(status)) { 1471 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 1472 } 1473 1474 status = U_ZERO_ERROR; 1475 UnicodeString testLine = lineMat.group(1, status); 1476 if (testLine.length() == 0) { 1477 continue; 1478 } 1479 1480 // 1481 // Parse the test line. Skip blank and comment only lines. 1482 // Separate out the three main fields - pattern, flags, target. 1483 // 1484 1485 commentMat.reset(testLine); 1486 if (commentMat.lookingAt(status)) { 1487 // This line is a comment, or blank. 1488 continue; 1489 } 1490 1491 // 1492 // Pull out the pattern field, remove it from the test file line. 1493 // 1494 quotedStuffMat.reset(testLine); 1495 if (quotedStuffMat.lookingAt(status)) { 1496 testPattern = quotedStuffMat.group(2, status); 1497 testLine.remove(0, quotedStuffMat.end(0, status)); 1498 } else { 1499 errln("Bad pattern (missing quotes?) at test file line %d", lineNum); 1500 continue; 1501 } 1502 1503 1504 // 1505 // Pull out the flags from the test file line. 1506 // 1507 flagsMat.reset(testLine); 1508 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 1509 testFlags = flagsMat.group(1, status); 1510 if (flagsMat.group(2, status).length() > 0) { 1511 errln("Bad Match flag at line %d. Scanning %c\n", 1512 lineNum, flagsMat.group(2, status).charAt(0)); 1513 continue; 1514 } 1515 testLine.remove(0, flagsMat.end(0, status)); 1516 1517 // 1518 // Pull out the match string, as a whole. 1519 // We'll process the <tags> later. 1520 // 1521 quotedStuffMat.reset(testLine); 1522 if (quotedStuffMat.lookingAt(status)) { 1523 matchString = quotedStuffMat.group(2, status); 1524 testLine.remove(0, quotedStuffMat.end(0, status)); 1525 } else { 1526 errln("Bad match string at test file line %d", lineNum); 1527 continue; 1528 } 1529 1530 // 1531 // The only thing left from the input line should be an optional trailing comment. 1532 // 1533 commentMat.reset(testLine); 1534 if (commentMat.lookingAt(status) == FALSE) { 1535 errln("Line %d: unexpected characters at end of test line.", lineNum); 1536 continue; 1537 } 1538 1539 // 1540 // Run the test 1541 // 1542 regex_find(testPattern, testFlags, matchString, lineNum); 1543 } 1544 1545 delete [] testData; 1546 1547 } 1548 1549 1550 1551 //--------------------------------------------------------------------------- 1552 // 1553 // regex_find(pattern, flags, inputString, lineNumber) 1554 // 1555 // Function to run a single test from the Extended (data driven) tests. 1556 // See file test/testdata/regextst.txt for a description of the 1557 // pattern and inputString fields, and the allowed flags. 1558 // lineNumber is the source line in regextst.txt of the test. 1559 // 1560 //--------------------------------------------------------------------------- 1561 1562 1563 // Set a value into a UVector at position specified by a decimal number in 1564 // a UnicodeString. This is a utility function needed by the actual test function, 1565 // which follows. 1566 static void set(UVector &vec, int32_t val, UnicodeString index) { 1567 UErrorCode status=U_ZERO_ERROR; 1568 int32_t idx = 0; 1569 for (int32_t i=0; i<index.length(); i++) { 1570 int32_t d=u_charDigitValue(index.charAt(i)); 1571 if (d<0) {return;} 1572 idx = idx*10 + d; 1573 } 1574 while (vec.size()<idx+1) {vec.addElement(-1, status);} 1575 vec.setElementAt(val, idx); 1576 } 1577 1578 void RegexTest::regex_find(const UnicodeString &pattern, 1579 const UnicodeString &flags, 1580 const UnicodeString &inputString, 1581 int32_t line) { 1582 UnicodeString unEscapedInput; 1583 UnicodeString deTaggedInput; 1584 1585 UErrorCode status = U_ZERO_ERROR; 1586 UParseError pe; 1587 RegexPattern *parsePat = NULL; 1588 RegexMatcher *parseMatcher = NULL; 1589 RegexPattern *callerPattern = NULL; 1590 RegexMatcher *matcher = NULL; 1591 UVector groupStarts(status); 1592 UVector groupEnds(status); 1593 UBool isMatch = FALSE; 1594 UBool failed = FALSE; 1595 int32_t numFinds; 1596 int32_t i; 1597 UBool useMatchesFunc = FALSE; 1598 UBool useLookingAtFunc = FALSE; 1599 int32_t regionStart = -1; 1600 int32_t regionEnd = -1; 1601 1602 // 1603 // Compile the caller's pattern 1604 // 1605 uint32_t bflags = 0; 1606 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 1607 bflags |= UREGEX_CASE_INSENSITIVE; 1608 } 1609 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 1610 bflags |= UREGEX_COMMENTS; 1611 } 1612 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 1613 bflags |= UREGEX_DOTALL; 1614 } 1615 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 1616 bflags |= UREGEX_MULTILINE; 1617 } 1618 1619 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 1620 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 1621 } 1622 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 1623 bflags |= UREGEX_UNIX_LINES; 1624 } 1625 1626 1627 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 1628 if (status != U_ZERO_ERROR) { 1629 #if UCONFIG_NO_BREAK_ITERATION==1 1630 // 'v' test flag means that the test pattern should not compile if ICU was configured 1631 // to not include break iteration. RBBI is needed for Unicode word boundaries. 1632 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 1633 goto cleanupAndReturn; 1634 } 1635 #endif 1636 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 1637 // Expected pattern compilation error. 1638 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 1639 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 1640 } 1641 goto cleanupAndReturn; 1642 } else { 1643 // Unexpected pattern compilation error. 1644 errln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 1645 goto cleanupAndReturn; 1646 } 1647 } 1648 1649 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 1650 RegexPatternDump(callerPattern); 1651 } 1652 1653 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 1654 errln("Expected, but did not get, a pattern compilation error."); 1655 goto cleanupAndReturn; 1656 } 1657 1658 1659 // 1660 // Number of times find() should be called on the test string, default to 1 1661 // 1662 numFinds = 1; 1663 for (i=2; i<=9; i++) { 1664 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 1665 if (numFinds != 1) { 1666 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 1667 goto cleanupAndReturn; 1668 } 1669 numFinds = i; 1670 } 1671 } 1672 1673 // 'M' flag. Use matches() instead of find() 1674 if (flags.indexOf((UChar)0x4d) >= 0) { 1675 useMatchesFunc = TRUE; 1676 } 1677 if (flags.indexOf((UChar)0x4c) >= 0) { 1678 useLookingAtFunc = TRUE; 1679 } 1680 1681 // 1682 // Find the tags in the input data, remove them, and record the group boundary 1683 // positions. 1684 // 1685 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 1686 REGEX_CHECK_STATUS_L(line); 1687 1688 unEscapedInput = inputString.unescape(); 1689 parseMatcher = parsePat->matcher(unEscapedInput, status); 1690 REGEX_CHECK_STATUS_L(line); 1691 while(parseMatcher->find()) { 1692 parseMatcher->appendReplacement(deTaggedInput, "", status); 1693 REGEX_CHECK_STATUS; 1694 UnicodeString groupNum = parseMatcher->group(2, status); 1695 if (groupNum == "r") { 1696 // <r> or </r>, a region specification within the string 1697 if (parseMatcher->group(1, status) == "/") { 1698 regionEnd = deTaggedInput.length(); 1699 } else { 1700 regionStart = deTaggedInput.length(); 1701 } 1702 } else { 1703 // <digits> or </digits>, a group match boundary tag. 1704 if (parseMatcher->group(1, status) == "/") { 1705 set(groupEnds, deTaggedInput.length(), groupNum); 1706 } else { 1707 set(groupStarts, deTaggedInput.length(), groupNum); 1708 } 1709 } 1710 } 1711 parseMatcher->appendTail(deTaggedInput); 1712 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 1713 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 1714 errln("mismatched <r> tags"); 1715 failed = TRUE; 1716 goto cleanupAndReturn; 1717 } 1718 1719 1720 // 1721 // Configure the matcher according to the flags specified with this test. 1722 // 1723 matcher = callerPattern->matcher(deTaggedInput, status); 1724 REGEX_CHECK_STATUS_L(line); 1725 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 1726 matcher->setTrace(TRUE); 1727 } 1728 if (regionStart>=0) { 1729 matcher->region(regionStart, regionEnd, status); 1730 REGEX_CHECK_STATUS_L(line); 1731 } 1732 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 1733 matcher->useAnchoringBounds(FALSE); 1734 } 1735 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 1736 matcher->useTransparentBounds(TRUE); 1737 } 1738 1739 1740 1741 // 1742 // Do a find on the de-tagged input using the caller's pattern 1743 // TODO: error on count>1 and not find(). 1744 // error on both matches() and lookingAt(). 1745 // 1746 for (i=0; i<numFinds; i++) { 1747 if (useMatchesFunc) { 1748 isMatch = matcher->matches(status); 1749 } else if (useLookingAtFunc) { 1750 isMatch = matcher->lookingAt(status); 1751 } else { 1752 isMatch = matcher->find(); 1753 } 1754 } 1755 matcher->setTrace(FALSE); 1756 1757 // 1758 // Match up the groups from the find() with the groups from the tags 1759 // 1760 1761 // number of tags should match number of groups from find operation. 1762 // matcher->groupCount does not include group 0, the entire match, hence the +1. 1763 // G option in test means that capture group data is not available in the 1764 // expected results, so the check needs to be suppressed. 1765 if (isMatch == FALSE && groupStarts.size() != 0) { 1766 errln("Error at line %d: Match expected, but none found.\n", line); 1767 failed = TRUE; 1768 goto cleanupAndReturn; 1769 } 1770 1771 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 1772 // Only check for match / no match. Don't check capture groups. 1773 if (isMatch && groupStarts.size() == 0) { 1774 errln("Error at line %d: No match expected, but one found.\n", line); 1775 failed = TRUE; 1776 } 1777 goto cleanupAndReturn; 1778 } 1779 1780 for (i=0; i<=matcher->groupCount(); i++) { 1781 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 1782 if (matcher->start(i, status) != expectedStart) { 1783 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 1784 line, i, expectedStart, matcher->start(i, status)); 1785 failed = TRUE; 1786 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 1787 } 1788 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 1789 if (matcher->end(i, status) != expectedEnd) { 1790 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 1791 line, i, expectedEnd, matcher->end(i, status)); 1792 failed = TRUE; 1793 // Error on end position; keep going; real error is probably yet to come as group 1794 // end positions work from end of the input data towards the front. 1795 } 1796 } 1797 if ( matcher->groupCount()+1 < groupStarts.size()) { 1798 errln("Error at line %d: Expected %d capture groups, found %d.", 1799 line, groupStarts.size()-1, matcher->groupCount()); 1800 failed = TRUE; 1801 } 1802 1803 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 1804 matcher->requireEnd() == TRUE) { 1805 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 1806 failed = TRUE; 1807 } 1808 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 1809 matcher->requireEnd() == FALSE) { 1810 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 1811 failed = TRUE; 1812 } 1813 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 1814 matcher->hitEnd() == TRUE) { 1815 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 1816 failed = TRUE; 1817 } 1818 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 1819 matcher->hitEnd() == FALSE) { 1820 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 1821 failed = TRUE; 1822 } 1823 1824 1825 cleanupAndReturn: 1826 if (failed) { 1827 errln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 1828 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 1829 // callerPattern->dump(); 1830 } 1831 delete parseMatcher; 1832 delete parsePat; 1833 delete matcher; 1834 delete callerPattern; 1835 } 1836 1837 1838 1839 1840 //--------------------------------------------------------------------------- 1841 // 1842 // Errors Check for error handling in patterns. 1843 // 1844 //--------------------------------------------------------------------------- 1845 void RegexTest::Errors() { 1846 // \escape sequences that aren't implemented yet. 1847 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 1848 1849 // Missing close parentheses 1850 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 1851 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 1852 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 1853 1854 // Extra close paren 1855 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 1856 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 1857 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 1858 1859 // Look-ahead, Look-behind 1860 // TODO: add tests for unbounded length look-behinds. 1861 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 1862 1863 // Attempt to use non-default flags 1864 { 1865 UParseError pe; 1866 UErrorCode status = U_ZERO_ERROR; 1867 int32_t flags = UREGEX_CANON_EQ | 1868 UREGEX_COMMENTS | UREGEX_DOTALL | 1869 UREGEX_MULTILINE; 1870 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 1871 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 1872 delete pat1; 1873 } 1874 1875 1876 // Quantifiers are allowed only after something that can be quantified. 1877 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 1878 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 1879 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 1880 1881 // Mal-formed {min,max} quantifiers 1882 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 1883 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 1884 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 1885 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 1886 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 1887 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 1888 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 1889 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 1890 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 1891 1892 // Ticket 5389 1893 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 1894 1895 // Invalid Back Reference \0 1896 // For ICU 3.8 and earlier 1897 // For ICU versions newer than 3.8, \0 introduces an octal escape. 1898 // 1899 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 1900 1901 } 1902 1903 1904 //------------------------------------------------------------------------------- 1905 // 1906 // Read a text data file, convert it to UChars, and return the data 1907 // in one big UChar * buffer, which the caller must delete. 1908 // 1909 //-------------------------------------------------------------------------------- 1910 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 1911 const char *defEncoding, UErrorCode &status) { 1912 UChar *retPtr = NULL; 1913 char *fileBuf = NULL; 1914 UConverter* conv = NULL; 1915 FILE *f = NULL; 1916 1917 ulen = 0; 1918 if (U_FAILURE(status)) { 1919 return retPtr; 1920 } 1921 1922 // 1923 // Open the file. 1924 // 1925 f = fopen(fileName, "rb"); 1926 if (f == 0) { 1927 dataerrln("Error opening test data file %s\n", fileName); 1928 status = U_FILE_ACCESS_ERROR; 1929 return NULL; 1930 } 1931 // 1932 // Read it in 1933 // 1934 int32_t fileSize; 1935 int32_t amt_read; 1936 1937 fseek( f, 0, SEEK_END); 1938 fileSize = ftell(f); 1939 fileBuf = new char[fileSize]; 1940 fseek(f, 0, SEEK_SET); 1941 amt_read = fread(fileBuf, 1, fileSize, f); 1942 if (amt_read != fileSize || fileSize <= 0) { 1943 errln("Error reading test data file."); 1944 goto cleanUpAndReturn; 1945 } 1946 1947 // 1948 // Look for a Unicode Signature (BOM) on the data just read 1949 // 1950 int32_t signatureLength; 1951 const char * fileBufC; 1952 const char* encoding; 1953 1954 fileBufC = fileBuf; 1955 encoding = ucnv_detectUnicodeSignature( 1956 fileBuf, fileSize, &signatureLength, &status); 1957 if(encoding!=NULL ){ 1958 fileBufC += signatureLength; 1959 fileSize -= signatureLength; 1960 } else { 1961 encoding = defEncoding; 1962 if (strcmp(encoding, "utf-8") == 0) { 1963 errln("file %s is missing its BOM", fileName); 1964 } 1965 } 1966 1967 // 1968 // Open a converter to take the rule file to UTF-16 1969 // 1970 conv = ucnv_open(encoding, &status); 1971 if (U_FAILURE(status)) { 1972 goto cleanUpAndReturn; 1973 } 1974 1975 // 1976 // Convert the rules to UChar. 1977 // Preflight first to determine required buffer size. 1978 // 1979 ulen = ucnv_toUChars(conv, 1980 NULL, // dest, 1981 0, // destCapacity, 1982 fileBufC, 1983 fileSize, 1984 &status); 1985 if (status == U_BUFFER_OVERFLOW_ERROR) { 1986 // Buffer Overflow is expected from the preflight operation. 1987 status = U_ZERO_ERROR; 1988 1989 retPtr = new UChar[ulen+1]; 1990 ucnv_toUChars(conv, 1991 retPtr, // dest, 1992 ulen+1, 1993 fileBufC, 1994 fileSize, 1995 &status); 1996 } 1997 1998 cleanUpAndReturn: 1999 fclose(f); 2000 delete[] fileBuf; 2001 ucnv_close(conv); 2002 if (U_FAILURE(status)) { 2003 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2004 delete retPtr; 2005 retPtr = 0; 2006 ulen = 0; 2007 }; 2008 return retPtr; 2009 } 2010 2011 2012 //------------------------------------------------------------------------------- 2013 // 2014 // PerlTests - Run Perl's regular expression tests 2015 // The input file for this test is re_tests, the standard regular 2016 // expression test data distributed with the Perl source code. 2017 // 2018 // Here is Perl's description of the test data file: 2019 // 2020 // # The tests are in a separate file 't/op/re_tests'. 2021 // # Each line in that file is a separate test. 2022 // # There are five columns, separated by tabs. 2023 // # 2024 // # Column 1 contains the pattern, optionally enclosed in C<''>. 2025 // # Modifiers can be put after the closing C<'>. 2026 // # 2027 // # Column 2 contains the string to be matched. 2028 // # 2029 // # Column 3 contains the expected result: 2030 // # y expect a match 2031 // # n expect no match 2032 // # c expect an error 2033 // # B test exposes a known bug in Perl, should be skipped 2034 // # b test exposes a known bug in Perl, should be skipped if noamp 2035 // # 2036 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 2037 // # 2038 // # Column 4 contains a string, usually C<$&>. 2039 // # 2040 // # Column 5 contains the expected result of double-quote 2041 // # interpolating that string after the match, or start of error message. 2042 // # 2043 // # Column 6, if present, contains a reason why the test is skipped. 2044 // # This is printed with "skipped", for harness to pick up. 2045 // # 2046 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 2047 // # 2048 // # If you want to add a regular expression test that can't be expressed 2049 // # in this format, don't add it here: put it in op/pat.t instead. 2050 // 2051 // For ICU, if field 3 contains an 'i', the test will be skipped. 2052 // The test exposes is some known incompatibility between ICU and Perl regexps. 2053 // (The i is in addition to whatever was there before.) 2054 // 2055 //------------------------------------------------------------------------------- 2056 void RegexTest::PerlTests() { 2057 char tdd[2048]; 2058 const char *srcPath; 2059 UErrorCode status = U_ZERO_ERROR; 2060 UParseError pe; 2061 2062 // 2063 // Open and read the test data file. 2064 // 2065 srcPath=getPath(tdd, "re_tests.txt"); 2066 if(srcPath==NULL) { 2067 return; /* something went wrong, error already output */ 2068 } 2069 2070 int32_t len; 2071 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 2072 if (U_FAILURE(status)) { 2073 return; /* something went wrong, error already output */ 2074 } 2075 2076 // 2077 // Put the test data into a UnicodeString 2078 // 2079 UnicodeString testDataString(FALSE, testData, len); 2080 2081 // 2082 // Regex to break the input file into lines, and strip the new lines. 2083 // One line per match, capture group one is the desired data. 2084 // 2085 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 2086 if (U_FAILURE(status)) { 2087 dataerrln("RegexPattern::compile() error"); 2088 return; 2089 } 2090 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 2091 2092 // 2093 // Regex to split a test file line into fields. 2094 // There are six fields, separated by tabs. 2095 // 2096 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 2097 2098 // 2099 // Regex to identify test patterns with flag settings, and to separate them. 2100 // Test patterns with flags look like 'pattern'i 2101 // Test patterns without flags are not quoted: pattern 2102 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 2103 // 2104 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 2105 RegexMatcher* flagMat = flagPat->matcher(status); 2106 2107 // 2108 // The Perl tests reference several perl-isms, which are evaluated/substituted 2109 // in the test data. Not being perl, this must be done explicitly. Here 2110 // are string constants and REs for these constructs. 2111 // 2112 UnicodeString nulnulSrc("${nulnul}"); 2113 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 2114 nulnul = nulnul.unescape(); 2115 2116 UnicodeString ffffSrc("${ffff}"); 2117 UnicodeString ffff("\\uffff", -1, US_INV); 2118 ffff = ffff.unescape(); 2119 2120 // regexp for $-[0], $+[2], etc. 2121 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 2122 RegexMatcher *groupsMat = groupsPat->matcher(status); 2123 2124 // regexp for $0, $1, $2, etc. 2125 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 2126 RegexMatcher *cgMat = cgPat->matcher(status); 2127 2128 2129 // 2130 // Main Loop for the Perl Tests, runs once per line from the 2131 // test data file. 2132 // 2133 int32_t lineNum = 0; 2134 int32_t skippedUnimplementedCount = 0; 2135 while (lineMat->find()) { 2136 lineNum++; 2137 2138 // 2139 // Get a line, break it into its fields, do the Perl 2140 // variable substitutions. 2141 // 2142 UnicodeString line = lineMat->group(1, status); 2143 UnicodeString fields[7]; 2144 fieldPat->split(line, fields, 7, status); 2145 2146 flagMat->reset(fields[0]); 2147 flagMat->matches(status); 2148 UnicodeString pattern = flagMat->group(2, status); 2149 pattern.findAndReplace("${bang}", "!"); 2150 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 2151 pattern.findAndReplace(ffffSrc, ffff); 2152 2153 // 2154 // Identify patterns that include match flag settings, 2155 // split off the flags, remove the extra quotes. 2156 // 2157 UnicodeString flagStr = flagMat->group(3, status); 2158 if (U_FAILURE(status)) { 2159 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2160 return; 2161 } 2162 int32_t flags = 0; 2163 const UChar UChar_c = 0x63; // Char constants for the flag letters. 2164 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 2165 const UChar UChar_m = 0x6d; 2166 const UChar UChar_x = 0x78; 2167 const UChar UChar_y = 0x79; 2168 if (flagStr.indexOf(UChar_i) != -1) { 2169 flags |= UREGEX_CASE_INSENSITIVE; 2170 } 2171 if (flagStr.indexOf(UChar_m) != -1) { 2172 flags |= UREGEX_MULTILINE; 2173 } 2174 if (flagStr.indexOf(UChar_x) != -1) { 2175 flags |= UREGEX_COMMENTS; 2176 } 2177 2178 // 2179 // Compile the test pattern. 2180 // 2181 status = U_ZERO_ERROR; 2182 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 2183 if (status == U_REGEX_UNIMPLEMENTED) { 2184 // 2185 // Test of a feature that is planned for ICU, but not yet implemented. 2186 // skip the test. 2187 skippedUnimplementedCount++; 2188 delete testPat; 2189 status = U_ZERO_ERROR; 2190 continue; 2191 } 2192 2193 if (U_FAILURE(status)) { 2194 // Some tests are supposed to generate errors. 2195 // Only report an error for tests that are supposed to succeed. 2196 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 2197 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 2198 { 2199 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 2200 } 2201 status = U_ZERO_ERROR; 2202 delete testPat; 2203 continue; 2204 } 2205 2206 if (fields[2].indexOf(UChar_i) >= 0) { 2207 // ICU should skip this test. 2208 delete testPat; 2209 continue; 2210 } 2211 2212 if (fields[2].indexOf(UChar_c) >= 0) { 2213 // This pattern should have caused a compilation error, but didn't/ 2214 errln("line %d: Expected a pattern compile error, got success.", lineNum); 2215 delete testPat; 2216 continue; 2217 } 2218 2219 // 2220 // replace the Perl variables that appear in some of the 2221 // match data strings. 2222 // 2223 UnicodeString matchString = fields[1]; 2224 matchString.findAndReplace(nulnulSrc, nulnul); 2225 matchString.findAndReplace(ffffSrc, ffff); 2226 2227 // Replace any \n in the match string with an actual new-line char. 2228 // Don't do full unescape, as this unescapes more than Perl does, which 2229 // causes other spurious failures in the tests. 2230 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 2231 2232 2233 2234 // 2235 // Run the test, check for expected match/don't match result. 2236 // 2237 RegexMatcher *testMat = testPat->matcher(matchString, status); 2238 UBool found = testMat->find(); 2239 UBool expected = FALSE; 2240 if (fields[2].indexOf(UChar_y) >=0) { 2241 expected = TRUE; 2242 } 2243 if (expected != found) { 2244 errln("line %d: Expected %smatch, got %smatch", 2245 lineNum, expected?"":"no ", found?"":"no " ); 2246 continue; 2247 } 2248 2249 // Don't try to check expected results if there is no match. 2250 // (Some have stuff in the expected fields) 2251 if (!found) { 2252 delete testMat; 2253 delete testPat; 2254 continue; 2255 } 2256 2257 // 2258 // Interpret the Perl expression from the fourth field of the data file, 2259 // building up an ICU string from the results of the ICU match. 2260 // The Perl expression will contain references to the results of 2261 // a regex match, including the matched string, capture group strings, 2262 // group starting and ending indicies, etc. 2263 // 2264 UnicodeString resultString; 2265 UnicodeString perlExpr = fields[3]; 2266 groupsMat->reset(perlExpr); 2267 cgMat->reset(perlExpr); 2268 2269 while (perlExpr.length() > 0) { 2270 if (perlExpr.startsWith("$&")) { 2271 resultString.append(testMat->group(status)); 2272 perlExpr.remove(0, 2); 2273 } 2274 2275 else if (groupsMat->lookingAt(status)) { 2276 // $-[0] $+[2] etc. 2277 UnicodeString digitString = groupsMat->group(2, status); 2278 int32_t t = 0; 2279 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 2280 UnicodeString plusOrMinus = groupsMat->group(1, status); 2281 int32_t matchPosition; 2282 if (plusOrMinus.compare("+") == 0) { 2283 matchPosition = testMat->end(groupNum, status); 2284 } else { 2285 matchPosition = testMat->start(groupNum, status); 2286 } 2287 if (matchPosition != -1) { 2288 ICU_Utility::appendNumber(resultString, matchPosition); 2289 } 2290 perlExpr.remove(0, groupsMat->end(status)); 2291 } 2292 2293 else if (cgMat->lookingAt(status)) { 2294 // $1, $2, $3, etc. 2295 UnicodeString digitString = cgMat->group(1, status); 2296 int32_t t = 0; 2297 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 2298 if (U_SUCCESS(status)) { 2299 resultString.append(testMat->group(groupNum, status)); 2300 status = U_ZERO_ERROR; 2301 } 2302 perlExpr.remove(0, cgMat->end(status)); 2303 } 2304 2305 else if (perlExpr.startsWith("@-")) { 2306 int32_t i; 2307 for (i=0; i<=testMat->groupCount(); i++) { 2308 if (i>0) { 2309 resultString.append(" "); 2310 } 2311 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 2312 } 2313 perlExpr.remove(0, 2); 2314 } 2315 2316 else if (perlExpr.startsWith("@+")) { 2317 int32_t i; 2318 for (i=0; i<=testMat->groupCount(); i++) { 2319 if (i>0) { 2320 resultString.append(" "); 2321 } 2322 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 2323 } 2324 perlExpr.remove(0, 2); 2325 } 2326 2327 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 2328 // or as an escaped sequence (e.g. \n) 2329 if (perlExpr.length() > 1) { 2330 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 2331 } 2332 UChar c = perlExpr.charAt(0); 2333 switch (c) { 2334 case 'n': c = '\n'; break; 2335 // add any other escape sequences that show up in the test expected results. 2336 } 2337 resultString.append(c); 2338 perlExpr.remove(0, 1); 2339 } 2340 2341 else { 2342 // Any characters from the perl expression that we don't explicitly 2343 // recognize before here are assumed to be literals and copied 2344 // as-is to the expected results. 2345 resultString.append(perlExpr.charAt(0)); 2346 perlExpr.remove(0, 1); 2347 } 2348 2349 if (U_FAILURE(status)) { 2350 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 2351 break; 2352 } 2353 } 2354 2355 // 2356 // Expected Results Compare 2357 // 2358 UnicodeString expectedS(fields[4]); 2359 expectedS.findAndReplace(nulnulSrc, nulnul); 2360 expectedS.findAndReplace(ffffSrc, ffff); 2361 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 2362 2363 2364 if (expectedS.compare(resultString) != 0) { 2365 err("Line %d: Incorrect perl expression results.", lineNum); 2366 errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 2367 } 2368 2369 delete testMat; 2370 delete testPat; 2371 } 2372 2373 // 2374 // All done. Clean up allocated stuff. 2375 // 2376 delete cgMat; 2377 delete cgPat; 2378 2379 delete groupsMat; 2380 delete groupsPat; 2381 2382 delete flagMat; 2383 delete flagPat; 2384 2385 delete lineMat; 2386 delete linePat; 2387 2388 delete fieldPat; 2389 delete [] testData; 2390 2391 2392 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 2393 2394 } 2395 2396 2397 //-------------------------------------------------------------- 2398 // 2399 // Bug6149 Verify limits to heap expansion for backtrack stack. 2400 // Use this pattern, 2401 // "(a?){1,}" 2402 // The zero-length match will repeat forever. 2403 // (That this goes into a loop is another bug) 2404 // 2405 //--------------------------------------------------------------- 2406 void RegexTest::Bug6149() { 2407 UnicodeString pattern("(a?){1,}"); 2408 UnicodeString s("xyz"); 2409 uint32_t flags = 0; 2410 UErrorCode status = U_ZERO_ERROR; 2411 2412 RegexMatcher matcher(pattern, s, flags, status); 2413 UBool result = false; 2414 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 2415 REGEX_ASSERT(result == FALSE); 2416 } 2417 2418 2419 // 2420 // Callbacks() Test the callback function. 2421 // When set, callbacks occur periodically during matching operations, 2422 // giving the application code the ability to abort the operation 2423 // before it's normal completion. 2424 // 2425 2426 struct callBackContext { 2427 RegexTest *test; 2428 int32_t maxCalls; 2429 int32_t numCalls; 2430 int32_t lastSteps; 2431 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 2432 }; 2433 2434 U_CDECL_BEGIN 2435 static UBool U_CALLCONV 2436 testCallBackFn(const void *context, int32_t steps) { 2437 callBackContext *info = (callBackContext *)context; 2438 if (info->lastSteps+1 != steps) { 2439 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 2440 } 2441 info->lastSteps = steps; 2442 info->numCalls++; 2443 return (info->numCalls < info->maxCalls); 2444 } 2445 U_CDECL_END 2446 2447 void RegexTest::Callbacks() { 2448 { 2449 // Getter returns NULLs if no callback has been set 2450 2451 // The variables that the getter will fill in. 2452 // Init to non-null values so that the action of the getter can be seen. 2453 const void *returnedContext = &returnedContext; 2454 URegexMatchCallback *returnedFn = &testCallBackFn; 2455 2456 UErrorCode status = U_ZERO_ERROR; 2457 RegexMatcher matcher("x", 0, status); 2458 REGEX_CHECK_STATUS; 2459 matcher.getMatchCallback(returnedFn, returnedContext, status); 2460 REGEX_CHECK_STATUS; 2461 REGEX_ASSERT(returnedFn == NULL); 2462 REGEX_ASSERT(returnedContext == NULL); 2463 } 2464 2465 { 2466 // Set and Get work 2467 callBackContext cbInfo = {this, 0, 0, 0}; 2468 const void *returnedContext; 2469 URegexMatchCallback *returnedFn; 2470 UErrorCode status = U_ZERO_ERROR; 2471 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 2472 REGEX_CHECK_STATUS; 2473 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 2474 REGEX_CHECK_STATUS; 2475 matcher.getMatchCallback(returnedFn, returnedContext, status); 2476 REGEX_CHECK_STATUS; 2477 REGEX_ASSERT(returnedFn == testCallBackFn); 2478 REGEX_ASSERT(returnedContext == &cbInfo); 2479 2480 // A short-running match shouldn't invoke the callback 2481 status = U_ZERO_ERROR; 2482 cbInfo.reset(1); 2483 UnicodeString s = "xxx"; 2484 matcher.reset(s); 2485 REGEX_ASSERT(matcher.matches(status)); 2486 REGEX_CHECK_STATUS; 2487 REGEX_ASSERT(cbInfo.numCalls == 0); 2488 2489 // A medium-length match that runs long enough to invoke the 2490 // callback, but not so long that the callback aborts it. 2491 status = U_ZERO_ERROR; 2492 cbInfo.reset(4); 2493 s = "aaaaaaaaaaaaaaaaaaab"; 2494 matcher.reset(s); 2495 REGEX_ASSERT(matcher.matches(status)==FALSE); 2496 REGEX_CHECK_STATUS; 2497 REGEX_ASSERT(cbInfo.numCalls > 0); 2498 2499 // A longer running match that the callback function will abort. 2500 status = U_ZERO_ERROR; 2501 cbInfo.reset(4); 2502 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 2503 matcher.reset(s); 2504 REGEX_ASSERT(matcher.matches(status)==FALSE); 2505 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 2506 REGEX_ASSERT(cbInfo.numCalls == 4); 2507 } 2508 2509 2510 } 2511 2512 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 2513 2514