1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 // 8 // regextst.cpp 9 // 10 // ICU Regular Expressions test, part of intltest. 11 // 12 13 #include "intltest.h" 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16 #include "unicode/regex.h" 17 #include "unicode/uchar.h" 18 #include "unicode/ucnv.h" 19 #include "unicode/ustring.h" 20 #include "regextst.h" 21 #include "uvector.h" 22 #include "util.h" 23 #include <stdlib.h> 24 #include <string.h> 25 #include <stdio.h> 26 27 #define SUPPORT_MUTATING_INPUT_STRING 0 28 29 30 //--------------------------------------------------------------------------- 31 // 32 // Test class boilerplate 33 // 34 //--------------------------------------------------------------------------- 35 RegexTest::RegexTest() 36 { 37 } 38 39 40 RegexTest::~RegexTest() 41 { 42 } 43 44 45 46 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 47 { 48 if (exec) logln("TestSuite RegexTest: "); 49 switch (index) { 50 51 case 0: name = "Basic"; 52 if (exec) Basic(); 53 break; 54 case 1: name = "API_Match"; 55 if (exec) API_Match(); 56 break; 57 case 2: name = "API_Replace"; 58 if (exec) API_Replace(); 59 break; 60 case 3: name = "API_Pattern"; 61 if (exec) API_Pattern(); 62 break; 63 case 4: 64 #if !UCONFIG_NO_FILE_IO 65 name = "Extended"; 66 if (exec) Extended(); 67 #else 68 name = "skip"; 69 #endif 70 break; 71 case 5: name = "Errors"; 72 if (exec) Errors(); 73 break; 74 case 6: name = "PerlTests"; 75 if (exec) PerlTests(); 76 break; 77 case 7: name = "Callbacks"; 78 if (exec) Callbacks(); 79 break; 80 case 8: name = "Bug 6149"; 81 if (exec) Bug6149(); 82 break; 83 case 9: name = "UTextBasic"; 84 if (exec) UTextBasic(); 85 break; 86 case 10: name = "API_Match_UTF8"; 87 if (exec) API_Match_UTF8(); 88 
break; 89 case 11: name = "API_Replace_UTF8"; 90 if (exec) API_Replace_UTF8(); 91 break; 92 case 12: name = "API_Pattern_UTF8"; 93 if (exec) API_Pattern_UTF8(); 94 break; 95 case 13: name = "PerlTestsUTF8"; 96 if (exec) PerlTestsUTF8(); 97 break; 98 case 14: name = "PreAllocatedUTextCAPI"; 99 if (exec) PreAllocatedUTextCAPI(); 100 break; 101 case 15: name = "Bug 7651"; 102 if (exec) Bug7651(); 103 break; 104 105 default: name = ""; 106 break; //needed to end loop 107 } 108 } 109 110 111 //--------------------------------------------------------------------------- 112 // 113 // Error Checking / Reporting macros used in all of the tests. 114 // 115 //--------------------------------------------------------------------------- 116 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("RegexTest failure at line %d. status=%s", \ 117 __LINE__, u_errorName(status)); return;}} 118 119 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};} 120 121 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 122 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 123 __LINE__, u_errorName(errcode), u_errorName(status));};} 124 125 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 126 "RegexTest failure at line %d, from %d. 
status=%d\n",__LINE__, (line), status); }} 127 128 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 129 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 130 131 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 132 UErrorCode status = U_ZERO_ERROR; 133 UText expectedText = UTEXT_INITIALIZER; 134 utext_openUTF8(&expectedText, expected, -1, &status); 135 utext_setNativeIndex(actual, 0); 136 if (utext_compare(&expectedText, -1, actual, -1) != 0) { 137 char buf[201 /*21*/]; 138 char *bufPtr = buf; 139 UChar32 c = utext_next32From(actual, 0); 140 while (c != U_SENTINEL && bufPtr < buf+200/*20*/) { 141 if (0x20<c && c<0x7e) { 142 *bufPtr = c; 143 } else { 144 *bufPtr = '.'; 145 } 146 bufPtr++; 147 c = UTEXT_NEXT32(actual); 148 } 149 *bufPtr = 0; 150 151 errln("Failure at file %s, line %d, expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expected, utext_nativeLength(&expectedText), buf, utext_nativeLength(actual)); 152 } 153 utext_close(&expectedText); 154 } 155 156 #define REGEX_ASSERT_UTEXT(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 157 158 159 //--------------------------------------------------------------------------- 160 // 161 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 162 // for the LookingAt() and Match() functions. 163 // 164 // usage: 165 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 166 // 167 // The expected results are UBool - TRUE or FALSE. 168 // The input text is unescaped. The pattern is not. 
169 // 170 // 171 //--------------------------------------------------------------------------- 172 173 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 174 175 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 176 const UnicodeString pattern(pat, -1, US_INV); 177 const UnicodeString inputText(text, -1, US_INV); 178 UErrorCode status = U_ZERO_ERROR; 179 UParseError pe; 180 RegexPattern *REPattern = NULL; 181 RegexMatcher *REMatcher = NULL; 182 UBool retVal = TRUE; 183 184 UnicodeString patString(pat, -1, US_INV); 185 REPattern = RegexPattern::compile(patString, 0, pe, status); 186 if (U_FAILURE(status)) { 187 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 188 line, u_errorName(status)); 189 return FALSE; 190 } 191 if (line==376) { RegexPatternDump(REPattern);} 192 193 UnicodeString inputString(inputText); 194 UnicodeString unEscapedInput = inputString.unescape(); 195 REMatcher = REPattern->matcher(unEscapedInput, status); 196 if (U_FAILURE(status)) { 197 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 198 line, u_errorName(status)); 199 return FALSE; 200 } 201 202 UBool actualmatch; 203 actualmatch = REMatcher->lookingAt(status); 204 if (U_FAILURE(status)) { 205 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 206 line, u_errorName(status)); 207 retVal = FALSE; 208 } 209 if (actualmatch != looking) { 210 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 211 retVal = FALSE; 212 } 213 214 status = U_ZERO_ERROR; 215 actualmatch = REMatcher->matches(status); 216 if (U_FAILURE(status)) { 217 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 218 line, u_errorName(status)); 219 retVal = FALSE; 220 } 221 if (actualmatch != match) { 222 errln("RegexTest: wrong return from matches() at line %d.\n", line); 223 retVal = FALSE; 224 } 225 226 if (retVal == FALSE) { 227 RegexPatternDump(REPattern); 228 } 229 230 delete REPattern; 231 delete REMatcher; 232 return retVal; 233 } 234 235 236 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 237 UText pattern = UTEXT_INITIALIZER; 238 int32_t inputUTF8Length; 239 char *textChars = NULL; 240 UText inputText = UTEXT_INITIALIZER; 241 UErrorCode status = U_ZERO_ERROR; 242 UParseError pe; 243 RegexPattern *REPattern = NULL; 244 RegexMatcher *REMatcher = NULL; 245 UBool retVal = TRUE; 246 247 utext_openUTF8(&pattern, pat, -1, &status); 248 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 249 if (U_FAILURE(status)) { 250 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 251 line, u_errorName(status)); 252 return FALSE; 253 } 254 255 UnicodeString inputString(text, -1, US_INV); 256 UnicodeString unEscapedInput = inputString.unescape(); 257 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 258 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 259 260 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 261 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 262 // UTF-8 does not allow unpaired surrogates, so this could actually happen 263 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 264 return TRUE; // not a failure of the Regex engine 265 } 266 status = U_ZERO_ERROR; // buffer overflow 267 textChars = new char[inputUTF8Length+1]; 268 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 269 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 270 271 REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 272 if (U_FAILURE(status)) { 273 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 274 line, u_errorName(status)); 275 return FALSE; 276 } 277 278 UBool actualmatch; 279 actualmatch = REMatcher->lookingAt(status); 280 if (U_FAILURE(status)) { 281 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 282 line, u_errorName(status)); 283 retVal = FALSE; 284 } 285 if (actualmatch != looking) { 286 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 287 retVal = FALSE; 288 } 289 290 status = U_ZERO_ERROR; 291 actualmatch = REMatcher->matches(status); 292 if (U_FAILURE(status)) { 293 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 294 line, u_errorName(status)); 295 retVal = FALSE; 296 } 297 if (actualmatch != match) { 298 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 299 retVal = FALSE; 300 } 301 302 if (retVal == FALSE) { 303 RegexPatternDump(REPattern); 304 } 305 306 delete REPattern; 307 delete REMatcher; 308 utext_close(&inputText); 309 utext_close(&pattern); 310 delete[] textChars; 311 return retVal; 312 } 313 314 315 316 //--------------------------------------------------------------------------- 317 // 318 // REGEX_ERR Macro + invocation function to simplify writing tests 319 // regex tests for incorrect patterns 320 // 321 // usage: 322 // REGEX_ERR("pattern", expected error line, column, expected status); 323 // 324 //--------------------------------------------------------------------------- 325 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 326 327 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 328 UErrorCode expectedStatus, int32_t line) { 329 UnicodeString pattern(pat); 330 331 UErrorCode status = U_ZERO_ERROR; 332 UParseError pe; 333 RegexPattern *callerPattern = NULL; 334 335 // 336 // Compile the caller's pattern 337 // 338 UnicodeString patString(pat); 339 callerPattern = RegexPattern::compile(patString, 0, pe, status); 340 if (status != expectedStatus) { 341 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 342 } else { 343 if (status != U_ZERO_ERROR) { 344 if (pe.line != errLine || pe.offset != errCol) { 345 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 346 line, errLine, errCol, pe.line, pe.offset); 347 } 348 } 349 } 350 351 delete callerPattern; 352 353 // 354 // Compile again, using a UTF-8-based UText 355 // 356 UText patternText = UTEXT_INITIALIZER; 357 utext_openUTF8(&patternText, pat, -1, &status); 358 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 359 if (status != expectedStatus) { 360 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 361 } else { 362 if (status != U_ZERO_ERROR) { 363 if (pe.line != errLine || pe.offset != errCol) { 364 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 365 line, errLine, errCol, pe.line, pe.offset); 366 } 367 } 368 } 369 370 delete callerPattern; 371 utext_close(&patternText); 372 } 373 374 375 376 //--------------------------------------------------------------------------- 377 // 378 // Basic Check for basic functionality of regex pattern matching. 379 // Avoid the use of REGEX_FIND test macro, which has 380 // substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------
//
//  Every check below is a REGEX_TESTLM invocation of the form
//      REGEX_TESTLM(pattern, input, expected lookingAt() result, expected matches() result);
//  The input text is unescaped before matching; the pattern is not.
//
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//      UTextBasic   Check for quirks that are specific to the UText
//                   implementation.
530 // 531 //--------------------------------------------------------------------------- 532 void RegexTest::UTextBasic() { 533 UErrorCode status = U_ZERO_ERROR; 534 UText pattern = UTEXT_INITIALIZER; 535 utext_openUTF8(&pattern, "abc", -1, &status); 536 RegexMatcher matcher(&pattern, 0, status); 537 REGEX_CHECK_STATUS; 538 539 UText input = UTEXT_INITIALIZER; 540 utext_openUTF8(&input, "abc", -1, &status); 541 REGEX_CHECK_STATUS; 542 matcher.reset(&input); 543 REGEX_CHECK_STATUS; 544 REGEX_ASSERT_UTEXT("abc", matcher.inputText()); 545 546 matcher.reset(matcher.inputText()); 547 REGEX_CHECK_STATUS; 548 REGEX_ASSERT_UTEXT("abc", matcher.inputText()); 549 550 utext_close(&pattern); 551 utext_close(&input); 552 } 553 554 555 //--------------------------------------------------------------------------- 556 // 557 // API_Match Test that the API for class RegexMatcher 558 // is present and nominally working, but excluding functions 559 // implementing replace operations. 560 // 561 //--------------------------------------------------------------------------- 562 void RegexTest::API_Match() { 563 UParseError pe; 564 UErrorCode status=U_ZERO_ERROR; 565 int32_t flags = 0; 566 567 // 568 // Debug - slide failing test cases early 569 // 570 #if 0 571 { 572 } 573 return; 574 #endif 575 576 // 577 // Simple pattern compilation 578 // 579 { 580 UnicodeString re("abc"); 581 RegexPattern *pat2; 582 pat2 = RegexPattern::compile(re, flags, pe, status); 583 REGEX_CHECK_STATUS; 584 585 UnicodeString inStr1 = "abcdef this is a test"; 586 UnicodeString instr2 = "not abc"; 587 UnicodeString empty = ""; 588 589 590 // 591 // Matcher creation and reset. 
592 // 593 RegexMatcher *m1 = pat2->matcher(inStr1, status); 594 REGEX_CHECK_STATUS; 595 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 596 REGEX_ASSERT(m1->input() == inStr1); 597 m1->reset(instr2); 598 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 599 REGEX_ASSERT(m1->input() == instr2); 600 m1->reset(inStr1); 601 REGEX_ASSERT(m1->input() == inStr1); 602 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 603 m1->reset(empty); 604 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 605 REGEX_ASSERT(m1->input() == empty); 606 REGEX_ASSERT(&m1->pattern() == pat2); 607 608 // 609 // reset(pos, status) 610 // 611 m1->reset(inStr1); 612 m1->reset(4, status); 613 REGEX_CHECK_STATUS; 614 REGEX_ASSERT(m1->input() == inStr1); 615 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 616 617 m1->reset(-1, status); 618 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 619 status = U_ZERO_ERROR; 620 621 m1->reset(0, status); 622 REGEX_CHECK_STATUS; 623 status = U_ZERO_ERROR; 624 625 int32_t len = m1->input().length(); 626 m1->reset(len-1, status); 627 REGEX_CHECK_STATUS; 628 status = U_ZERO_ERROR; 629 630 m1->reset(len, status); 631 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 632 status = U_ZERO_ERROR; 633 634 // 635 // match(pos, status) 636 // 637 m1->reset(instr2); 638 REGEX_ASSERT(m1->matches(4, status) == TRUE); 639 m1->reset(); 640 REGEX_ASSERT(m1->matches(3, status) == FALSE); 641 m1->reset(); 642 REGEX_ASSERT(m1->matches(5, status) == FALSE); 643 REGEX_ASSERT(m1->matches(4, status) == TRUE); 644 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 645 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 646 647 // Match() at end of string should fail, but should not 648 // be an error. 649 status = U_ZERO_ERROR; 650 len = m1->input().length(); 651 REGEX_ASSERT(m1->matches(len, status) == FALSE); 652 REGEX_CHECK_STATUS; 653 654 // Match beyond end of string should fail with an error. 
655 status = U_ZERO_ERROR; 656 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 657 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 658 659 // Successful match at end of string. 660 { 661 status = U_ZERO_ERROR; 662 RegexMatcher m("A?", 0, status); // will match zero length string. 663 REGEX_CHECK_STATUS; 664 m.reset(inStr1); 665 len = inStr1.length(); 666 REGEX_ASSERT(m.matches(len, status) == TRUE); 667 REGEX_CHECK_STATUS; 668 m.reset(empty); 669 REGEX_ASSERT(m.matches(0, status) == TRUE); 670 REGEX_CHECK_STATUS; 671 } 672 673 674 // 675 // lookingAt(pos, status) 676 // 677 status = U_ZERO_ERROR; 678 m1->reset(instr2); // "not abc" 679 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 680 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 681 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 682 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 683 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 684 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 685 status = U_ZERO_ERROR; 686 len = m1->input().length(); 687 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 688 REGEX_CHECK_STATUS; 689 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 690 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 691 692 delete m1; 693 delete pat2; 694 } 695 696 697 // 698 // Capture Group. 
699 // RegexMatcher::start(); 700 // RegexMatcher::end(); 701 // RegexMatcher::groupCount(); 702 // 703 { 704 int32_t flags=0; 705 UParseError pe; 706 UErrorCode status=U_ZERO_ERROR; 707 708 UnicodeString re("01(23(45)67)(.*)"); 709 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 710 REGEX_CHECK_STATUS; 711 UnicodeString data = "0123456789"; 712 713 RegexMatcher *matcher = pat->matcher(data, status); 714 REGEX_CHECK_STATUS; 715 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 716 static const int32_t matchStarts[] = {0, 2, 4, 8}; 717 static const int32_t matchEnds[] = {10, 8, 6, 10}; 718 int32_t i; 719 for (i=0; i<4; i++) { 720 int32_t actualStart = matcher->start(i, status); 721 REGEX_CHECK_STATUS; 722 if (actualStart != matchStarts[i]) { 723 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 724 __LINE__, i, matchStarts[i], actualStart); 725 } 726 int32_t actualEnd = matcher->end(i, status); 727 REGEX_CHECK_STATUS; 728 if (actualEnd != matchEnds[i]) { 729 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 730 __LINE__, i, matchEnds[i], actualEnd); 731 } 732 } 733 734 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 735 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 736 737 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 738 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 739 matcher->reset(); 740 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 741 742 matcher->lookingAt(status); 743 REGEX_ASSERT(matcher->group(status) == "0123456789"); 744 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 745 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 746 REGEX_ASSERT(matcher->group(2, status) == "45" ); 747 REGEX_ASSERT(matcher->group(3, status) == "89" ); 748 REGEX_CHECK_STATUS; 749 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 750 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 751 matcher->reset(); 752 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 753 754 delete matcher; 755 delete pat; 756 757 } 758 759 // 760 // find 761 // 762 { 763 int32_t flags=0; 764 UParseError pe; 765 UErrorCode status=U_ZERO_ERROR; 766 767 UnicodeString re("abc"); 768 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 769 REGEX_CHECK_STATUS; 770 UnicodeString data = ".abc..abc...abc.."; 771 // 012345678901234567 772 773 RegexMatcher *matcher = pat->matcher(data, status); 774 REGEX_CHECK_STATUS; 775 REGEX_ASSERT(matcher->find()); 776 REGEX_ASSERT(matcher->start(status) == 1); 777 REGEX_ASSERT(matcher->find()); 778 REGEX_ASSERT(matcher->start(status) == 6); 779 REGEX_ASSERT(matcher->find()); 780 REGEX_ASSERT(matcher->start(status) == 12); 781 REGEX_ASSERT(matcher->find() == FALSE); 782 REGEX_ASSERT(matcher->find() == FALSE); 783 784 matcher->reset(); 785 REGEX_ASSERT(matcher->find()); 786 REGEX_ASSERT(matcher->start(status) == 1); 787 788 
REGEX_ASSERT(matcher->find(0, status)); 789 REGEX_ASSERT(matcher->start(status) == 1); 790 REGEX_ASSERT(matcher->find(1, status)); 791 REGEX_ASSERT(matcher->start(status) == 1); 792 REGEX_ASSERT(matcher->find(2, status)); 793 REGEX_ASSERT(matcher->start(status) == 6); 794 REGEX_ASSERT(matcher->find(12, status)); 795 REGEX_ASSERT(matcher->start(status) == 12); 796 REGEX_ASSERT(matcher->find(13, status) == FALSE); 797 REGEX_ASSERT(matcher->find(16, status) == FALSE); 798 REGEX_ASSERT(matcher->find(17, status) == FALSE); 799 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 800 801 status = U_ZERO_ERROR; 802 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 803 status = U_ZERO_ERROR; 804 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 805 806 REGEX_ASSERT(matcher->groupCount() == 0); 807 808 delete matcher; 809 delete pat; 810 } 811 812 813 // 814 // find, with \G in pattern (true if at the end of a previous match). 815 // 816 { 817 int32_t flags=0; 818 UParseError pe; 819 UErrorCode status=U_ZERO_ERROR; 820 821 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 822 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 823 REGEX_CHECK_STATUS; 824 UnicodeString data = ".abcabc.abc.."; 825 // 012345678901234567 826 827 RegexMatcher *matcher = pat->matcher(data, status); 828 REGEX_CHECK_STATUS; 829 REGEX_ASSERT(matcher->find()); 830 REGEX_ASSERT(matcher->start(status) == 0); 831 REGEX_ASSERT(matcher->start(1, status) == -1); 832 REGEX_ASSERT(matcher->start(2, status) == 1); 833 834 REGEX_ASSERT(matcher->find()); 835 REGEX_ASSERT(matcher->start(status) == 4); 836 REGEX_ASSERT(matcher->start(1, status) == 4); 837 REGEX_ASSERT(matcher->start(2, status) == -1); 838 REGEX_CHECK_STATUS; 839 840 delete matcher; 841 delete pat; 842 } 843 844 // 845 // find with zero length matches, match position should bump ahead 846 // to prevent loops. 
847 // 848 { 849 int32_t i; 850 UErrorCode status=U_ZERO_ERROR; 851 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 852 // using an always-true look-ahead. 853 REGEX_CHECK_STATUS; 854 UnicodeString s(" "); 855 m.reset(s); 856 for (i=0; ; i++) { 857 if (m.find() == FALSE) { 858 break; 859 } 860 REGEX_ASSERT(m.start(status) == i); 861 REGEX_ASSERT(m.end(status) == i); 862 } 863 REGEX_ASSERT(i==5); 864 865 // Check that the bump goes over surrogate pairs OK 866 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 867 s = s.unescape(); 868 m.reset(s); 869 for (i=0; ; i+=2) { 870 if (m.find() == FALSE) { 871 break; 872 } 873 REGEX_ASSERT(m.start(status) == i); 874 REGEX_ASSERT(m.end(status) == i); 875 } 876 REGEX_ASSERT(i==10); 877 } 878 { 879 // find() loop breaking test. 880 // with pattern of /.?/, should see a series of one char matches, then a single 881 // match of zero length at the end of the input string. 882 int32_t i; 883 UErrorCode status=U_ZERO_ERROR; 884 RegexMatcher m(".?", 0, status); 885 REGEX_CHECK_STATUS; 886 UnicodeString s(" "); 887 m.reset(s); 888 for (i=0; ; i++) { 889 if (m.find() == FALSE) { 890 break; 891 } 892 REGEX_ASSERT(m.start(status) == i); 893 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 894 } 895 REGEX_ASSERT(i==5); 896 } 897 898 899 // 900 // Matchers with no input string behave as if they had an empty input string. 
901 // 902 903 { 904 UErrorCode status = U_ZERO_ERROR; 905 RegexMatcher m(".?", 0, status); 906 REGEX_CHECK_STATUS; 907 REGEX_ASSERT(m.find()); 908 REGEX_ASSERT(m.start(status) == 0); 909 REGEX_ASSERT(m.input() == ""); 910 } 911 { 912 UErrorCode status = U_ZERO_ERROR; 913 RegexPattern *p = RegexPattern::compile(".", 0, status); 914 RegexMatcher *m = p->matcher(status); 915 REGEX_CHECK_STATUS; 916 917 REGEX_ASSERT(m->find() == FALSE); 918 REGEX_ASSERT(m->input() == ""); 919 delete m; 920 delete p; 921 } 922 923 // 924 // Regions 925 // 926 { 927 UErrorCode status = U_ZERO_ERROR; 928 UnicodeString testString("This is test data"); 929 RegexMatcher m(".*", testString, 0, status); 930 REGEX_CHECK_STATUS; 931 REGEX_ASSERT(m.regionStart() == 0); 932 REGEX_ASSERT(m.regionEnd() == testString.length()); 933 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 934 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 935 936 m.region(2,4, status); 937 REGEX_CHECK_STATUS; 938 REGEX_ASSERT(m.matches(status)); 939 REGEX_ASSERT(m.start(status)==2); 940 REGEX_ASSERT(m.end(status)==4); 941 REGEX_CHECK_STATUS; 942 943 m.reset(); 944 REGEX_ASSERT(m.regionStart() == 0); 945 REGEX_ASSERT(m.regionEnd() == testString.length()); 946 947 UnicodeString shorterString("short"); 948 m.reset(shorterString); 949 REGEX_ASSERT(m.regionStart() == 0); 950 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 951 952 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 953 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 954 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 955 REGEX_ASSERT(&m == &m.reset()); 956 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 957 958 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 959 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 960 REGEX_ASSERT(&m == &m.reset()); 961 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 962 963 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 964 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 965 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 966 
REGEX_ASSERT(&m == &m.reset()); 967 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 968 969 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 970 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 971 REGEX_ASSERT(&m == &m.reset()); 972 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 973 974 } 975 976 // 977 // hitEnd() and requireEnd() 978 // 979 { 980 UErrorCode status = U_ZERO_ERROR; 981 UnicodeString testString("aabb"); 982 RegexMatcher m1(".*", testString, 0, status); 983 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 984 REGEX_ASSERT(m1.hitEnd() == TRUE); 985 REGEX_ASSERT(m1.requireEnd() == FALSE); 986 REGEX_CHECK_STATUS; 987 988 status = U_ZERO_ERROR; 989 RegexMatcher m2("a*", testString, 0, status); 990 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 991 REGEX_ASSERT(m2.hitEnd() == FALSE); 992 REGEX_ASSERT(m2.requireEnd() == FALSE); 993 REGEX_CHECK_STATUS; 994 995 status = U_ZERO_ERROR; 996 RegexMatcher m3(".*$", testString, 0, status); 997 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 998 REGEX_ASSERT(m3.hitEnd() == TRUE); 999 REGEX_ASSERT(m3.requireEnd() == TRUE); 1000 REGEX_CHECK_STATUS; 1001 } 1002 1003 1004 // 1005 // Compilation error on reset with UChar * 1006 // These were a hazard that people were stumbling over with runtime errors. 1007 // Changed them to compiler errors by adding private methods that more closely 1008 // matched the incorrect use of the functions. 1009 // 1010 #if 0 1011 { 1012 UErrorCode status = U_ZERO_ERROR; 1013 UChar ucharString[20]; 1014 RegexMatcher m(".", 0, status); 1015 m.reset(ucharString); // should not compile. 1016 1017 RegexPattern *p = RegexPattern::compile(".", 0, status); 1018 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1019 1020 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1021 } 1022 #endif 1023 1024 // 1025 // Time Outs. 
1026 // Note: These tests will need to be changed when the regexp engine is 1027 // able to detect and cut short the exponential time behavior on 1028 // this type of match. 1029 // 1030 { 1031 UErrorCode status = U_ZERO_ERROR; 1032 // Enough 'a's in the string to cause the match to time out. 1033 // (Each on additonal 'a' doubles the time) 1034 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1035 RegexMatcher matcher("(a+)+b", testString, 0, status); 1036 REGEX_CHECK_STATUS; 1037 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1038 matcher.setTimeLimit(100, status); 1039 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1040 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1041 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1042 } 1043 { 1044 UErrorCode status = U_ZERO_ERROR; 1045 // Few enough 'a's to slip in under the time limit. 1046 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1047 RegexMatcher matcher("(a+)+b", testString, 0, status); 1048 REGEX_CHECK_STATUS; 1049 matcher.setTimeLimit(100, status); 1050 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1051 REGEX_CHECK_STATUS; 1052 } 1053 1054 // 1055 // Stack Limits 1056 // 1057 { 1058 UErrorCode status = U_ZERO_ERROR; 1059 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1060 1061 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1062 // of the '+', and makes the stack frames larger. 
1063 RegexMatcher matcher("(A)+A$", testString, 0, status); 1064 1065 // With the default stack, this match should fail to run 1066 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1067 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1068 1069 // With unlimited stack, it should run 1070 status = U_ZERO_ERROR; 1071 matcher.setStackLimit(0, status); 1072 REGEX_CHECK_STATUS; 1073 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1074 REGEX_CHECK_STATUS; 1075 REGEX_ASSERT(matcher.getStackLimit() == 0); 1076 1077 // With a limited stack, it the match should fail 1078 status = U_ZERO_ERROR; 1079 matcher.setStackLimit(10000, status); 1080 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1081 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1082 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1083 } 1084 1085 // A pattern that doesn't save state should work with 1086 // a minimal sized stack 1087 { 1088 UErrorCode status = U_ZERO_ERROR; 1089 UnicodeString testString = "abc"; 1090 RegexMatcher matcher("abc", testString, 0, status); 1091 REGEX_CHECK_STATUS; 1092 matcher.setStackLimit(30, status); 1093 REGEX_CHECK_STATUS; 1094 REGEX_ASSERT(matcher.matches(status) == TRUE); 1095 REGEX_CHECK_STATUS; 1096 REGEX_ASSERT(matcher.getStackLimit() == 30); 1097 1098 // Negative stack sizes should fail 1099 status = U_ZERO_ERROR; 1100 matcher.setStackLimit(1000, status); 1101 REGEX_CHECK_STATUS; 1102 matcher.setStackLimit(-1, status); 1103 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1104 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1105 } 1106 1107 1108 } 1109 1110 1111 1112 1113 1114 1115 //--------------------------------------------------------------------------- 1116 // 1117 // API_Replace API test for class RegexMatcher, testing the 1118 // Replace family of functions. 
1119 // 1120 //--------------------------------------------------------------------------- 1121 void RegexTest::API_Replace() { 1122 // 1123 // Replace 1124 // 1125 int32_t flags=0; 1126 UParseError pe; 1127 UErrorCode status=U_ZERO_ERROR; 1128 1129 UnicodeString re("abc"); 1130 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1131 REGEX_CHECK_STATUS; 1132 UnicodeString data = ".abc..abc...abc.."; 1133 // 012345678901234567 1134 RegexMatcher *matcher = pat->matcher(data, status); 1135 1136 // 1137 // Plain vanilla matches. 1138 // 1139 UnicodeString dest; 1140 dest = matcher->replaceFirst("yz", status); 1141 REGEX_CHECK_STATUS; 1142 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1143 1144 dest = matcher->replaceAll("yz", status); 1145 REGEX_CHECK_STATUS; 1146 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1147 1148 // 1149 // Plain vanilla non-matches. 1150 // 1151 UnicodeString d2 = ".abx..abx...abx.."; 1152 matcher->reset(d2); 1153 dest = matcher->replaceFirst("yz", status); 1154 REGEX_CHECK_STATUS; 1155 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1156 1157 dest = matcher->replaceAll("yz", status); 1158 REGEX_CHECK_STATUS; 1159 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1160 1161 // 1162 // Empty source string 1163 // 1164 UnicodeString d3 = ""; 1165 matcher->reset(d3); 1166 dest = matcher->replaceFirst("yz", status); 1167 REGEX_CHECK_STATUS; 1168 REGEX_ASSERT(dest == ""); 1169 1170 dest = matcher->replaceAll("yz", status); 1171 REGEX_CHECK_STATUS; 1172 REGEX_ASSERT(dest == ""); 1173 1174 // 1175 // Empty substitution string 1176 // 1177 matcher->reset(data); // ".abc..abc...abc.." 
1178 dest = matcher->replaceFirst("", status); 1179 REGEX_CHECK_STATUS; 1180 REGEX_ASSERT(dest == "...abc...abc.."); 1181 1182 dest = matcher->replaceAll("", status); 1183 REGEX_CHECK_STATUS; 1184 REGEX_ASSERT(dest == "........"); 1185 1186 // 1187 // match whole string 1188 // 1189 UnicodeString d4 = "abc"; 1190 matcher->reset(d4); 1191 dest = matcher->replaceFirst("xyz", status); 1192 REGEX_CHECK_STATUS; 1193 REGEX_ASSERT(dest == "xyz"); 1194 1195 dest = matcher->replaceAll("xyz", status); 1196 REGEX_CHECK_STATUS; 1197 REGEX_ASSERT(dest == "xyz"); 1198 1199 // 1200 // Capture Group, simple case 1201 // 1202 UnicodeString re2("a(..)"); 1203 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1204 REGEX_CHECK_STATUS; 1205 UnicodeString d5 = "abcdefg"; 1206 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1207 REGEX_CHECK_STATUS; 1208 dest = matcher2->replaceFirst("$1$1", status); 1209 REGEX_CHECK_STATUS; 1210 REGEX_ASSERT(dest == "bcbcdefg"); 1211 1212 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1213 REGEX_CHECK_STATUS; 1214 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1215 1216 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1217 REGEX_CHECK_STATUS; 1218 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1219 1220 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1221 replacement = replacement.unescape(); 1222 dest = matcher2->replaceFirst(replacement, status); 1223 REGEX_CHECK_STATUS; 1224 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1225 1226 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1227 1228 1229 // 1230 // Replacement String with \u hex escapes 1231 // 1232 { 1233 UnicodeString src = "abc 1 abc 2 abc 3"; 1234 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1235 matcher->reset(src); 1236 UnicodeString result = 
matcher->replaceAll(substitute, status); 1237 REGEX_CHECK_STATUS; 1238 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1239 } 1240 { 1241 UnicodeString src = "abc !"; 1242 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1243 matcher->reset(src); 1244 UnicodeString result = matcher->replaceAll(substitute, status); 1245 REGEX_CHECK_STATUS; 1246 UnicodeString expected = UnicodeString("--"); 1247 expected.append((UChar32)0x10000); 1248 expected.append("-- !"); 1249 REGEX_ASSERT(result == expected); 1250 } 1251 // TODO: need more through testing of capture substitutions. 1252 1253 // Bug 4057 1254 // 1255 { 1256 status = U_ZERO_ERROR; 1257 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1258 RegexMatcher m("ss(.*?)ee", 0, status); 1259 REGEX_CHECK_STATUS; 1260 UnicodeString result; 1261 1262 // Multiple finds do NOT bump up the previous appendReplacement postion. 1263 m.reset(s); 1264 m.find(); 1265 m.find(); 1266 m.appendReplacement(result, "ooh", status); 1267 REGEX_CHECK_STATUS; 1268 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1269 1270 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1271 status = U_ZERO_ERROR; 1272 result.truncate(0); 1273 m.reset(10, status); 1274 m.find(); 1275 m.find(); 1276 m.appendReplacement(result, "ooh", status); 1277 REGEX_CHECK_STATUS; 1278 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1279 1280 // find() at interior of string, appendReplacemnt still starts at beginning. 
1281 status = U_ZERO_ERROR; 1282 result.truncate(0); 1283 m.reset(); 1284 m.find(10, status); 1285 m.find(); 1286 m.appendReplacement(result, "ooh", status); 1287 REGEX_CHECK_STATUS; 1288 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1289 1290 m.appendTail(result); 1291 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1292 1293 } 1294 1295 delete matcher2; 1296 delete pat2; 1297 delete matcher; 1298 delete pat; 1299 } 1300 1301 1302 //--------------------------------------------------------------------------- 1303 // 1304 // API_Pattern Test that the API for class RegexPattern is 1305 // present and nominally working. 1306 // 1307 //--------------------------------------------------------------------------- 1308 void RegexTest::API_Pattern() { 1309 RegexPattern pata; // Test default constructor to not crash. 1310 RegexPattern patb; 1311 1312 REGEX_ASSERT(pata == patb); 1313 REGEX_ASSERT(pata == pata); 1314 1315 UnicodeString re1("abc[a-l][m-z]"); 1316 UnicodeString re2("def"); 1317 UErrorCode status = U_ZERO_ERROR; 1318 UParseError pe; 1319 1320 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1321 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1322 REGEX_CHECK_STATUS; 1323 REGEX_ASSERT(*pat1 == *pat1); 1324 REGEX_ASSERT(*pat1 != pata); 1325 1326 // Assign 1327 patb = *pat1; 1328 REGEX_ASSERT(patb == *pat1); 1329 1330 // Copy Construct 1331 RegexPattern patc(*pat1); 1332 REGEX_ASSERT(patc == *pat1); 1333 REGEX_ASSERT(patb == patc); 1334 REGEX_ASSERT(pat1 != pat2); 1335 patb = *pat2; 1336 REGEX_ASSERT(patb != patc); 1337 REGEX_ASSERT(patb == *pat2); 1338 1339 // Compile with no flags. 
1340 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1341 REGEX_ASSERT(*pat1a == *pat1); 1342 1343 REGEX_ASSERT(pat1a->flags() == 0); 1344 1345 // Compile with different flags should be not equal 1346 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1347 REGEX_CHECK_STATUS; 1348 1349 REGEX_ASSERT(*pat1b != *pat1a); 1350 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1351 REGEX_ASSERT(pat1a->flags() == 0); 1352 delete pat1b; 1353 1354 // clone 1355 RegexPattern *pat1c = pat1->clone(); 1356 REGEX_ASSERT(*pat1c == *pat1); 1357 REGEX_ASSERT(*pat1c != *pat2); 1358 1359 delete pat1c; 1360 delete pat1a; 1361 delete pat1; 1362 delete pat2; 1363 1364 1365 // 1366 // Verify that a matcher created from a cloned pattern works. 1367 // (Jitterbug 3423) 1368 // 1369 { 1370 UErrorCode status = U_ZERO_ERROR; 1371 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1372 RegexPattern *pClone = pSource->clone(); 1373 delete pSource; 1374 RegexMatcher *mFromClone = pClone->matcher(status); 1375 REGEX_CHECK_STATUS; 1376 UnicodeString s = "Hello World"; 1377 mFromClone->reset(s); 1378 REGEX_ASSERT(mFromClone->find() == TRUE); 1379 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1380 REGEX_ASSERT(mFromClone->find() == TRUE); 1381 REGEX_ASSERT(mFromClone->group(status) == "World"); 1382 REGEX_ASSERT(mFromClone->find() == FALSE); 1383 delete mFromClone; 1384 delete pClone; 1385 } 1386 1387 // 1388 // matches convenience API 1389 // 1390 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1391 REGEX_CHECK_STATUS; 1392 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1393 REGEX_CHECK_STATUS; 1394 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1395 REGEX_CHECK_STATUS; 1396 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1397 REGEX_CHECK_STATUS; 1398 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1399 REGEX_CHECK_STATUS; 1400 status = U_INDEX_OUTOFBOUNDS_ERROR; 1401 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1402 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1403 1404 1405 // 1406 // Split() 1407 // 1408 status = U_ZERO_ERROR; 1409 pat1 = RegexPattern::compile(" +", pe, status); 1410 REGEX_CHECK_STATUS; 1411 UnicodeString fields[10]; 1412 1413 int32_t n; 1414 n = pat1->split("Now is the time", fields, 10, status); 1415 REGEX_CHECK_STATUS; 1416 REGEX_ASSERT(n==4); 1417 REGEX_ASSERT(fields[0]=="Now"); 1418 REGEX_ASSERT(fields[1]=="is"); 1419 REGEX_ASSERT(fields[2]=="the"); 1420 REGEX_ASSERT(fields[3]=="time"); 1421 REGEX_ASSERT(fields[4]==""); 1422 1423 n = pat1->split("Now is the time", fields, 2, status); 1424 REGEX_CHECK_STATUS; 1425 REGEX_ASSERT(n==2); 1426 REGEX_ASSERT(fields[0]=="Now"); 1427 REGEX_ASSERT(fields[1]=="is the time"); 1428 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1429 1430 fields[1] = "*"; 1431 status = U_ZERO_ERROR; 1432 n = pat1->split("Now is the time", fields, 1, status); 1433 REGEX_CHECK_STATUS; 1434 REGEX_ASSERT(n==1); 1435 REGEX_ASSERT(fields[0]=="Now is the time"); 1436 REGEX_ASSERT(fields[1]=="*"); 1437 status = U_ZERO_ERROR; 1438 1439 n = pat1->split(" Now is the time ", fields, 10, status); 1440 REGEX_CHECK_STATUS; 1441 REGEX_ASSERT(n==5); 1442 REGEX_ASSERT(fields[0]==""); 1443 REGEX_ASSERT(fields[1]=="Now"); 1444 REGEX_ASSERT(fields[2]=="is"); 1445 REGEX_ASSERT(fields[3]=="the"); 1446 REGEX_ASSERT(fields[4]=="time"); 1447 REGEX_ASSERT(fields[5]==""); 1448 1449 n = pat1->split(" ", fields, 10, status); 1450 REGEX_CHECK_STATUS; 1451 REGEX_ASSERT(n==1); 1452 REGEX_ASSERT(fields[0]==""); 1453 1454 fields[0] = "foo"; 1455 n = pat1->split("", fields, 10, status); 1456 REGEX_CHECK_STATUS; 1457 REGEX_ASSERT(n==0); 1458 REGEX_ASSERT(fields[0]=="foo"); 1459 1460 delete pat1; 1461 1462 // split, with a 
pattern with (capture) 1463 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1464 REGEX_CHECK_STATUS; 1465 1466 status = U_ZERO_ERROR; 1467 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1468 REGEX_CHECK_STATUS; 1469 REGEX_ASSERT(n==6); 1470 REGEX_ASSERT(fields[0]==""); 1471 REGEX_ASSERT(fields[1]=="a"); 1472 REGEX_ASSERT(fields[2]=="Now is "); 1473 REGEX_ASSERT(fields[3]=="b"); 1474 REGEX_ASSERT(fields[4]=="the time"); 1475 REGEX_ASSERT(fields[5]=="c"); 1476 REGEX_ASSERT(fields[6]==""); 1477 REGEX_ASSERT(status==U_ZERO_ERROR); 1478 1479 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1480 REGEX_CHECK_STATUS; 1481 REGEX_ASSERT(n==6); 1482 REGEX_ASSERT(fields[0]==" "); 1483 REGEX_ASSERT(fields[1]=="a"); 1484 REGEX_ASSERT(fields[2]=="Now is "); 1485 REGEX_ASSERT(fields[3]=="b"); 1486 REGEX_ASSERT(fields[4]=="the time"); 1487 REGEX_ASSERT(fields[5]=="c"); 1488 REGEX_ASSERT(fields[6]==""); 1489 1490 status = U_ZERO_ERROR; 1491 fields[6] = "foo"; 1492 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1493 REGEX_CHECK_STATUS; 1494 REGEX_ASSERT(n==6); 1495 REGEX_ASSERT(fields[0]==" "); 1496 REGEX_ASSERT(fields[1]=="a"); 1497 REGEX_ASSERT(fields[2]=="Now is "); 1498 REGEX_ASSERT(fields[3]=="b"); 1499 REGEX_ASSERT(fields[4]=="the time"); 1500 REGEX_ASSERT(fields[5]=="c"); 1501 REGEX_ASSERT(fields[6]=="foo"); 1502 1503 status = U_ZERO_ERROR; 1504 fields[5] = "foo"; 1505 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1506 REGEX_CHECK_STATUS; 1507 REGEX_ASSERT(n==5); 1508 REGEX_ASSERT(fields[0]==" "); 1509 REGEX_ASSERT(fields[1]=="a"); 1510 REGEX_ASSERT(fields[2]=="Now is "); 1511 REGEX_ASSERT(fields[3]=="b"); 1512 REGEX_ASSERT(fields[4]=="the time<c>"); 1513 REGEX_ASSERT(fields[5]=="foo"); 1514 1515 status = U_ZERO_ERROR; 1516 fields[5] = "foo"; 1517 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1518 REGEX_CHECK_STATUS; 1519 REGEX_ASSERT(n==5); 1520 
REGEX_ASSERT(fields[0]==" "); 1521 REGEX_ASSERT(fields[1]=="a"); 1522 REGEX_ASSERT(fields[2]=="Now is "); 1523 REGEX_ASSERT(fields[3]=="b"); 1524 REGEX_ASSERT(fields[4]=="the time"); 1525 REGEX_ASSERT(fields[5]=="foo"); 1526 1527 status = U_ZERO_ERROR; 1528 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1529 REGEX_CHECK_STATUS; 1530 REGEX_ASSERT(n==4); 1531 REGEX_ASSERT(fields[0]==" "); 1532 REGEX_ASSERT(fields[1]=="a"); 1533 REGEX_ASSERT(fields[2]=="Now is "); 1534 REGEX_ASSERT(fields[3]=="the time<c>"); 1535 status = U_ZERO_ERROR; 1536 delete pat1; 1537 1538 pat1 = RegexPattern::compile("([-,])", pe, status); 1539 REGEX_CHECK_STATUS; 1540 n = pat1->split("1-10,20", fields, 10, status); 1541 REGEX_CHECK_STATUS; 1542 REGEX_ASSERT(n==5); 1543 REGEX_ASSERT(fields[0]=="1"); 1544 REGEX_ASSERT(fields[1]=="-"); 1545 REGEX_ASSERT(fields[2]=="10"); 1546 REGEX_ASSERT(fields[3]==","); 1547 REGEX_ASSERT(fields[4]=="20"); 1548 delete pat1; 1549 1550 1551 // 1552 // RegexPattern::pattern() 1553 // 1554 pat1 = new RegexPattern(); 1555 REGEX_ASSERT(pat1->pattern() == ""); 1556 delete pat1; 1557 1558 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1559 REGEX_CHECK_STATUS; 1560 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1561 delete pat1; 1562 1563 1564 // 1565 // classID functions 1566 // 1567 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1568 REGEX_CHECK_STATUS; 1569 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1570 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1571 UnicodeString Hello("Hello, world."); 1572 RegexMatcher *m = pat1->matcher(Hello, status); 1573 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1574 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1575 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1576 delete m; 1577 delete pat1; 1578 1579 } 1580 1581 //--------------------------------------------------------------------------- 1582 // 
1583 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1584 // is present and working, but excluding functions 1585 // implementing replace operations. 1586 // 1587 //--------------------------------------------------------------------------- 1588 void RegexTest::API_Match_UTF8() { 1589 UParseError pe; 1590 UErrorCode status=U_ZERO_ERROR; 1591 int32_t flags = 0; 1592 1593 // 1594 // Debug - slide failing test cases early 1595 // 1596 #if 0 1597 { 1598 } 1599 return; 1600 #endif 1601 1602 // 1603 // Simple pattern compilation 1604 // 1605 { 1606 UText re = UTEXT_INITIALIZER; 1607 utext_openUTF8(&re, "abc", -1, &status); 1608 RegexPattern *pat2; 1609 pat2 = RegexPattern::compile(&re, flags, pe, status); 1610 REGEX_CHECK_STATUS; 1611 1612 UText input1 = UTEXT_INITIALIZER; 1613 UText input2 = UTEXT_INITIALIZER; 1614 UText empty = UTEXT_INITIALIZER; 1615 utext_openUTF8(&input1, "abcdef this is a test", -1, &status); 1616 utext_openUTF8(&input2, "not abc", -1, &status); 1617 utext_openUChars(&empty, NULL, 0, &status); 1618 1619 int32_t input1Len = strlen("abcdef this is a test"); 1620 int32_t input2Len = strlen("not abc"); 1621 1622 1623 // 1624 // Matcher creation and reset. 
1625 // 1626 RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status); 1627 REGEX_CHECK_STATUS; 1628 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1629 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); 1630 m1->reset(&input2); 1631 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1632 REGEX_ASSERT_UTEXT("not abc", m1->inputText()); 1633 m1->reset(&input1); 1634 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); 1635 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1636 m1->reset(&empty); 1637 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1638 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1639 1640 // 1641 // reset(pos, status) 1642 // 1643 m1->reset(&input1); 1644 m1->reset(4, status); 1645 REGEX_CHECK_STATUS; 1646 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); 1647 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1648 1649 m1->reset(-1, status); 1650 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1651 status = U_ZERO_ERROR; 1652 1653 m1->reset(0, status); 1654 REGEX_CHECK_STATUS; 1655 status = U_ZERO_ERROR; 1656 1657 m1->reset(input1Len-1, status); 1658 REGEX_CHECK_STATUS; 1659 status = U_ZERO_ERROR; 1660 1661 m1->reset(input1Len, status); 1662 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1663 status = U_ZERO_ERROR; 1664 1665 // 1666 // match(pos, status) 1667 // 1668 m1->reset(&input2); 1669 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1670 m1->reset(); 1671 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1672 m1->reset(); 1673 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1674 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1675 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1676 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1677 1678 // Match() at end of string should fail, but should not 1679 // be an error. 1680 status = U_ZERO_ERROR; 1681 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1682 REGEX_CHECK_STATUS; 1683 1684 // Match beyond end of string should fail with an error. 
1685 status = U_ZERO_ERROR; 1686 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1687 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1688 1689 // Successful match at end of string. 1690 { 1691 status = U_ZERO_ERROR; 1692 RegexMatcher m("A?", 0, status); // will match zero length string. 1693 REGEX_CHECK_STATUS; 1694 m.reset(&input1); 1695 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1696 REGEX_CHECK_STATUS; 1697 m.reset(&empty); 1698 REGEX_ASSERT(m.matches(0, status) == TRUE); 1699 REGEX_CHECK_STATUS; 1700 } 1701 1702 1703 // 1704 // lookingAt(pos, status) 1705 // 1706 status = U_ZERO_ERROR; 1707 m1->reset(&input2); // "not abc" 1708 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1709 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1710 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1711 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1712 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1713 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1714 status = U_ZERO_ERROR; 1715 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1716 REGEX_CHECK_STATUS; 1717 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1718 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1719 1720 delete m1; 1721 delete pat2; 1722 1723 utext_close(&re); 1724 utext_close(&input1); 1725 utext_close(&input2); 1726 utext_close(&empty); 1727 } 1728 1729 1730 // 1731 // Capture Group. 
1732 // RegexMatcher::start(); 1733 // RegexMatcher::end(); 1734 // RegexMatcher::groupCount(); 1735 // 1736 { 1737 int32_t flags=0; 1738 UParseError pe; 1739 UErrorCode status=U_ZERO_ERROR; 1740 UText re=UTEXT_INITIALIZER; 1741 utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status); 1742 1743 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1744 REGEX_CHECK_STATUS; 1745 1746 UText input = UTEXT_INITIALIZER; 1747 utext_openUTF8(&input, "0123456789", -1, &status); 1748 1749 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 1750 REGEX_CHECK_STATUS; 1751 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 1752 static const int32_t matchStarts[] = {0, 2, 4, 8}; 1753 static const int32_t matchEnds[] = {10, 8, 6, 10}; 1754 int32_t i; 1755 for (i=0; i<4; i++) { 1756 int32_t actualStart = matcher->start(i, status); 1757 REGEX_CHECK_STATUS; 1758 if (actualStart != matchStarts[i]) { 1759 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 1760 __LINE__, i, matchStarts[i], actualStart); 1761 } 1762 int32_t actualEnd = matcher->end(i, status); 1763 REGEX_CHECK_STATUS; 1764 if (actualEnd != matchEnds[i]) { 1765 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 1766 __LINE__, i, matchEnds[i], actualEnd); 1767 } 1768 } 1769 1770 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 1771 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 1772 1773 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1774 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 1775 matcher->reset(); 1776 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 1777 1778 matcher->lookingAt(status); 1779 1780 UnicodeString dest; 1781 UText destText = UTEXT_INITIALIZER; 1782 utext_openUnicodeString(&destText, &dest, &status); 1783 UText *result; 1784 1785 result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status); 1786 REGEX_CHECK_STATUS; 1787 REGEX_ASSERT_UTEXT("0123456789", result); 1788 utext_close(result); 1789 result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status); 1790 REGEX_CHECK_STATUS; 1791 REGEX_ASSERT(result == &destText); 1792 REGEX_ASSERT_UTEXT("0123456789", result); 1793 1794 result = matcher->group(0, NULL, status); 1795 REGEX_CHECK_STATUS; 1796 REGEX_ASSERT_UTEXT("0123456789", result); 1797 utext_close(result); 1798 result = matcher->group(0, &destText, status); 1799 REGEX_CHECK_STATUS; 1800 REGEX_ASSERT(result == &destText); 1801 REGEX_ASSERT_UTEXT("0123456789", result); 1802 1803 result = matcher->group(1, NULL, status); 1804 REGEX_CHECK_STATUS; 1805 REGEX_ASSERT_UTEXT("234567", result); 1806 utext_close(result); 1807 result = matcher->group(1, &destText, status); 1808 REGEX_CHECK_STATUS; 1809 REGEX_ASSERT(result == &destText); 1810 REGEX_ASSERT_UTEXT("234567", result); 1811 1812 result = matcher->group(2, NULL, status); 1813 REGEX_CHECK_STATUS; 1814 REGEX_ASSERT_UTEXT("45", result); 1815 utext_close(result); 1816 result = matcher->group(2, &destText, status); 1817 REGEX_CHECK_STATUS; 1818 REGEX_ASSERT(result == &destText); 1819 REGEX_ASSERT_UTEXT("45", result); 1820 1821 result 
= matcher->group(3, NULL, status); 1822 REGEX_CHECK_STATUS; 1823 REGEX_ASSERT_UTEXT("89", result); 1824 utext_close(result); 1825 result = matcher->group(3, &destText, status); 1826 REGEX_CHECK_STATUS; 1827 REGEX_ASSERT(result == &destText); 1828 REGEX_ASSERT_UTEXT("89", result); 1829 1830 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1831 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 1832 matcher->reset(); 1833 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 1834 1835 delete matcher; 1836 delete pat; 1837 1838 utext_close(&destText); 1839 utext_close(&input); 1840 utext_close(&re); 1841 } 1842 1843 // 1844 // find 1845 // 1846 { 1847 int32_t flags=0; 1848 UParseError pe; 1849 UErrorCode status=U_ZERO_ERROR; 1850 UText re=UTEXT_INITIALIZER; 1851 utext_openUTF8(&re, "abc", -1, &status); 1852 1853 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1854 REGEX_CHECK_STATUS; 1855 UText input = UTEXT_INITIALIZER; 1856 utext_openUTF8(&input, ".abc..abc...abc..", -1, &status); 1857 // 012345678901234567 1858 1859 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 1860 REGEX_CHECK_STATUS; 1861 REGEX_ASSERT(matcher->find()); 1862 REGEX_ASSERT(matcher->start(status) == 1); 1863 REGEX_ASSERT(matcher->find()); 1864 REGEX_ASSERT(matcher->start(status) == 6); 1865 REGEX_ASSERT(matcher->find()); 1866 REGEX_ASSERT(matcher->start(status) == 12); 1867 REGEX_ASSERT(matcher->find() == FALSE); 1868 REGEX_ASSERT(matcher->find() == FALSE); 1869 1870 matcher->reset(); 1871 REGEX_ASSERT(matcher->find()); 1872 REGEX_ASSERT(matcher->start(status) == 1); 1873 1874 REGEX_ASSERT(matcher->find(0, status)); 1875 REGEX_ASSERT(matcher->start(status) == 1); 1876 REGEX_ASSERT(matcher->find(1, status)); 1877 REGEX_ASSERT(matcher->start(status) == 1); 1878 REGEX_ASSERT(matcher->find(2, status)); 1879 REGEX_ASSERT(matcher->start(status) == 6); 1880 
REGEX_ASSERT(matcher->find(12, status)); 1881 REGEX_ASSERT(matcher->start(status) == 12); 1882 REGEX_ASSERT(matcher->find(13, status) == FALSE); 1883 REGEX_ASSERT(matcher->find(16, status) == FALSE); 1884 REGEX_ASSERT(matcher->find(17, status) == FALSE); 1885 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1886 1887 status = U_ZERO_ERROR; 1888 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1889 status = U_ZERO_ERROR; 1890 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 1891 1892 REGEX_ASSERT(matcher->groupCount() == 0); 1893 1894 delete matcher; 1895 delete pat; 1896 1897 utext_close(&input); 1898 utext_close(&re); 1899 } 1900 1901 1902 // 1903 // find, with \G in pattern (true if at the end of a previous match). 1904 // 1905 { 1906 int32_t flags=0; 1907 UParseError pe; 1908 UErrorCode status=U_ZERO_ERROR; 1909 UText re=UTEXT_INITIALIZER; 1910 utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status); 1911 1912 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1913 1914 REGEX_CHECK_STATUS; 1915 UText input = UTEXT_INITIALIZER; 1916 utext_openUTF8(&input, ".abcabc.abc..", -1, &status); 1917 // 012345678901234567 1918 1919 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 1920 REGEX_CHECK_STATUS; 1921 REGEX_ASSERT(matcher->find()); 1922 REGEX_ASSERT(matcher->start(status) == 0); 1923 REGEX_ASSERT(matcher->start(1, status) == -1); 1924 REGEX_ASSERT(matcher->start(2, status) == 1); 1925 1926 REGEX_ASSERT(matcher->find()); 1927 REGEX_ASSERT(matcher->start(status) == 4); 1928 REGEX_ASSERT(matcher->start(1, status) == 4); 1929 REGEX_ASSERT(matcher->start(2, status) == -1); 1930 REGEX_CHECK_STATUS; 1931 1932 delete matcher; 1933 delete pat; 1934 1935 utext_close(&input); 1936 utext_close(&re); 1937 } 1938 1939 // 1940 // find with zero length matches, match position should bump ahead 1941 // to prevent loops. 
1942 // 1943 { 1944 int32_t i; 1945 UErrorCode status=U_ZERO_ERROR; 1946 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1947 // using an always-true look-ahead. 1948 REGEX_CHECK_STATUS; 1949 UText s = UTEXT_INITIALIZER; 1950 utext_openUTF8(&s, " ", -1, &status); 1951 m.reset(&s); 1952 for (i=0; ; i++) { 1953 if (m.find() == FALSE) { 1954 break; 1955 } 1956 REGEX_ASSERT(m.start(status) == i); 1957 REGEX_ASSERT(m.end(status) == i); 1958 } 1959 REGEX_ASSERT(i==5); 1960 1961 // Check that the bump goes over characters outside the BMP OK 1962 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 1963 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 1964 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 1965 m.reset(&s); 1966 for (i=0; ; i+=2) { 1967 if (m.find() == FALSE) { 1968 break; 1969 } 1970 REGEX_ASSERT(m.start(status) == i); 1971 REGEX_ASSERT(m.end(status) == i); 1972 } 1973 REGEX_ASSERT(i==10); 1974 1975 utext_close(&s); 1976 } 1977 { 1978 // find() loop breaking test. 1979 // with pattern of /.?/, should see a series of one char matches, then a single 1980 // match of zero length at the end of the input string. 1981 int32_t i; 1982 UErrorCode status=U_ZERO_ERROR; 1983 RegexMatcher m(".?", 0, status); 1984 REGEX_CHECK_STATUS; 1985 UText s = UTEXT_INITIALIZER; 1986 utext_openUTF8(&s, " ", -1, &status); 1987 m.reset(&s); 1988 for (i=0; ; i++) { 1989 if (m.find() == FALSE) { 1990 break; 1991 } 1992 REGEX_ASSERT(m.start(status) == i); 1993 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1994 } 1995 REGEX_ASSERT(i==5); 1996 1997 utext_close(&s); 1998 } 1999 2000 2001 // 2002 // Matchers with no input string behave as if they had an empty input string. 
2003 // 2004 2005 { 2006 UErrorCode status = U_ZERO_ERROR; 2007 RegexMatcher m(".?", 0, status); 2008 REGEX_CHECK_STATUS; 2009 REGEX_ASSERT(m.find()); 2010 REGEX_ASSERT(m.start(status) == 0); 2011 REGEX_ASSERT(m.input() == ""); 2012 } 2013 { 2014 UErrorCode status = U_ZERO_ERROR; 2015 RegexPattern *p = RegexPattern::compile(".", 0, status); 2016 RegexMatcher *m = p->matcher(status); 2017 REGEX_CHECK_STATUS; 2018 2019 REGEX_ASSERT(m->find() == FALSE); 2020 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2021 delete m; 2022 delete p; 2023 } 2024 2025 // 2026 // Regions 2027 // 2028 { 2029 UErrorCode status = U_ZERO_ERROR; 2030 UText testPattern = UTEXT_INITIALIZER; 2031 UText testText = UTEXT_INITIALIZER; 2032 utext_openUTF8(&testPattern, ".*", -1, &status); 2033 utext_openUTF8(&testText, "This is test data", -1, &status); 2034 2035 RegexMatcher m(&testPattern, &testText, 0, status); 2036 REGEX_CHECK_STATUS; 2037 REGEX_ASSERT(m.regionStart() == 0); 2038 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2039 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2040 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2041 2042 m.region(2,4, status); 2043 REGEX_CHECK_STATUS; 2044 REGEX_ASSERT(m.matches(status)); 2045 REGEX_ASSERT(m.start(status)==2); 2046 REGEX_ASSERT(m.end(status)==4); 2047 REGEX_CHECK_STATUS; 2048 2049 m.reset(); 2050 REGEX_ASSERT(m.regionStart() == 0); 2051 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2052 2053 utext_openUTF8(&testText, "short", -1, &status); 2054 m.reset(&testText); 2055 REGEX_ASSERT(m.regionStart() == 0); 2056 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2057 2058 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2059 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2060 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2061 REGEX_ASSERT(&m == &m.reset()); 2062 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2063 2064 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2065 
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2066 REGEX_ASSERT(&m == &m.reset()); 2067 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2068 2069 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2070 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2071 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2072 REGEX_ASSERT(&m == &m.reset()); 2073 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2074 2075 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2076 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2077 REGEX_ASSERT(&m == &m.reset()); 2078 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2079 2080 utext_close(&testText); 2081 utext_close(&testPattern); 2082 } 2083 2084 // 2085 // hitEnd() and requireEnd() 2086 // 2087 { 2088 UErrorCode status = U_ZERO_ERROR; 2089 UText testPattern = UTEXT_INITIALIZER; 2090 UText testText = UTEXT_INITIALIZER; 2091 utext_openUTF8(&testPattern, ".*", -1, &status); 2092 utext_openUTF8(&testText, "aabb", -1, &status); 2093 2094 RegexMatcher m1(&testPattern, &testText, 0, status); 2095 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2096 REGEX_ASSERT(m1.hitEnd() == TRUE); 2097 REGEX_ASSERT(m1.requireEnd() == FALSE); 2098 REGEX_CHECK_STATUS; 2099 2100 status = U_ZERO_ERROR; 2101 utext_openUTF8(&testPattern, "a*", -1, &status); 2102 RegexMatcher m2(&testPattern, &testText, 0, status); 2103 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2104 REGEX_ASSERT(m2.hitEnd() == FALSE); 2105 REGEX_ASSERT(m2.requireEnd() == FALSE); 2106 REGEX_CHECK_STATUS; 2107 2108 status = U_ZERO_ERROR; 2109 utext_openUTF8(&testPattern, ".*$", -1, &status); 2110 RegexMatcher m3(&testPattern, &testText, 0, status); 2111 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2112 REGEX_ASSERT(m3.hitEnd() == TRUE); 2113 REGEX_ASSERT(m3.requireEnd() == TRUE); 2114 REGEX_CHECK_STATUS; 2115 2116 utext_close(&testText); 2117 utext_close(&testPattern); 2118 } 2119 } 2120 2121 2122 //--------------------------------------------------------------------------- 2123 // 2124 // 
API_Replace_UTF8 API test for class RegexMatcher, testing the 2125 // Replace family of functions. 2126 // 2127 //--------------------------------------------------------------------------- 2128 void RegexTest::API_Replace_UTF8() { 2129 // 2130 // Replace 2131 // 2132 int32_t flags=0; 2133 UParseError pe; 2134 UErrorCode status=U_ZERO_ERROR; 2135 2136 UText re=UTEXT_INITIALIZER; 2137 utext_openUTF8(&re, "abc", -1, &status); 2138 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2139 REGEX_CHECK_STATUS; 2140 2141 char data[] = ".abc..abc...abc.."; 2142 // 012345678901234567 2143 UText dataText = UTEXT_INITIALIZER; 2144 utext_openUTF8(&dataText, data, -1, &status); 2145 RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status); 2146 2147 // 2148 // Plain vanilla matches. 2149 // 2150 UnicodeString dest; 2151 UText destText = UTEXT_INITIALIZER; 2152 utext_openUnicodeString(&destText, &dest, &status); 2153 UText *result; 2154 2155 UText replText = UTEXT_INITIALIZER; 2156 2157 utext_openUTF8(&replText, "yz", -1, &status); 2158 result = matcher->replaceFirst(&replText, NULL, status); 2159 REGEX_CHECK_STATUS; 2160 REGEX_ASSERT_UTEXT(".yz..abc...abc..", result); 2161 utext_close(result); 2162 result = matcher->replaceFirst(&replText, &destText, status); 2163 REGEX_CHECK_STATUS; 2164 REGEX_ASSERT(result == &destText); 2165 REGEX_ASSERT_UTEXT(".yz..abc...abc..", result); 2166 2167 result = matcher->replaceAll(&replText, NULL, status); 2168 REGEX_CHECK_STATUS; 2169 REGEX_ASSERT_UTEXT(".yz..yz...yz..", result); 2170 utext_close(result); 2171 2172 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2173 result = matcher->replaceAll(&replText, &destText, status); 2174 REGEX_CHECK_STATUS; 2175 REGEX_ASSERT(result == &destText); 2176 REGEX_ASSERT_UTEXT(".yz..yz...yz..", result); 2177 2178 // 2179 // Plain vanilla non-matches. 
2180 // 2181 utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status); 2182 matcher->reset(&dataText); 2183 2184 result = matcher->replaceFirst(&replText, NULL, status); 2185 REGEX_CHECK_STATUS; 2186 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); 2187 utext_close(result); 2188 result = matcher->replaceFirst(&replText, &destText, status); 2189 REGEX_CHECK_STATUS; 2190 REGEX_ASSERT(result == &destText); 2191 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); 2192 2193 result = matcher->replaceAll(&replText, NULL, status); 2194 REGEX_CHECK_STATUS; 2195 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); 2196 utext_close(result); 2197 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2198 result = matcher->replaceAll(&replText, &destText, status); 2199 REGEX_CHECK_STATUS; 2200 REGEX_ASSERT(result == &destText); 2201 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); 2202 2203 // 2204 // Empty source string 2205 // 2206 utext_openUTF8(&dataText, NULL, 0, &status); 2207 matcher->reset(&dataText); 2208 2209 result = matcher->replaceFirst(&replText, NULL, status); 2210 REGEX_CHECK_STATUS; 2211 REGEX_ASSERT_UTEXT("", result); 2212 utext_close(result); 2213 result = matcher->replaceFirst(&replText, &destText, status); 2214 REGEX_CHECK_STATUS; 2215 REGEX_ASSERT(result == &destText); 2216 REGEX_ASSERT_UTEXT("", result); 2217 2218 result = matcher->replaceAll(&replText, NULL, status); 2219 REGEX_CHECK_STATUS; 2220 REGEX_ASSERT_UTEXT("", result); 2221 utext_close(result); 2222 result = matcher->replaceAll(&replText, &destText, status); 2223 REGEX_CHECK_STATUS; 2224 REGEX_ASSERT(result == &destText); 2225 REGEX_ASSERT_UTEXT("", result); 2226 2227 // 2228 // Empty substitution string 2229 // 2230 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 
2231 matcher->reset(&dataText); 2232 2233 utext_openUTF8(&replText, NULL, 0, &status); 2234 result = matcher->replaceFirst(&replText, NULL, status); 2235 REGEX_CHECK_STATUS; 2236 REGEX_ASSERT_UTEXT("...abc...abc..", result); 2237 utext_close(result); 2238 result = matcher->replaceFirst(&replText, &destText, status); 2239 REGEX_CHECK_STATUS; 2240 REGEX_ASSERT(result == &destText); 2241 REGEX_ASSERT_UTEXT("...abc...abc..", result); 2242 2243 result = matcher->replaceAll(&replText, NULL, status); 2244 REGEX_CHECK_STATUS; 2245 REGEX_ASSERT_UTEXT("........", result); 2246 utext_close(result); 2247 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2248 result = matcher->replaceAll(&replText, &destText, status); 2249 REGEX_CHECK_STATUS; 2250 REGEX_ASSERT(result == &destText); 2251 REGEX_ASSERT_UTEXT("........", result); 2252 2253 // 2254 // match whole string 2255 // 2256 utext_openUTF8(&dataText, "abc", -1, &status); 2257 matcher->reset(&dataText); 2258 2259 utext_openUTF8(&replText, "xyz", -1, &status); 2260 result = matcher->replaceFirst(&replText, NULL, status); 2261 REGEX_CHECK_STATUS; 2262 REGEX_ASSERT_UTEXT("xyz", result); 2263 utext_close(result); 2264 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2265 result = matcher->replaceFirst(&replText, &destText, status); 2266 REGEX_CHECK_STATUS; 2267 REGEX_ASSERT(result == &destText); 2268 REGEX_ASSERT_UTEXT("xyz", result); 2269 2270 result = matcher->replaceAll(&replText, NULL, status); 2271 REGEX_CHECK_STATUS; 2272 REGEX_ASSERT_UTEXT("xyz", result); 2273 utext_close(result); 2274 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2275 result = matcher->replaceAll(&replText, &destText, status); 2276 REGEX_CHECK_STATUS; 2277 REGEX_ASSERT(result == &destText); 2278 REGEX_ASSERT_UTEXT("xyz", result); 2279 2280 // 2281 // Capture Group, simple case 2282 // 2283 utext_openUTF8(&re, "a(..)", -1, &status); 2284 RegexPattern *pat2 = 
RegexPattern::compile(&re, flags, pe, status); 2285 REGEX_CHECK_STATUS; 2286 2287 utext_openUTF8(&dataText, "abcdefg", -1, &status); 2288 RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status); 2289 REGEX_CHECK_STATUS; 2290 2291 utext_openUTF8(&replText, "$1$1", -1, &status); 2292 result = matcher2->replaceFirst(&replText, NULL, status); 2293 REGEX_CHECK_STATUS; 2294 REGEX_ASSERT_UTEXT("bcbcdefg", result); 2295 utext_close(result); 2296 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2297 result = matcher2->replaceFirst(&replText, &destText, status); 2298 REGEX_CHECK_STATUS; 2299 REGEX_ASSERT(result == &destText); 2300 REGEX_ASSERT_UTEXT("bcbcdefg", result); 2301 2302 utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status); 2303 result = matcher2->replaceFirst(&replText, NULL, status); 2304 REGEX_CHECK_STATUS; 2305 REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result); 2306 utext_close(result); 2307 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2308 result = matcher2->replaceFirst(&replText, &destText, status); 2309 REGEX_CHECK_STATUS; 2310 REGEX_ASSERT(result == &destText); 2311 REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result); 2312 2313 utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status); 2314 result = matcher2->replaceFirst(&replText, NULL, status); 2315 REGEX_CHECK_STATUS; 2316 REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result); 2317 utext_close(result); 2318 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2319 result = matcher2->replaceFirst(&replText, &destText, status); 2320 REGEX_CHECK_STATUS; 2321 REGEX_ASSERT(result == &destText); 2322 REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result); 2323 2324 unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2325 // 012345678901234567890123456 2326 
supplDigitChars[22] = 0xF0; 2327 supplDigitChars[23] = 0x9D; 2328 supplDigitChars[24] = 0x9F; 2329 supplDigitChars[25] = 0x8F; 2330 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2331 2332 result = matcher2->replaceFirst(&replText, NULL, status); 2333 REGEX_CHECK_STATUS; 2334 REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result); 2335 utext_close(result); 2336 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2337 result = matcher2->replaceFirst(&replText, &destText, status); 2338 REGEX_CHECK_STATUS; 2339 REGEX_ASSERT(result == &destText); 2340 REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result); 2341 2342 utext_openUTF8(&replText, "bad capture group number $5...", -1, &status); 2343 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2344 // REGEX_ASSERT_UTEXT("abcdefg", result); 2345 utext_close(result); 2346 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2347 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2348 REGEX_ASSERT(result == &destText); 2349 // REGEX_ASSERT_UTEXT("abcdefg", result); 2350 2351 // 2352 // Replacement String with \u hex escapes 2353 // 2354 { 2355 utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status); 2356 utext_openUTF8(&replText, "--\\u0043--", -1, &status); 2357 matcher->reset(&dataText); 2358 2359 result = matcher->replaceAll(&replText, NULL, status); 2360 REGEX_CHECK_STATUS; 2361 REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result); 2362 utext_close(result); 2363 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2364 result = matcher->replaceAll(&replText, &destText, status); 2365 REGEX_CHECK_STATUS; 2366 REGEX_ASSERT(result == &destText); 2367 REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result); 2368 } 2369 { 2370 utext_openUTF8(&dataText, "abc !", -1, &status); 2371 utext_openUTF8(&replText, 
"--\\U00010000--", -1, &status); 2372 matcher->reset(&dataText); 2373 2374 unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A" 2375 // 0123456789 2376 expected[2] = 0xF0; 2377 expected[3] = 0x90; 2378 expected[4] = 0x80; 2379 expected[5] = 0x80; 2380 2381 result = matcher->replaceAll(&replText, NULL, status); 2382 REGEX_CHECK_STATUS; 2383 REGEX_ASSERT_UTEXT((char *)expected, result); 2384 utext_close(result); 2385 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2386 result = matcher->replaceAll(&replText, &destText, status); 2387 REGEX_CHECK_STATUS; 2388 REGEX_ASSERT(result == &destText); 2389 REGEX_ASSERT_UTEXT((char *)expected, result); 2390 } 2391 // TODO: need more through testing of capture substitutions. 2392 2393 // Bug 4057 2394 // 2395 { 2396 status = U_ZERO_ERROR; 2397 utext_openUTF8(&re, "ss(.*?)ee", -1, &status); 2398 utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status); 2399 utext_openUTF8(&replText, "ooh", -1, &status); 2400 2401 RegexMatcher m(&re, 0, status); 2402 REGEX_CHECK_STATUS; 2403 2404 UnicodeString result; 2405 UText resultText = UTEXT_INITIALIZER; 2406 utext_openUnicodeString(&resultText, &result, &status); 2407 2408 // Multiple finds do NOT bump up the previous appendReplacement postion. 2409 m.reset(&dataText); 2410 m.find(); 2411 m.find(); 2412 m.appendReplacement(&resultText, &replText, status); 2413 REGEX_CHECK_STATUS; 2414 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); 2415 2416 // After a reset into the interior of a string, appendReplacement still starts at beginning. 
2417 status = U_ZERO_ERROR; 2418 result.truncate(0); 2419 utext_openUnicodeString(&resultText, &result, &status); 2420 m.reset(10, status); 2421 m.find(); 2422 m.find(); 2423 m.appendReplacement(&resultText, &replText, status); 2424 REGEX_CHECK_STATUS; 2425 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); 2426 2427 // find() at interior of string, appendReplacement still starts at beginning. 2428 status = U_ZERO_ERROR; 2429 result.truncate(0); 2430 utext_openUnicodeString(&resultText, &result, &status); 2431 m.reset(); 2432 m.find(10, status); 2433 m.find(); 2434 m.appendReplacement(&resultText, &replText, status); 2435 REGEX_CHECK_STATUS; 2436 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); 2437 2438 m.appendTail(&resultText); 2439 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText); 2440 2441 utext_close(&resultText); 2442 } 2443 2444 delete matcher2; 2445 delete pat2; 2446 delete matcher; 2447 delete pat; 2448 2449 utext_close(&dataText); 2450 utext_close(&replText); 2451 utext_close(&destText); 2452 utext_close(&re); 2453 } 2454 2455 2456 //--------------------------------------------------------------------------- 2457 // 2458 // API_Pattern_UTF8 Test that the API for class RegexPattern is 2459 // present and nominally working. 2460 // 2461 //--------------------------------------------------------------------------- 2462 void RegexTest::API_Pattern_UTF8() { 2463 RegexPattern pata; // Test default constructor to not crash. 
2464 RegexPattern patb; 2465 2466 REGEX_ASSERT(pata == patb); 2467 REGEX_ASSERT(pata == pata); 2468 2469 UText re1 = UTEXT_INITIALIZER; 2470 UText re2 = UTEXT_INITIALIZER; 2471 UErrorCode status = U_ZERO_ERROR; 2472 UParseError pe; 2473 2474 utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status); 2475 utext_openUTF8(&re2, "def", -1, &status); 2476 2477 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2478 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2479 REGEX_CHECK_STATUS; 2480 REGEX_ASSERT(*pat1 == *pat1); 2481 REGEX_ASSERT(*pat1 != pata); 2482 2483 // Assign 2484 patb = *pat1; 2485 REGEX_ASSERT(patb == *pat1); 2486 2487 // Copy Construct 2488 RegexPattern patc(*pat1); 2489 REGEX_ASSERT(patc == *pat1); 2490 REGEX_ASSERT(patb == patc); 2491 REGEX_ASSERT(pat1 != pat2); 2492 patb = *pat2; 2493 REGEX_ASSERT(patb != patc); 2494 REGEX_ASSERT(patb == *pat2); 2495 2496 // Compile with no flags. 2497 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2498 REGEX_ASSERT(*pat1a == *pat1); 2499 2500 REGEX_ASSERT(pat1a->flags() == 0); 2501 2502 // Compile with different flags should be not equal 2503 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2504 REGEX_CHECK_STATUS; 2505 2506 REGEX_ASSERT(*pat1b != *pat1a); 2507 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2508 REGEX_ASSERT(pat1a->flags() == 0); 2509 delete pat1b; 2510 2511 // clone 2512 RegexPattern *pat1c = pat1->clone(); 2513 REGEX_ASSERT(*pat1c == *pat1); 2514 REGEX_ASSERT(*pat1c != *pat2); 2515 2516 delete pat1c; 2517 delete pat1a; 2518 delete pat1; 2519 delete pat2; 2520 2521 utext_close(&re1); 2522 utext_close(&re2); 2523 2524 2525 // 2526 // Verify that a matcher created from a cloned pattern works. 
2527 // (Jitterbug 3423) 2528 // 2529 { 2530 UErrorCode status = U_ZERO_ERROR; 2531 UText pattern = UTEXT_INITIALIZER; 2532 utext_openUTF8(&pattern, "\\p{L}+", -1, &status); 2533 2534 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2535 RegexPattern *pClone = pSource->clone(); 2536 delete pSource; 2537 RegexMatcher *mFromClone = pClone->matcher(status); 2538 REGEX_CHECK_STATUS; 2539 2540 UText input = UTEXT_INITIALIZER; 2541 utext_openUTF8(&input, "Hello World", -1, &status); 2542 mFromClone->reset(&input); 2543 REGEX_ASSERT(mFromClone->find() == TRUE); 2544 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2545 REGEX_ASSERT(mFromClone->find() == TRUE); 2546 REGEX_ASSERT(mFromClone->group(status) == "World"); 2547 REGEX_ASSERT(mFromClone->find() == FALSE); 2548 delete mFromClone; 2549 delete pClone; 2550 2551 utext_close(&input); 2552 utext_close(&pattern); 2553 } 2554 2555 // 2556 // matches convenience API 2557 // 2558 { 2559 UErrorCode status = U_ZERO_ERROR; 2560 UText pattern = UTEXT_INITIALIZER; 2561 UText input = UTEXT_INITIALIZER; 2562 2563 utext_openUTF8(&input, "random input", -1, &status); 2564 2565 utext_openUTF8(&pattern, ".*", -1, &status); 2566 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2567 REGEX_CHECK_STATUS; 2568 2569 utext_openUTF8(&pattern, "abc", -1, &status); 2570 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2571 REGEX_CHECK_STATUS; 2572 2573 utext_openUTF8(&pattern, ".*nput", -1, &status); 2574 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2575 REGEX_CHECK_STATUS; 2576 2577 utext_openUTF8(&pattern, "random input", -1, &status); 2578 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2579 REGEX_CHECK_STATUS; 2580 2581 utext_openUTF8(&pattern, ".*u", -1, &status); 2582 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2583 REGEX_CHECK_STATUS; 2584 2585 
utext_openUTF8(&input, "abc", -1, &status); 2586 utext_openUTF8(&pattern, "abc", -1, &status); 2587 status = U_INDEX_OUTOFBOUNDS_ERROR; 2588 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2589 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2590 2591 utext_close(&input); 2592 utext_close(&pattern); 2593 } 2594 2595 2596 // 2597 // Split() 2598 // 2599 status = U_ZERO_ERROR; 2600 utext_openUTF8(&re1, " +", -1, &status); 2601 pat1 = RegexPattern::compile(&re1, pe, status); 2602 REGEX_CHECK_STATUS; 2603 UnicodeString fields[10]; 2604 2605 int32_t n; 2606 n = pat1->split("Now is the time", fields, 10, status); 2607 REGEX_CHECK_STATUS; 2608 REGEX_ASSERT(n==4); 2609 REGEX_ASSERT(fields[0]=="Now"); 2610 REGEX_ASSERT(fields[1]=="is"); 2611 REGEX_ASSERT(fields[2]=="the"); 2612 REGEX_ASSERT(fields[3]=="time"); 2613 REGEX_ASSERT(fields[4]==""); 2614 2615 n = pat1->split("Now is the time", fields, 2, status); 2616 REGEX_CHECK_STATUS; 2617 REGEX_ASSERT(n==2); 2618 REGEX_ASSERT(fields[0]=="Now"); 2619 REGEX_ASSERT(fields[1]=="is the time"); 2620 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2621 2622 fields[1] = "*"; 2623 status = U_ZERO_ERROR; 2624 n = pat1->split("Now is the time", fields, 1, status); 2625 REGEX_CHECK_STATUS; 2626 REGEX_ASSERT(n==1); 2627 REGEX_ASSERT(fields[0]=="Now is the time"); 2628 REGEX_ASSERT(fields[1]=="*"); 2629 status = U_ZERO_ERROR; 2630 2631 n = pat1->split(" Now is the time ", fields, 10, status); 2632 REGEX_CHECK_STATUS; 2633 REGEX_ASSERT(n==5); 2634 REGEX_ASSERT(fields[0]==""); 2635 REGEX_ASSERT(fields[1]=="Now"); 2636 REGEX_ASSERT(fields[2]=="is"); 2637 REGEX_ASSERT(fields[3]=="the"); 2638 REGEX_ASSERT(fields[4]=="time"); 2639 REGEX_ASSERT(fields[5]==""); 2640 2641 n = pat1->split(" ", fields, 10, status); 2642 REGEX_CHECK_STATUS; 2643 REGEX_ASSERT(n==1); 2644 REGEX_ASSERT(fields[0]==""); 2645 2646 fields[0] = "foo"; 2647 n = pat1->split("", fields, 10, status); 2648 REGEX_CHECK_STATUS; 2649 
REGEX_ASSERT(n==0); 2650 REGEX_ASSERT(fields[0]=="foo"); 2651 2652 delete pat1; 2653 2654 // split, with a pattern with (capture) 2655 utext_openUTF8(&re1, "<(\\w*)>", -1, &status); 2656 pat1 = RegexPattern::compile(&re1, pe, status); 2657 REGEX_CHECK_STATUS; 2658 2659 status = U_ZERO_ERROR; 2660 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 2661 REGEX_CHECK_STATUS; 2662 REGEX_ASSERT(n==6); 2663 REGEX_ASSERT(fields[0]==""); 2664 REGEX_ASSERT(fields[1]=="a"); 2665 REGEX_ASSERT(fields[2]=="Now is "); 2666 REGEX_ASSERT(fields[3]=="b"); 2667 REGEX_ASSERT(fields[4]=="the time"); 2668 REGEX_ASSERT(fields[5]=="c"); 2669 REGEX_ASSERT(fields[6]==""); 2670 REGEX_ASSERT(status==U_ZERO_ERROR); 2671 2672 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 2673 REGEX_CHECK_STATUS; 2674 REGEX_ASSERT(n==6); 2675 REGEX_ASSERT(fields[0]==" "); 2676 REGEX_ASSERT(fields[1]=="a"); 2677 REGEX_ASSERT(fields[2]=="Now is "); 2678 REGEX_ASSERT(fields[3]=="b"); 2679 REGEX_ASSERT(fields[4]=="the time"); 2680 REGEX_ASSERT(fields[5]=="c"); 2681 REGEX_ASSERT(fields[6]==""); 2682 2683 status = U_ZERO_ERROR; 2684 fields[6] = "foo"; 2685 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 2686 REGEX_CHECK_STATUS; 2687 REGEX_ASSERT(n==6); 2688 REGEX_ASSERT(fields[0]==" "); 2689 REGEX_ASSERT(fields[1]=="a"); 2690 REGEX_ASSERT(fields[2]=="Now is "); 2691 REGEX_ASSERT(fields[3]=="b"); 2692 REGEX_ASSERT(fields[4]=="the time"); 2693 REGEX_ASSERT(fields[5]=="c"); 2694 REGEX_ASSERT(fields[6]=="foo"); 2695 2696 status = U_ZERO_ERROR; 2697 fields[5] = "foo"; 2698 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 2699 REGEX_CHECK_STATUS; 2700 REGEX_ASSERT(n==5); 2701 REGEX_ASSERT(fields[0]==" "); 2702 REGEX_ASSERT(fields[1]=="a"); 2703 REGEX_ASSERT(fields[2]=="Now is "); 2704 REGEX_ASSERT(fields[3]=="b"); 2705 REGEX_ASSERT(fields[4]=="the time<c>"); 2706 REGEX_ASSERT(fields[5]=="foo"); 2707 2708 status = U_ZERO_ERROR; 2709 fields[5] = "foo"; 2710 n 
= pat1->split(" <a>Now is <b>the time", fields, 5, status); 2711 REGEX_CHECK_STATUS; 2712 REGEX_ASSERT(n==5); 2713 REGEX_ASSERT(fields[0]==" "); 2714 REGEX_ASSERT(fields[1]=="a"); 2715 REGEX_ASSERT(fields[2]=="Now is "); 2716 REGEX_ASSERT(fields[3]=="b"); 2717 REGEX_ASSERT(fields[4]=="the time"); 2718 REGEX_ASSERT(fields[5]=="foo"); 2719 2720 status = U_ZERO_ERROR; 2721 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 2722 REGEX_CHECK_STATUS; 2723 REGEX_ASSERT(n==4); 2724 REGEX_ASSERT(fields[0]==" "); 2725 REGEX_ASSERT(fields[1]=="a"); 2726 REGEX_ASSERT(fields[2]=="Now is "); 2727 REGEX_ASSERT(fields[3]=="the time<c>"); 2728 status = U_ZERO_ERROR; 2729 delete pat1; 2730 2731 utext_openUTF8(&re1, "([-,])", -1, &status); 2732 pat1 = RegexPattern::compile(&re1, pe, status); 2733 REGEX_CHECK_STATUS; 2734 n = pat1->split("1-10,20", fields, 10, status); 2735 REGEX_CHECK_STATUS; 2736 REGEX_ASSERT(n==5); 2737 REGEX_ASSERT(fields[0]=="1"); 2738 REGEX_ASSERT(fields[1]=="-"); 2739 REGEX_ASSERT(fields[2]=="10"); 2740 REGEX_ASSERT(fields[3]==","); 2741 REGEX_ASSERT(fields[4]=="20"); 2742 delete pat1; 2743 2744 2745 // 2746 // RegexPattern::pattern() and patternText() 2747 // 2748 pat1 = new RegexPattern(); 2749 REGEX_ASSERT(pat1->pattern() == ""); 2750 REGEX_ASSERT_UTEXT("", pat1->patternText()); 2751 delete pat1; 2752 2753 utext_openUTF8(&re1, "(Hello, world)*", -1, &status); 2754 pat1 = RegexPattern::compile(&re1, pe, status); 2755 REGEX_CHECK_STATUS; 2756 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 2757 REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText()); 2758 delete pat1; 2759 2760 utext_close(&re1); 2761 } 2762 2763 2764 //--------------------------------------------------------------------------- 2765 // 2766 // Extended A more thorough check for features of regex patterns 2767 // The test cases are in a separate data file, 2768 // source/tests/testdata/regextst.txt 2769 // A description of the test data format is included in that file. 
2770 // 2771 //--------------------------------------------------------------------------- 2772 2773 const char * 2774 RegexTest::getPath(char buffer[2048], const char *filename) { 2775 UErrorCode status=U_ZERO_ERROR; 2776 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2777 if (U_FAILURE(status)) { 2778 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 2779 return NULL; 2780 } 2781 2782 strcpy(buffer, testDataDirectory); 2783 strcat(buffer, filename); 2784 return buffer; 2785 } 2786 2787 void RegexTest::Extended() { 2788 char tdd[2048]; 2789 const char *srcPath; 2790 UErrorCode status = U_ZERO_ERROR; 2791 int32_t lineNum = 0; 2792 2793 // 2794 // Open and read the test data file. 2795 // 2796 srcPath=getPath(tdd, "regextst.txt"); 2797 if(srcPath==NULL) { 2798 return; /* something went wrong, error already output */ 2799 } 2800 2801 int32_t len; 2802 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 2803 if (U_FAILURE(status)) { 2804 return; /* something went wrong, error already output */ 2805 } 2806 2807 // 2808 // Put the test data into a UnicodeString 2809 // 2810 UnicodeString testString(FALSE, testData, len); 2811 2812 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 2813 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 2814 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 2815 2816 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 2817 UnicodeString testPattern; // The pattern for test from the test file. 2818 UnicodeString testFlags; // the flags for a test. 2819 UnicodeString matchString; // The marked up string to be used as input 2820 2821 if (U_FAILURE(status)){ 2822 dataerrln("Construct RegexMatcher() error."); 2823 delete [] testData; 2824 return; 2825 } 2826 2827 // 2828 // Loop over the test data file, once per line. 
2829 // 2830 while (lineMat.find()) { 2831 lineNum++; 2832 if (U_FAILURE(status)) { 2833 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 2834 } 2835 2836 status = U_ZERO_ERROR; 2837 UnicodeString testLine = lineMat.group(1, status); 2838 if (testLine.length() == 0) { 2839 continue; 2840 } 2841 2842 // 2843 // Parse the test line. Skip blank and comment only lines. 2844 // Separate out the three main fields - pattern, flags, target. 2845 // 2846 2847 commentMat.reset(testLine); 2848 if (commentMat.lookingAt(status)) { 2849 // This line is a comment, or blank. 2850 continue; 2851 } 2852 2853 // 2854 // Pull out the pattern field, remove it from the test file line. 2855 // 2856 quotedStuffMat.reset(testLine); 2857 if (quotedStuffMat.lookingAt(status)) { 2858 testPattern = quotedStuffMat.group(2, status); 2859 testLine.remove(0, quotedStuffMat.end(0, status)); 2860 } else { 2861 errln("Bad pattern (missing quotes?) at test file line %d", lineNum); 2862 continue; 2863 } 2864 2865 2866 // 2867 // Pull out the flags from the test file line. 2868 // 2869 flagsMat.reset(testLine); 2870 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 2871 testFlags = flagsMat.group(1, status); 2872 if (flagsMat.group(2, status).length() > 0) { 2873 errln("Bad Match flag at line %d. Scanning %c\n", 2874 lineNum, flagsMat.group(2, status).charAt(0)); 2875 continue; 2876 } 2877 testLine.remove(0, flagsMat.end(0, status)); 2878 2879 // 2880 // Pull out the match string, as a whole. 2881 // We'll process the <tags> later. 2882 // 2883 quotedStuffMat.reset(testLine); 2884 if (quotedStuffMat.lookingAt(status)) { 2885 matchString = quotedStuffMat.group(2, status); 2886 testLine.remove(0, quotedStuffMat.end(0, status)); 2887 } else { 2888 errln("Bad match string at test file line %d", lineNum); 2889 continue; 2890 } 2891 2892 // 2893 // The only thing left from the input line should be an optional trailing comment. 
2894 // 2895 commentMat.reset(testLine); 2896 if (commentMat.lookingAt(status) == FALSE) { 2897 errln("Line %d: unexpected characters at end of test line.", lineNum); 2898 continue; 2899 } 2900 2901 // 2902 // Run the test 2903 // 2904 regex_find(testPattern, testFlags, matchString, lineNum); 2905 } 2906 2907 delete [] testData; 2908 2909 } 2910 2911 2912 2913 //--------------------------------------------------------------------------- 2914 // 2915 // regex_find(pattern, flags, inputString, lineNumber) 2916 // 2917 // Function to run a single test from the Extended (data driven) tests. 2918 // See file test/testdata/regextst.txt for a description of the 2919 // pattern and inputString fields, and the allowed flags. 2920 // lineNumber is the source line in regextst.txt of the test. 2921 // 2922 //--------------------------------------------------------------------------- 2923 2924 2925 // Set a value into a UVector at position specified by a decimal number in 2926 // a UnicodeString. This is a utility function needed by the actual test function, 2927 // which follows. 
2928 static void set(UVector &vec, int32_t val, UnicodeString index) { 2929 UErrorCode status=U_ZERO_ERROR; 2930 int32_t idx = 0; 2931 for (int32_t i=0; i<index.length(); i++) { 2932 int32_t d=u_charDigitValue(index.charAt(i)); 2933 if (d<0) {return;} 2934 idx = idx*10 + d; 2935 } 2936 while (vec.size()<idx+1) {vec.addElement(-1, status);} 2937 vec.setElementAt(val, idx); 2938 } 2939 2940 void RegexTest::regex_find(const UnicodeString &pattern, 2941 const UnicodeString &flags, 2942 const UnicodeString &inputString, 2943 int32_t line) { 2944 UnicodeString unEscapedInput; 2945 UnicodeString deTaggedInput; 2946 2947 int32_t patternUTF8Length, inputUTF8Length; 2948 char *patternChars = NULL, *inputChars = NULL; 2949 UText patternText = UTEXT_INITIALIZER; 2950 UText inputText = UTEXT_INITIALIZER; 2951 UConverter *UTF8Converter = NULL; 2952 2953 UErrorCode status = U_ZERO_ERROR; 2954 UParseError pe; 2955 RegexPattern *parsePat = NULL; 2956 RegexMatcher *parseMatcher = NULL; 2957 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 2958 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 2959 UVector groupStarts(status); 2960 UVector groupEnds(status); 2961 UBool isMatch = FALSE, isUTF8Match = FALSE; 2962 UBool failed = FALSE; 2963 int32_t numFinds; 2964 int32_t i; 2965 UBool useMatchesFunc = FALSE; 2966 UBool useLookingAtFunc = FALSE; 2967 int32_t regionStart = -1; 2968 int32_t regionEnd = -1; 2969 2970 // 2971 // Compile the caller's pattern 2972 // 2973 uint32_t bflags = 0; 2974 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 2975 bflags |= UREGEX_CASE_INSENSITIVE; 2976 } 2977 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 2978 bflags |= UREGEX_COMMENTS; 2979 } 2980 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 2981 bflags |= UREGEX_DOTALL; 2982 } 2983 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 2984 bflags |= UREGEX_MULTILINE; 2985 } 2986 2987 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 2988 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 2989 
} 2990 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 2991 bflags |= UREGEX_UNIX_LINES; 2992 } 2993 2994 2995 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 2996 if (status != U_ZERO_ERROR) { 2997 #if UCONFIG_NO_BREAK_ITERATION==1 2998 // 'v' test flag means that the test pattern should not compile if ICU was configured 2999 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3000 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3001 goto cleanupAndReturn; 3002 } 3003 #endif 3004 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3005 // Expected pattern compilation error. 3006 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3007 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3008 } 3009 goto cleanupAndReturn; 3010 } else { 3011 // Unexpected pattern compilation error. 3012 errln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3013 goto cleanupAndReturn; 3014 } 3015 } 3016 3017 UTF8Converter = ucnv_open("UTF8", &status); 3018 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3019 3020 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3021 status = U_ZERO_ERROR; // buffer overflow 3022 patternChars = new char[patternUTF8Length+1]; 3023 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3024 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3025 3026 if (status == U_ZERO_ERROR) { 3027 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3028 3029 if (status != U_ZERO_ERROR) { 3030 #if UCONFIG_NO_BREAK_ITERATION==1 3031 // 'v' test flag means that the test pattern should not compile if ICU was configured 3032 // to not include break iteration. RBBI is needed for Unicode word boundaries. 
3033 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3034 goto cleanupAndReturn; 3035 } 3036 #endif 3037 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3038 // Expected pattern compilation error. 3039 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3040 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3041 } 3042 goto cleanupAndReturn; 3043 } else { 3044 // Unexpected pattern compilation error. 3045 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status)); 3046 goto cleanupAndReturn; 3047 } 3048 } 3049 } 3050 3051 if (UTF8Pattern == NULL) { 3052 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3053 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line); 3054 status = U_ZERO_ERROR; 3055 } 3056 3057 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3058 RegexPatternDump(callerPattern); 3059 } 3060 3061 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3062 errln("Expected, but did not get, a pattern compilation error."); 3063 goto cleanupAndReturn; 3064 } 3065 3066 3067 // 3068 // Number of times find() should be called on the test string, default to 1 3069 // 3070 numFinds = 1; 3071 for (i=2; i<=9; i++) { 3072 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3073 if (numFinds != 1) { 3074 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3075 goto cleanupAndReturn; 3076 } 3077 numFinds = i; 3078 } 3079 } 3080 3081 // 'M' flag. Use matches() instead of find() 3082 if (flags.indexOf((UChar)0x4d) >= 0) { 3083 useMatchesFunc = TRUE; 3084 } 3085 if (flags.indexOf((UChar)0x4c) >= 0) { 3086 useLookingAtFunc = TRUE; 3087 } 3088 3089 // 3090 // Find the tags in the input data, remove them, and record the group boundary 3091 // positions. 
3092 // 3093 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3094 REGEX_CHECK_STATUS_L(line); 3095 3096 unEscapedInput = inputString.unescape(); 3097 parseMatcher = parsePat->matcher(unEscapedInput, status); 3098 REGEX_CHECK_STATUS_L(line); 3099 while(parseMatcher->find()) { 3100 parseMatcher->appendReplacement(deTaggedInput, "", status); 3101 REGEX_CHECK_STATUS; 3102 UnicodeString groupNum = parseMatcher->group(2, status); 3103 if (groupNum == "r") { 3104 // <r> or </r>, a region specification within the string 3105 if (parseMatcher->group(1, status) == "/") { 3106 regionEnd = deTaggedInput.length(); 3107 } else { 3108 regionStart = deTaggedInput.length(); 3109 } 3110 } else { 3111 // <digits> or </digits>, a group match boundary tag. 3112 if (parseMatcher->group(1, status) == "/") { 3113 set(groupEnds, deTaggedInput.length(), groupNum); 3114 } else { 3115 set(groupStarts, deTaggedInput.length(), groupNum); 3116 } 3117 } 3118 } 3119 parseMatcher->appendTail(deTaggedInput); 3120 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3121 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3122 errln("mismatched <r> tags"); 3123 failed = TRUE; 3124 goto cleanupAndReturn; 3125 } 3126 3127 3128 // 3129 // Configure the matcher according to the flags specified with this test. 
3130 // 3131 matcher = callerPattern->matcher(deTaggedInput, status); 3132 REGEX_CHECK_STATUS_L(line); 3133 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3134 matcher->setTrace(TRUE); 3135 } 3136 3137 if (UTF8Pattern != NULL) { 3138 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3139 status = U_ZERO_ERROR; // buffer overflow 3140 inputChars = new char[inputUTF8Length+1]; 3141 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3142 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3143 3144 if (status == U_ZERO_ERROR) { 3145 UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 3146 REGEX_CHECK_STATUS_L(line); 3147 } 3148 3149 if (UTF8Matcher == NULL) { 3150 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3151 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line); 3152 status = U_ZERO_ERROR; 3153 } 3154 } 3155 3156 if (regionStart>=0) { 3157 matcher->region(regionStart, regionEnd, status); 3158 REGEX_CHECK_STATUS_L(line); 3159 if (UTF8Matcher != NULL) { 3160 UTF8Matcher->region(regionStart, regionEnd, status); 3161 REGEX_CHECK_STATUS_L(line); 3162 } 3163 } 3164 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3165 matcher->useAnchoringBounds(FALSE); 3166 if (UTF8Matcher != NULL) { 3167 UTF8Matcher->useAnchoringBounds(FALSE); 3168 } 3169 } 3170 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3171 matcher->useTransparentBounds(TRUE); 3172 if (UTF8Matcher != NULL) { 3173 UTF8Matcher->useTransparentBounds(TRUE); 3174 } 3175 } 3176 3177 3178 3179 // 3180 // Do a find on the de-tagged input using the caller's pattern 3181 // TODO: error on count>1 and not find(). 3182 // error on both matches() and lookingAt(). 
3183 // 3184 for (i=0; i<numFinds; i++) { 3185 if (useMatchesFunc) { 3186 isMatch = matcher->matches(status); 3187 if (UTF8Matcher != NULL) { 3188 isUTF8Match = UTF8Matcher->matches(status); 3189 } 3190 } else if (useLookingAtFunc) { 3191 isMatch = matcher->lookingAt(status); 3192 if (UTF8Matcher != NULL) { 3193 isUTF8Match = UTF8Matcher->lookingAt(status); 3194 } 3195 } else { 3196 isMatch = matcher->find(); 3197 if (UTF8Matcher != NULL) { 3198 isUTF8Match = UTF8Matcher->find(); 3199 } 3200 } 3201 } 3202 matcher->setTrace(FALSE); 3203 3204 // 3205 // Match up the groups from the find() with the groups from the tags 3206 // 3207 3208 // number of tags should match number of groups from find operation. 3209 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3210 // G option in test means that capture group data is not available in the 3211 // expected results, so the check needs to be suppressed. 3212 if (isMatch == FALSE && groupStarts.size() != 0) { 3213 errln("Error at line %d: Match expected, but none found.", line); 3214 failed = TRUE; 3215 goto cleanupAndReturn; 3216 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3217 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3218 failed = TRUE; 3219 goto cleanupAndReturn; 3220 } 3221 3222 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3223 // Only check for match / no match. Don't check capture groups. 3224 if (isMatch && groupStarts.size() == 0) { 3225 errln("Error at line %d: No match expected, but one found.", line); 3226 failed = TRUE; 3227 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3228 errln("Error at line %d: No match expected, but one found. (UTF8)", line); 3229 failed = TRUE; 3230 } 3231 goto cleanupAndReturn; 3232 } 3233 3234 REGEX_CHECK_STATUS_L(line); 3235 for (i=0; i<=matcher->groupCount(); i++) { 3236 int32_t expectedStart = (i >= groupStarts.size()? 
-1 : groupStarts.elementAti(i)); 3237 if (matcher->start(i, status) != expectedStart) { 3238 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3239 line, i, expectedStart, matcher->start(i, status)); 3240 failed = TRUE; 3241 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3242 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) { 3243 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3244 line, i, expectedStart, UTF8Matcher->start(i, status)); 3245 failed = TRUE; 3246 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3247 } 3248 3249 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3250 if (matcher->end(i, status) != expectedEnd) { 3251 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3252 line, i, expectedEnd, matcher->end(i, status)); 3253 failed = TRUE; 3254 // Error on end position; keep going; real error is probably yet to come as group 3255 // end positions work from end of the input data towards the front. 3256 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) { 3257 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3258 line, i, expectedEnd, UTF8Matcher->end(i, status)); 3259 failed = TRUE; 3260 // Error on end position; keep going; real error is probably yet to come as group 3261 // end positions work from end of the input data towards the front. 3262 } 3263 } 3264 if ( matcher->groupCount()+1 < groupStarts.size()) { 3265 errln("Error at line %d: Expected %d capture groups, found %d.", 3266 line, groupStarts.size()-1, matcher->groupCount()); 3267 failed = TRUE; 3268 } 3269 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3270 errln("Error at line %d: Expected %d capture groups, found %d. 
(UTF8)", 3271 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3272 failed = TRUE; 3273 } 3274 3275 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3276 matcher->requireEnd() == TRUE) { 3277 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3278 failed = TRUE; 3279 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3280 UTF8Matcher->requireEnd() == TRUE) { 3281 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3282 failed = TRUE; 3283 } 3284 3285 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3286 matcher->requireEnd() == FALSE) { 3287 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3288 failed = TRUE; 3289 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3290 UTF8Matcher->requireEnd() == FALSE) { 3291 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3292 failed = TRUE; 3293 } 3294 3295 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3296 matcher->hitEnd() == TRUE) { 3297 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3298 failed = TRUE; 3299 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3300 UTF8Matcher->hitEnd() == TRUE) { 3301 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); 3302 failed = TRUE; 3303 } 3304 3305 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3306 matcher->hitEnd() == FALSE) { 3307 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3308 failed = TRUE; 3309 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3310 UTF8Matcher->hitEnd() == FALSE) { 3311 errln("Error at line %d: hitEnd() returned FALSE. 
Expected TRUE (UTF8)", line); 3312 failed = TRUE; 3313 } 3314 3315 3316 cleanupAndReturn: 3317 if (failed) { 3318 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3319 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3320 // callerPattern->dump(); 3321 } 3322 delete parseMatcher; 3323 delete parsePat; 3324 delete UTF8Matcher; 3325 delete UTF8Pattern; 3326 delete matcher; 3327 delete callerPattern; 3328 3329 utext_close(&inputText); 3330 delete[] inputChars; 3331 utext_close(&patternText); 3332 delete[] patternChars; 3333 ucnv_close(UTF8Converter); 3334 } 3335 3336 3337 3338 3339 //--------------------------------------------------------------------------- 3340 // 3341 // Errors Check for error handling in patterns. 3342 // 3343 //--------------------------------------------------------------------------- 3344 void RegexTest::Errors() { 3345 // \escape sequences that aren't implemented yet. 3346 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3347 3348 // Missing close parentheses 3349 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3350 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3351 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3352 3353 // Extra close paren 3354 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3355 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3356 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3357 3358 // Look-ahead, Look-behind 3359 // TODO: add tests for unbounded length look-behinds. 
3360 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3361 3362 // Attempt to use non-default flags 3363 { 3364 UParseError pe; 3365 UErrorCode status = U_ZERO_ERROR; 3366 int32_t flags = UREGEX_CANON_EQ | 3367 UREGEX_COMMENTS | UREGEX_DOTALL | 3368 UREGEX_MULTILINE; 3369 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3370 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3371 delete pat1; 3372 } 3373 3374 3375 // Quantifiers are allowed only after something that can be quantified. 3376 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3377 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3378 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3379 3380 // Mal-formed {min,max} quantifiers 3381 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3382 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3383 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3384 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3385 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3386 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3387 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3388 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3389 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3390 3391 // Ticket 5389 3392 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3393 3394 // Invalid Back Reference \0 3395 // For ICU 3.8 and earlier 3396 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3397 // 3398 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3399 3400 } 3401 3402 3403 //------------------------------------------------------------------------------- 3404 // 3405 // Read a text data file, convert it to UChars, and return the data 3406 // in one big UChar * buffer, which the caller must delete. 
3407 // 3408 //-------------------------------------------------------------------------------- 3409 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3410 const char *defEncoding, UErrorCode &status) { 3411 UChar *retPtr = NULL; 3412 char *fileBuf = NULL; 3413 UConverter* conv = NULL; 3414 FILE *f = NULL; 3415 3416 ulen = 0; 3417 if (U_FAILURE(status)) { 3418 return retPtr; 3419 } 3420 3421 // 3422 // Open the file. 3423 // 3424 f = fopen(fileName, "rb"); 3425 if (f == 0) { 3426 dataerrln("Error opening test data file %s\n", fileName); 3427 status = U_FILE_ACCESS_ERROR; 3428 return NULL; 3429 } 3430 // 3431 // Read it in 3432 // 3433 int32_t fileSize; 3434 int32_t amt_read; 3435 3436 fseek( f, 0, SEEK_END); 3437 fileSize = ftell(f); 3438 fileBuf = new char[fileSize]; 3439 fseek(f, 0, SEEK_SET); 3440 amt_read = fread(fileBuf, 1, fileSize, f); 3441 if (amt_read != fileSize || fileSize <= 0) { 3442 errln("Error reading test data file."); 3443 goto cleanUpAndReturn; 3444 } 3445 3446 // 3447 // Look for a Unicode Signature (BOM) on the data just read 3448 // 3449 int32_t signatureLength; 3450 const char * fileBufC; 3451 const char* encoding; 3452 3453 fileBufC = fileBuf; 3454 encoding = ucnv_detectUnicodeSignature( 3455 fileBuf, fileSize, &signatureLength, &status); 3456 if(encoding!=NULL ){ 3457 fileBufC += signatureLength; 3458 fileSize -= signatureLength; 3459 } else { 3460 encoding = defEncoding; 3461 if (strcmp(encoding, "utf-8") == 0) { 3462 errln("file %s is missing its BOM", fileName); 3463 } 3464 } 3465 3466 // 3467 // Open a converter to take the rule file to UTF-16 3468 // 3469 conv = ucnv_open(encoding, &status); 3470 if (U_FAILURE(status)) { 3471 goto cleanUpAndReturn; 3472 } 3473 3474 // 3475 // Convert the rules to UChar. 3476 // Preflight first to determine required buffer size. 
3477 // 3478 ulen = ucnv_toUChars(conv, 3479 NULL, // dest, 3480 0, // destCapacity, 3481 fileBufC, 3482 fileSize, 3483 &status); 3484 if (status == U_BUFFER_OVERFLOW_ERROR) { 3485 // Buffer Overflow is expected from the preflight operation. 3486 status = U_ZERO_ERROR; 3487 3488 retPtr = new UChar[ulen+1]; 3489 ucnv_toUChars(conv, 3490 retPtr, // dest, 3491 ulen+1, 3492 fileBufC, 3493 fileSize, 3494 &status); 3495 } 3496 3497 cleanUpAndReturn: 3498 fclose(f); 3499 delete[] fileBuf; 3500 ucnv_close(conv); 3501 if (U_FAILURE(status)) { 3502 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3503 delete retPtr; 3504 retPtr = 0; 3505 ulen = 0; 3506 }; 3507 return retPtr; 3508 } 3509 3510 3511 //------------------------------------------------------------------------------- 3512 // 3513 // PerlTests - Run Perl's regular expression tests 3514 // The input file for this test is re_tests, the standard regular 3515 // expression test data distributed with the Perl source code. 3516 // 3517 // Here is Perl's description of the test data file: 3518 // 3519 // # The tests are in a separate file 't/op/re_tests'. 3520 // # Each line in that file is a separate test. 3521 // # There are five columns, separated by tabs. 3522 // # 3523 // # Column 1 contains the pattern, optionally enclosed in C<''>. 3524 // # Modifiers can be put after the closing C<'>. 3525 // # 3526 // # Column 2 contains the string to be matched. 3527 // # 3528 // # Column 3 contains the expected result: 3529 // # y expect a match 3530 // # n expect no match 3531 // # c expect an error 3532 // # B test exposes a known bug in Perl, should be skipped 3533 // # b test exposes a known bug in Perl, should be skipped if noamp 3534 // # 3535 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 3536 // # 3537 // # Column 4 contains a string, usually C<$&>. 
3538 // # 3539 // # Column 5 contains the expected result of double-quote 3540 // # interpolating that string after the match, or start of error message. 3541 // # 3542 // # Column 6, if present, contains a reason why the test is skipped. 3543 // # This is printed with "skipped", for harness to pick up. 3544 // # 3545 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 3546 // # 3547 // # If you want to add a regular expression test that can't be expressed 3548 // # in this format, don't add it here: put it in op/pat.t instead. 3549 // 3550 // For ICU, if field 3 contains an 'i', the test will be skipped. 3551 // The test exposes is some known incompatibility between ICU and Perl regexps. 3552 // (The i is in addition to whatever was there before.) 3553 // 3554 //------------------------------------------------------------------------------- 3555 void RegexTest::PerlTests() { 3556 char tdd[2048]; 3557 const char *srcPath; 3558 UErrorCode status = U_ZERO_ERROR; 3559 UParseError pe; 3560 3561 // 3562 // Open and read the test data file. 3563 // 3564 srcPath=getPath(tdd, "re_tests.txt"); 3565 if(srcPath==NULL) { 3566 return; /* something went wrong, error already output */ 3567 } 3568 3569 int32_t len; 3570 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3571 if (U_FAILURE(status)) { 3572 return; /* something went wrong, error already output */ 3573 } 3574 3575 // 3576 // Put the test data into a UnicodeString 3577 // 3578 UnicodeString testDataString(FALSE, testData, len); 3579 3580 // 3581 // Regex to break the input file into lines, and strip the new lines. 3582 // One line per match, capture group one is the desired data. 
3583 // 3584 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 3585 if (U_FAILURE(status)) { 3586 dataerrln("RegexPattern::compile() error"); 3587 return; 3588 } 3589 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 3590 3591 // 3592 // Regex to split a test file line into fields. 3593 // There are six fields, separated by tabs. 3594 // 3595 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 3596 3597 // 3598 // Regex to identify test patterns with flag settings, and to separate them. 3599 // Test patterns with flags look like 'pattern'i 3600 // Test patterns without flags are not quoted: pattern 3601 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 3602 // 3603 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 3604 RegexMatcher* flagMat = flagPat->matcher(status); 3605 3606 // 3607 // The Perl tests reference several perl-isms, which are evaluated/substituted 3608 // in the test data. Not being perl, this must be done explicitly. Here 3609 // are string constants and REs for these constructs. 3610 // 3611 UnicodeString nulnulSrc("${nulnul}"); 3612 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 3613 nulnul = nulnul.unescape(); 3614 3615 UnicodeString ffffSrc("${ffff}"); 3616 UnicodeString ffff("\\uffff", -1, US_INV); 3617 ffff = ffff.unescape(); 3618 3619 // regexp for $-[0], $+[2], etc. 3620 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 3621 RegexMatcher *groupsMat = groupsPat->matcher(status); 3622 3623 // regexp for $0, $1, $2, etc. 3624 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 3625 RegexMatcher *cgMat = cgPat->matcher(status); 3626 3627 3628 // 3629 // Main Loop for the Perl Tests, runs once per line from the 3630 // test data file. 
3631 // 3632 int32_t lineNum = 0; 3633 int32_t skippedUnimplementedCount = 0; 3634 while (lineMat->find()) { 3635 lineNum++; 3636 3637 // 3638 // Get a line, break it into its fields, do the Perl 3639 // variable substitutions. 3640 // 3641 UnicodeString line = lineMat->group(1, status); 3642 UnicodeString fields[7]; 3643 fieldPat->split(line, fields, 7, status); 3644 3645 flagMat->reset(fields[0]); 3646 flagMat->matches(status); 3647 UnicodeString pattern = flagMat->group(2, status); 3648 pattern.findAndReplace("${bang}", "!"); 3649 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 3650 pattern.findAndReplace(ffffSrc, ffff); 3651 3652 // 3653 // Identify patterns that include match flag settings, 3654 // split off the flags, remove the extra quotes. 3655 // 3656 UnicodeString flagStr = flagMat->group(3, status); 3657 if (U_FAILURE(status)) { 3658 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3659 return; 3660 } 3661 int32_t flags = 0; 3662 const UChar UChar_c = 0x63; // Char constants for the flag letters. 3663 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 3664 const UChar UChar_m = 0x6d; 3665 const UChar UChar_x = 0x78; 3666 const UChar UChar_y = 0x79; 3667 if (flagStr.indexOf(UChar_i) != -1) { 3668 flags |= UREGEX_CASE_INSENSITIVE; 3669 } 3670 if (flagStr.indexOf(UChar_m) != -1) { 3671 flags |= UREGEX_MULTILINE; 3672 } 3673 if (flagStr.indexOf(UChar_x) != -1) { 3674 flags |= UREGEX_COMMENTS; 3675 } 3676 3677 // 3678 // Compile the test pattern. 3679 // 3680 status = U_ZERO_ERROR; 3681 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 3682 if (status == U_REGEX_UNIMPLEMENTED) { 3683 // 3684 // Test of a feature that is planned for ICU, but not yet implemented. 3685 // skip the test. 
3686 skippedUnimplementedCount++; 3687 delete testPat; 3688 status = U_ZERO_ERROR; 3689 continue; 3690 } 3691 3692 if (U_FAILURE(status)) { 3693 // Some tests are supposed to generate errors. 3694 // Only report an error for tests that are supposed to succeed. 3695 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 3696 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 3697 { 3698 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 3699 } 3700 status = U_ZERO_ERROR; 3701 delete testPat; 3702 continue; 3703 } 3704 3705 if (fields[2].indexOf(UChar_i) >= 0) { 3706 // ICU should skip this test. 3707 delete testPat; 3708 continue; 3709 } 3710 3711 if (fields[2].indexOf(UChar_c) >= 0) { 3712 // This pattern should have caused a compilation error, but didn't/ 3713 errln("line %d: Expected a pattern compile error, got success.", lineNum); 3714 delete testPat; 3715 continue; 3716 } 3717 3718 // 3719 // replace the Perl variables that appear in some of the 3720 // match data strings. 3721 // 3722 UnicodeString matchString = fields[1]; 3723 matchString.findAndReplace(nulnulSrc, nulnul); 3724 matchString.findAndReplace(ffffSrc, ffff); 3725 3726 // Replace any \n in the match string with an actual new-line char. 3727 // Don't do full unescape, as this unescapes more than Perl does, which 3728 // causes other spurious failures in the tests. 3729 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 3730 3731 3732 3733 // 3734 // Run the test, check for expected match/don't match result. 
3735 // 3736 RegexMatcher *testMat = testPat->matcher(matchString, status); 3737 UBool found = testMat->find(); 3738 UBool expected = FALSE; 3739 if (fields[2].indexOf(UChar_y) >=0) { 3740 expected = TRUE; 3741 } 3742 if (expected != found) { 3743 errln("line %d: Expected %smatch, got %smatch", 3744 lineNum, expected?"":"no ", found?"":"no " ); 3745 continue; 3746 } 3747 3748 // Don't try to check expected results if there is no match. 3749 // (Some have stuff in the expected fields) 3750 if (!found) { 3751 delete testMat; 3752 delete testPat; 3753 continue; 3754 } 3755 3756 // 3757 // Interpret the Perl expression from the fourth field of the data file, 3758 // building up an ICU string from the results of the ICU match. 3759 // The Perl expression will contain references to the results of 3760 // a regex match, including the matched string, capture group strings, 3761 // group starting and ending indicies, etc. 3762 // 3763 UnicodeString resultString; 3764 UnicodeString perlExpr = fields[3]; 3765 #if SUPPORT_MUTATING_INPUT_STRING 3766 groupsMat->reset(perlExpr); 3767 cgMat->reset(perlExpr); 3768 #endif 3769 3770 while (perlExpr.length() > 0) { 3771 #if !SUPPORT_MUTATING_INPUT_STRING 3772 // Perferred usage. Reset after any modification to input string. 3773 groupsMat->reset(perlExpr); 3774 cgMat->reset(perlExpr); 3775 #endif 3776 3777 if (perlExpr.startsWith("$&")) { 3778 resultString.append(testMat->group(status)); 3779 perlExpr.remove(0, 2); 3780 } 3781 3782 else if (groupsMat->lookingAt(status)) { 3783 // $-[0] $+[2] etc. 
3784 UnicodeString digitString = groupsMat->group(2, status); 3785 int32_t t = 0; 3786 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 3787 UnicodeString plusOrMinus = groupsMat->group(1, status); 3788 int32_t matchPosition; 3789 if (plusOrMinus.compare("+") == 0) { 3790 matchPosition = testMat->end(groupNum, status); 3791 } else { 3792 matchPosition = testMat->start(groupNum, status); 3793 } 3794 if (matchPosition != -1) { 3795 ICU_Utility::appendNumber(resultString, matchPosition); 3796 } 3797 perlExpr.remove(0, groupsMat->end(status)); 3798 } 3799 3800 else if (cgMat->lookingAt(status)) { 3801 // $1, $2, $3, etc. 3802 UnicodeString digitString = cgMat->group(1, status); 3803 int32_t t = 0; 3804 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 3805 if (U_SUCCESS(status)) { 3806 resultString.append(testMat->group(groupNum, status)); 3807 status = U_ZERO_ERROR; 3808 } 3809 perlExpr.remove(0, cgMat->end(status)); 3810 } 3811 3812 else if (perlExpr.startsWith("@-")) { 3813 int32_t i; 3814 for (i=0; i<=testMat->groupCount(); i++) { 3815 if (i>0) { 3816 resultString.append(" "); 3817 } 3818 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 3819 } 3820 perlExpr.remove(0, 2); 3821 } 3822 3823 else if (perlExpr.startsWith("@+")) { 3824 int32_t i; 3825 for (i=0; i<=testMat->groupCount(); i++) { 3826 if (i>0) { 3827 resultString.append(" "); 3828 } 3829 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 3830 } 3831 perlExpr.remove(0, 2); 3832 } 3833 3834 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 3835 // or as an escaped sequence (e.g. \n) 3836 if (perlExpr.length() > 1) { 3837 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 3838 } 3839 UChar c = perlExpr.charAt(0); 3840 switch (c) { 3841 case 'n': c = '\n'; break; 3842 // add any other escape sequences that show up in the test expected results. 
3843 } 3844 resultString.append(c); 3845 perlExpr.remove(0, 1); 3846 } 3847 3848 else { 3849 // Any characters from the perl expression that we don't explicitly 3850 // recognize before here are assumed to be literals and copied 3851 // as-is to the expected results. 3852 resultString.append(perlExpr.charAt(0)); 3853 perlExpr.remove(0, 1); 3854 } 3855 3856 if (U_FAILURE(status)) { 3857 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 3858 break; 3859 } 3860 } 3861 3862 // 3863 // Expected Results Compare 3864 // 3865 UnicodeString expectedS(fields[4]); 3866 expectedS.findAndReplace(nulnulSrc, nulnul); 3867 expectedS.findAndReplace(ffffSrc, ffff); 3868 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 3869 3870 3871 if (expectedS.compare(resultString) != 0) { 3872 err("Line %d: Incorrect perl expression results.", lineNum); 3873 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 3874 } 3875 3876 delete testMat; 3877 delete testPat; 3878 } 3879 3880 // 3881 // All done. Clean up allocated stuff. 3882 // 3883 delete cgMat; 3884 delete cgPat; 3885 3886 delete groupsMat; 3887 delete groupsPat; 3888 3889 delete flagMat; 3890 delete flagPat; 3891 3892 delete lineMat; 3893 delete linePat; 3894 3895 delete fieldPat; 3896 delete [] testData; 3897 3898 3899 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 3900 3901 } 3902 3903 3904 //------------------------------------------------------------------------------- 3905 // 3906 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 3907 // (instead of using UnicodeStrings) to test the alternate engine. 3908 // The input file for this test is re_tests, the standard regular 3909 // expression test data distributed with the Perl source code. 3910 // See PerlTests() for more information. 
3911 // 3912 //------------------------------------------------------------------------------- 3913 void RegexTest::PerlTestsUTF8() { 3914 char tdd[2048]; 3915 const char *srcPath; 3916 UErrorCode status = U_ZERO_ERROR; 3917 UParseError pe; 3918 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 3919 UText patternText = UTEXT_INITIALIZER; 3920 char *patternChars = NULL; 3921 int32_t patternLength; 3922 int32_t patternCapacity = 0; 3923 UText inputText = UTEXT_INITIALIZER; 3924 char *inputChars = NULL; 3925 int32_t inputLength; 3926 int32_t inputCapacity = 0; 3927 3928 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3929 3930 // 3931 // Open and read the test data file. 3932 // 3933 srcPath=getPath(tdd, "re_tests.txt"); 3934 if(srcPath==NULL) { 3935 return; /* something went wrong, error already output */ 3936 } 3937 3938 int32_t len; 3939 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3940 if (U_FAILURE(status)) { 3941 return; /* something went wrong, error already output */ 3942 } 3943 3944 // 3945 // Put the test data into a UnicodeString 3946 // 3947 UnicodeString testDataString(FALSE, testData, len); 3948 3949 // 3950 // Regex to break the input file into lines, and strip the new lines. 3951 // One line per match, capture group one is the desired data. 3952 // 3953 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 3954 if (U_FAILURE(status)) { 3955 dataerrln("RegexPattern::compile() error"); 3956 return; 3957 } 3958 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 3959 3960 // 3961 // Regex to split a test file line into fields. 3962 // There are six fields, separated by tabs. 3963 // 3964 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 3965 3966 // 3967 // Regex to identify test patterns with flag settings, and to separate them. 
3968 // Test patterns with flags look like 'pattern'i 3969 // Test patterns without flags are not quoted: pattern 3970 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 3971 // 3972 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 3973 RegexMatcher* flagMat = flagPat->matcher(status); 3974 3975 // 3976 // The Perl tests reference several perl-isms, which are evaluated/substituted 3977 // in the test data. Not being perl, this must be done explicitly. Here 3978 // are string constants and REs for these constructs. 3979 // 3980 UnicodeString nulnulSrc("${nulnul}"); 3981 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 3982 nulnul = nulnul.unescape(); 3983 3984 UnicodeString ffffSrc("${ffff}"); 3985 UnicodeString ffff("\\uffff", -1, US_INV); 3986 ffff = ffff.unescape(); 3987 3988 // regexp for $-[0], $+[2], etc. 3989 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 3990 RegexMatcher *groupsMat = groupsPat->matcher(status); 3991 3992 // regexp for $0, $1, $2, etc. 3993 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 3994 RegexMatcher *cgMat = cgPat->matcher(status); 3995 3996 3997 // 3998 // Main Loop for the Perl Tests, runs once per line from the 3999 // test data file. 4000 // 4001 int32_t lineNum = 0; 4002 int32_t skippedUnimplementedCount = 0; 4003 while (lineMat->find()) { 4004 lineNum++; 4005 4006 // 4007 // Get a line, break it into its fields, do the Perl 4008 // variable substitutions. 
4009 // 4010 UnicodeString line = lineMat->group(1, status); 4011 UnicodeString fields[7]; 4012 fieldPat->split(line, fields, 7, status); 4013 4014 flagMat->reset(fields[0]); 4015 flagMat->matches(status); 4016 UnicodeString pattern = flagMat->group(2, status); 4017 pattern.findAndReplace("${bang}", "!"); 4018 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4019 pattern.findAndReplace(ffffSrc, ffff); 4020 4021 // 4022 // Identify patterns that include match flag settings, 4023 // split off the flags, remove the extra quotes. 4024 // 4025 UnicodeString flagStr = flagMat->group(3, status); 4026 if (U_FAILURE(status)) { 4027 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4028 return; 4029 } 4030 int32_t flags = 0; 4031 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4032 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4033 const UChar UChar_m = 0x6d; 4034 const UChar UChar_x = 0x78; 4035 const UChar UChar_y = 0x79; 4036 if (flagStr.indexOf(UChar_i) != -1) { 4037 flags |= UREGEX_CASE_INSENSITIVE; 4038 } 4039 if (flagStr.indexOf(UChar_m) != -1) { 4040 flags |= UREGEX_MULTILINE; 4041 } 4042 if (flagStr.indexOf(UChar_x) != -1) { 4043 flags |= UREGEX_COMMENTS; 4044 } 4045 4046 // 4047 // Put the pattern in a UTF-8 UText 4048 // 4049 status = U_ZERO_ERROR; 4050 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4051 if (status == U_BUFFER_OVERFLOW_ERROR) { 4052 status = U_ZERO_ERROR; 4053 delete[] patternChars; 4054 patternCapacity = patternLength + 1; 4055 patternChars = new char[patternCapacity]; 4056 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4057 } 4058 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4059 4060 // 4061 // Compile the test pattern. 
4062 // 4063 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4064 if (status == U_REGEX_UNIMPLEMENTED) { 4065 // 4066 // Test of a feature that is planned for ICU, but not yet implemented. 4067 // skip the test. 4068 skippedUnimplementedCount++; 4069 delete testPat; 4070 status = U_ZERO_ERROR; 4071 continue; 4072 } 4073 4074 if (U_FAILURE(status)) { 4075 // Some tests are supposed to generate errors. 4076 // Only report an error for tests that are supposed to succeed. 4077 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4078 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4079 { 4080 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4081 } 4082 status = U_ZERO_ERROR; 4083 delete testPat; 4084 continue; 4085 } 4086 4087 if (fields[2].indexOf(UChar_i) >= 0) { 4088 // ICU should skip this test. 4089 delete testPat; 4090 continue; 4091 } 4092 4093 if (fields[2].indexOf(UChar_c) >= 0) { 4094 // This pattern should have caused a compilation error, but didn't/ 4095 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4096 delete testPat; 4097 continue; 4098 } 4099 4100 4101 // 4102 // replace the Perl variables that appear in some of the 4103 // match data strings. 4104 // 4105 UnicodeString matchString = fields[1]; 4106 matchString.findAndReplace(nulnulSrc, nulnul); 4107 matchString.findAndReplace(ffffSrc, ffff); 4108 4109 // Replace any \n in the match string with an actual new-line char. 4110 // Don't do full unescape, as this unescapes more than Perl does, which 4111 // causes other spurious failures in the tests. 
4112 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4113 4114 // 4115 // Put the input in a UTF-8 UText 4116 // 4117 status = U_ZERO_ERROR; 4118 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4119 if (status == U_BUFFER_OVERFLOW_ERROR) { 4120 status = U_ZERO_ERROR; 4121 delete[] inputChars; 4122 inputCapacity = inputLength + 1; 4123 inputChars = new char[inputCapacity]; 4124 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4125 } 4126 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4127 4128 // 4129 // Run the test, check for expected match/don't match result. 4130 // 4131 RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 4132 UBool found = testMat->find(); 4133 UBool expected = FALSE; 4134 if (fields[2].indexOf(UChar_y) >=0) { 4135 expected = TRUE; 4136 } 4137 if (expected != found) { 4138 errln("line %d: Expected %smatch, got %smatch", 4139 lineNum, expected?"":"no ", found?"":"no " ); 4140 continue; 4141 } 4142 4143 // Don't try to check expected results if there is no match. 4144 // (Some have stuff in the expected fields) 4145 if (!found) { 4146 delete testMat; 4147 delete testPat; 4148 continue; 4149 } 4150 4151 // 4152 // Interpret the Perl expression from the fourth field of the data file, 4153 // building up an ICU string from the results of the ICU match. 4154 // The Perl expression will contain references to the results of 4155 // a regex match, including the matched string, capture group strings, 4156 // group starting and ending indicies, etc. 
4157 // 4158 UnicodeString resultString; 4159 UnicodeString perlExpr = fields[3]; 4160 4161 while (perlExpr.length() > 0) { 4162 groupsMat->reset(perlExpr); 4163 cgMat->reset(perlExpr); 4164 4165 if (perlExpr.startsWith("$&")) { 4166 resultString.append(testMat->group(status)); 4167 perlExpr.remove(0, 2); 4168 } 4169 4170 else if (groupsMat->lookingAt(status)) { 4171 // $-[0] $+[2] etc. 4172 UnicodeString digitString = groupsMat->group(2, status); 4173 int32_t t = 0; 4174 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4175 UnicodeString plusOrMinus = groupsMat->group(1, status); 4176 int32_t matchPosition; 4177 if (plusOrMinus.compare("+") == 0) { 4178 matchPosition = testMat->end(groupNum, status); 4179 } else { 4180 matchPosition = testMat->start(groupNum, status); 4181 } 4182 if (matchPosition != -1) { 4183 ICU_Utility::appendNumber(resultString, matchPosition); 4184 } 4185 perlExpr.remove(0, groupsMat->end(status)); 4186 } 4187 4188 else if (cgMat->lookingAt(status)) { 4189 // $1, $2, $3, etc. 4190 UnicodeString digitString = cgMat->group(1, status); 4191 int32_t t = 0; 4192 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4193 if (U_SUCCESS(status)) { 4194 resultString.append(testMat->group(groupNum, status)); 4195 status = U_ZERO_ERROR; 4196 } 4197 perlExpr.remove(0, cgMat->end(status)); 4198 } 4199 4200 else if (perlExpr.startsWith("@-")) { 4201 int32_t i; 4202 for (i=0; i<=testMat->groupCount(); i++) { 4203 if (i>0) { 4204 resultString.append(" "); 4205 } 4206 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4207 } 4208 perlExpr.remove(0, 2); 4209 } 4210 4211 else if (perlExpr.startsWith("@+")) { 4212 int32_t i; 4213 for (i=0; i<=testMat->groupCount(); i++) { 4214 if (i>0) { 4215 resultString.append(" "); 4216 } 4217 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4218 } 4219 perlExpr.remove(0, 2); 4220 } 4221 4222 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4223 // or as an escaped sequence (e.g. \n) 4224 if (perlExpr.length() > 1) { 4225 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4226 } 4227 UChar c = perlExpr.charAt(0); 4228 switch (c) { 4229 case 'n': c = '\n'; break; 4230 // add any other escape sequences that show up in the test expected results. 4231 } 4232 resultString.append(c); 4233 perlExpr.remove(0, 1); 4234 } 4235 4236 else { 4237 // Any characters from the perl expression that we don't explicitly 4238 // recognize before here are assumed to be literals and copied 4239 // as-is to the expected results. 4240 resultString.append(perlExpr.charAt(0)); 4241 perlExpr.remove(0, 1); 4242 } 4243 4244 if (U_FAILURE(status)) { 4245 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4246 break; 4247 } 4248 } 4249 4250 // 4251 // Expected Results Compare 4252 // 4253 UnicodeString expectedS(fields[4]); 4254 expectedS.findAndReplace(nulnulSrc, nulnul); 4255 expectedS.findAndReplace(ffffSrc, ffff); 4256 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4257 4258 4259 if (expectedS.compare(resultString) != 0) { 4260 err("Line %d: Incorrect perl expression results.", lineNum); 4261 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4262 } 4263 4264 delete testMat; 4265 delete testPat; 4266 } 4267 4268 // 4269 // All done. Clean up allocated stuff. 
4270 // 4271 delete cgMat; 4272 delete cgPat; 4273 4274 delete groupsMat; 4275 delete groupsPat; 4276 4277 delete flagMat; 4278 delete flagPat; 4279 4280 delete lineMat; 4281 delete linePat; 4282 4283 delete fieldPat; 4284 delete [] testData; 4285 4286 utext_close(&patternText); 4287 utext_close(&inputText); 4288 4289 delete [] patternChars; 4290 delete [] inputChars; 4291 4292 4293 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4294 4295 } 4296 4297 4298 //-------------------------------------------------------------- 4299 // 4300 // Bug6149 Verify limits to heap expansion for backtrack stack. 4301 // Use this pattern, 4302 // "(a?){1,}" 4303 // The zero-length match will repeat forever. 4304 // (That this goes into a loop is another bug) 4305 // 4306 //--------------------------------------------------------------- 4307 void RegexTest::Bug6149() { 4308 UnicodeString pattern("(a?){1,}"); 4309 UnicodeString s("xyz"); 4310 uint32_t flags = 0; 4311 UErrorCode status = U_ZERO_ERROR; 4312 4313 RegexMatcher matcher(pattern, s, flags, status); 4314 UBool result = false; 4315 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4316 REGEX_ASSERT(result == FALSE); 4317 } 4318 4319 4320 // 4321 // Callbacks() Test the callback function. 4322 // When set, callbacks occur periodically during matching operations, 4323 // giving the application code the ability to abort the operation 4324 // before it's normal completion. 4325 // 4326 4327 struct callBackContext { 4328 RegexTest *test; 4329 int32_t maxCalls; 4330 int32_t numCalls; 4331 int32_t lastSteps; 4332 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4333 }; 4334 4335 U_CDECL_BEGIN 4336 static UBool U_CALLCONV 4337 testCallBackFn(const void *context, int32_t steps) { 4338 callBackContext *info = (callBackContext *)context; 4339 if (info->lastSteps+1 != steps) { 4340 info->test->errln("incorrect steps in callback. 
Expected %d, got %d\n", info->lastSteps+1, steps); 4341 } 4342 info->lastSteps = steps; 4343 info->numCalls++; 4344 return (info->numCalls < info->maxCalls); 4345 } 4346 U_CDECL_END 4347 4348 void RegexTest::Callbacks() { 4349 { 4350 // Getter returns NULLs if no callback has been set 4351 4352 // The variables that the getter will fill in. 4353 // Init to non-null values so that the action of the getter can be seen. 4354 const void *returnedContext = &returnedContext; 4355 URegexMatchCallback *returnedFn = &testCallBackFn; 4356 4357 UErrorCode status = U_ZERO_ERROR; 4358 RegexMatcher matcher("x", 0, status); 4359 REGEX_CHECK_STATUS; 4360 matcher.getMatchCallback(returnedFn, returnedContext, status); 4361 REGEX_CHECK_STATUS; 4362 REGEX_ASSERT(returnedFn == NULL); 4363 REGEX_ASSERT(returnedContext == NULL); 4364 } 4365 4366 { 4367 // Set and Get work 4368 callBackContext cbInfo = {this, 0, 0, 0}; 4369 const void *returnedContext; 4370 URegexMatchCallback *returnedFn; 4371 UErrorCode status = U_ZERO_ERROR; 4372 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 4373 REGEX_CHECK_STATUS; 4374 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4375 REGEX_CHECK_STATUS; 4376 matcher.getMatchCallback(returnedFn, returnedContext, status); 4377 REGEX_CHECK_STATUS; 4378 REGEX_ASSERT(returnedFn == testCallBackFn); 4379 REGEX_ASSERT(returnedContext == &cbInfo); 4380 4381 // A short-running match shouldn't invoke the callback 4382 status = U_ZERO_ERROR; 4383 cbInfo.reset(1); 4384 UnicodeString s = "xxx"; 4385 matcher.reset(s); 4386 REGEX_ASSERT(matcher.matches(status)); 4387 REGEX_CHECK_STATUS; 4388 REGEX_ASSERT(cbInfo.numCalls == 0); 4389 4390 // A medium-length match that runs long enough to invoke the 4391 // callback, but not so long that the callback aborts it. 
4392 status = U_ZERO_ERROR; 4393 cbInfo.reset(4); 4394 s = "aaaaaaaaaaaaaaaaaaab"; 4395 matcher.reset(s); 4396 REGEX_ASSERT(matcher.matches(status)==FALSE); 4397 REGEX_CHECK_STATUS; 4398 REGEX_ASSERT(cbInfo.numCalls > 0); 4399 4400 // A longer running match that the callback function will abort. 4401 status = U_ZERO_ERROR; 4402 cbInfo.reset(4); 4403 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4404 matcher.reset(s); 4405 REGEX_ASSERT(matcher.matches(status)==FALSE); 4406 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4407 REGEX_ASSERT(cbInfo.numCalls == 4); 4408 } 4409 4410 4411 } 4412 4413 4414 //--------------------------------------------------------------------------- 4415 // 4416 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 4417 // UTexts. The pure-C implementation of UText 4418 // has no mutable backing stores, but we can 4419 // use UnicodeString here to test the functionality. 4420 // 4421 //--------------------------------------------------------------------------- 4422 void RegexTest::PreAllocatedUTextCAPI () { 4423 UErrorCode status = U_ZERO_ERROR; 4424 URegularExpression *re; 4425 UText patternText = UTEXT_INITIALIZER; 4426 UnicodeString buffer; 4427 UText bufferText = UTEXT_INITIALIZER; 4428 4429 utext_openUnicodeString(&bufferText, &buffer, &status); 4430 4431 /* 4432 * getText() and getUText() 4433 */ 4434 { 4435 UText text1 = UTEXT_INITIALIZER; 4436 UText text2 = UTEXT_INITIALIZER; 4437 UChar text2Chars[20]; 4438 UText *resultText; 4439 4440 status = U_ZERO_ERROR; 4441 utext_openUTF8(&text1, "abcccd", -1, &status); 4442 utext_openUTF8(&text2, "abcccxd", -1, &status); 4443 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 4444 utext_openUChars(&text2, text2Chars, -1, &status); 4445 4446 utext_openUTF8(&patternText, "abc*d", -1, &status); 4447 re = uregex_openUText(&patternText, 0, NULL, &status); 4448 4449 /* First set a UText */ 4450 uregex_setUText(re, &text1, &status); 4451 resultText = uregex_getUText(re, &bufferText, &status); 
4452 REGEX_CHECK_STATUS; 4453 REGEX_ASSERT(resultText == &bufferText); 4454 utext_setNativeIndex(resultText, 0); 4455 utext_setNativeIndex(&text1, 0); 4456 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4457 4458 resultText = uregex_getUText(re, &bufferText, &status); 4459 REGEX_CHECK_STATUS; 4460 REGEX_ASSERT(resultText == &bufferText); 4461 utext_setNativeIndex(resultText, 0); 4462 utext_setNativeIndex(&text1, 0); 4463 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4464 4465 /* Then set a UChar * */ 4466 uregex_setText(re, text2Chars, 7, &status); 4467 resultText = uregex_getUText(re, &bufferText, &status); 4468 REGEX_CHECK_STATUS; 4469 REGEX_ASSERT(resultText == &bufferText); 4470 utext_setNativeIndex(resultText, 0); 4471 utext_setNativeIndex(&text2, 0); 4472 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0); 4473 4474 uregex_close(re); 4475 utext_close(&text1); 4476 utext_close(&text2); 4477 } 4478 4479 /* 4480 * group() 4481 */ 4482 { 4483 UChar text1[80]; 4484 UText *actual; 4485 UBool result; 4486 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); 4487 4488 status = U_ZERO_ERROR; 4489 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 4490 REGEX_CHECK_STATUS; 4491 4492 uregex_setText(re, text1, -1, &status); 4493 result = uregex_find(re, 0, &status); 4494 REGEX_ASSERT(result==TRUE); 4495 4496 /* Capture Group 0, the full match. Should succeed. */ 4497 status = U_ZERO_ERROR; 4498 actual = uregex_groupUText(re, 0, &bufferText, &status); 4499 REGEX_CHECK_STATUS; 4500 REGEX_ASSERT(actual == &bufferText); 4501 REGEX_ASSERT_UTEXT("abc interior def", actual); 4502 4503 /* Capture group #1. Should succeed. */ 4504 status = U_ZERO_ERROR; 4505 actual = uregex_groupUText(re, 1, &bufferText, &status); 4506 REGEX_CHECK_STATUS; 4507 REGEX_ASSERT(actual == &bufferText); 4508 REGEX_ASSERT_UTEXT(" interior ", actual); 4509 4510 /* Capture group out of range. Error. 
*/ 4511 status = U_ZERO_ERROR; 4512 actual = uregex_groupUText(re, 2, &bufferText, &status); 4513 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 4514 REGEX_ASSERT(actual == &bufferText); 4515 4516 uregex_close(re); 4517 4518 } 4519 4520 /* 4521 * replaceFirst() 4522 */ 4523 { 4524 UChar text1[80]; 4525 UChar text2[80]; 4526 UText replText = UTEXT_INITIALIZER; 4527 UText *result; 4528 4529 status = U_ZERO_ERROR; 4530 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 4531 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 4532 utext_openUTF8(&replText, "<$1>", -1, &status); 4533 4534 re = uregex_openC("x(.*?)x", 0, NULL, &status); 4535 REGEX_CHECK_STATUS; 4536 4537 /* Normal case, with match */ 4538 uregex_setText(re, text1, -1, &status); 4539 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4540 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4541 REGEX_CHECK_STATUS; 4542 REGEX_ASSERT(result == &bufferText); 4543 REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result); 4544 4545 /* No match. Text should copy to output with no changes. 
*/ 4546 uregex_setText(re, text2, -1, &status); 4547 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4548 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4549 REGEX_CHECK_STATUS; 4550 REGEX_ASSERT(result == &bufferText); 4551 REGEX_ASSERT_UTEXT("No match here.", result); 4552 4553 /* Unicode escapes */ 4554 uregex_setText(re, text1, -1, &status); 4555 utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); 4556 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4557 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4558 REGEX_CHECK_STATUS; 4559 REGEX_ASSERT(result == &bufferText); 4560 REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result); 4561 4562 uregex_close(re); 4563 utext_close(&replText); 4564 } 4565 4566 4567 /* 4568 * replaceAll() 4569 */ 4570 { 4571 UChar text1[80]; 4572 UChar text2[80]; 4573 UText replText = UTEXT_INITIALIZER; 4574 UText *result; 4575 4576 status = U_ZERO_ERROR; 4577 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 4578 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 4579 utext_openUTF8(&replText, "<$1>", -1, &status); 4580 4581 re = uregex_openC("x(.*?)x", 0, NULL, &status); 4582 REGEX_CHECK_STATUS; 4583 4584 /* Normal case, with match */ 4585 uregex_setText(re, text1, -1, &status); 4586 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4587 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 4588 REGEX_CHECK_STATUS; 4589 REGEX_ASSERT(result == &bufferText); 4590 REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result); 4591 4592 /* No match. Text should copy to output with no changes. 
*/ 4593 uregex_setText(re, text2, -1, &status); 4594 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4595 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 4596 REGEX_CHECK_STATUS; 4597 REGEX_ASSERT(result == &bufferText); 4598 REGEX_ASSERT_UTEXT("No match here.", result); 4599 4600 uregex_close(re); 4601 utext_close(&replText); 4602 } 4603 4604 4605 /* 4606 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 4607 * so we don't need to test it here. 4608 */ 4609 4610 utext_close(&bufferText); 4611 utext_close(&patternText); 4612 } 4613 4614 //-------------------------------------------------------------- 4615 // 4616 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 4617 // 4618 //--------------------------------------------------------------- 4619 void RegexTest::Bug7651() { 4620 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 4621 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 
4622 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 4623 UnicodeString s("#ff @abcd This is test"); 4624 RegexPattern *REPattern = NULL; 4625 RegexMatcher *REMatcher = NULL; 4626 UErrorCode status = U_ZERO_ERROR; 4627 UParseError pe; 4628 4629 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 4630 REGEX_CHECK_STATUS; 4631 REMatcher = REPattern->matcher(s, status); 4632 REGEX_CHECK_STATUS; 4633 REGEX_ASSERT(REMatcher->find()); 4634 REGEX_ASSERT(REMatcher->start(status) == 0); 4635 delete REPattern; 4636 delete REMatcher; 4637 status = U_ZERO_ERROR; 4638 4639 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 4640 REGEX_CHECK_STATUS; 4641 REMatcher = REPattern->matcher(s, status); 4642 REGEX_CHECK_STATUS; 4643 REGEX_ASSERT(REMatcher->find()); 4644 REGEX_ASSERT(REMatcher->start(status) == 0); 4645 delete REPattern; 4646 delete REMatcher; 4647 status = U_ZERO_ERROR; 4648 } 4649 4650 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 4651 4652