// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

//
//   regextst.cpp
//
//      ICU Regular Expressions test, part of intltest.
//

/*
     NOTE!!

     PLEASE be careful about ASCII assumptions in this test.
     This test is one of the worst repeat offenders.
     If you have questions, contact someone on the ICU PMC
     who has access to an EBCDIC system.

 */

#include "intltest.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "unicode/localpointer.h"
#include "unicode/regex.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/uregex.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "unicode/utf16.h"
#include "cstr.h"
#include "regextst.h"
#include "regexcmp.h"
#include "uvector.h"
#include "util.h"
#include "cmemory.h"
#include "cstring.h"
#include "uinvchar.h"

#define SUPPORT_MUTATING_INPUT_STRING   0

//---------------------------------------------------------------------------
//
//  Test class boilerplate
//
//---------------------------------------------------------------------------

// The constructor and destructor have empty bodies.
RegexTest::RegexTest()
{
}


RegexTest::~RegexTest()
{
}



// Test dispatch table for this suite.
//
//   index  - selects which test case is being addressed.
//   exec   - when true, the suite banner is logged; how the TESTCASE_AUTO
//            macros use it is not visible here.
//   name   - in/out test-name reference consumed by the TESTCASE_AUTO macros.
//
// NOTE(review): TESTCASE_AUTO_BEGIN / TESTCASE_AUTO / TESTCASE_AUTO_END are
// defined outside this file (presumably in intltest.h); they appear to assign
// indexes in listing order, so do not reorder entries without confirming.
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is only registered when ICU is built with file I/O support.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO_END;
}


/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);

//---------------------------------------------------------------------------
//
//   Error Checking / Reporting macros used in all of the tests.
120 // 121 //--------------------------------------------------------------------------- 122 123 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 124 int64_t oldIndex = utext_getNativeIndex(text); 125 utext_setNativeIndex(text, 0); 126 char *bufPtr = buf; 127 UChar32 c = utext_next32From(text, 0); 128 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 129 if (0x000020<=c && c<0x00007e) { 130 *bufPtr = c; 131 } else { 132 #if 0 133 sprintf(bufPtr,"U+%04X", c); 134 bufPtr+= strlen(bufPtr)-1; 135 #else 136 *bufPtr = '%'; 137 #endif 138 } 139 bufPtr++; 140 c = UTEXT_NEXT32(text); 141 } 142 *bufPtr = 0; 143 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 144 char *ebuf = (char*)malloc(bufLen); 145 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 146 uprv_strncpy(buf, ebuf, bufLen); 147 free((void*)ebuf); 148 #endif 149 utext_setNativeIndex(text, oldIndex); 150 } 151 152 153 static char ASSERT_BUF[1024]; 154 155 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { 156 if(message.length()==0) { 157 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 158 } else { 159 UnicodeString buf; 160 IntlTest::prettify(message,buf); 161 if(buf.length()==0) { 162 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 163 } else { 164 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 165 if(ASSERT_BUF[0]==0) { 166 ASSERT_BUF[0]=0; 167 for(int32_t i=0;i<buf.length();i++) { 168 UChar ch = buf[i]; 169 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 170 } 171 } 172 } 173 } 174 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 175 return ASSERT_BUF; 176 } 177 178 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 179 180 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. 
status=%s", \ 181 __FILE__, __LINE__, u_errorName(status)); return;}} 182 183 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 184 185 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 186 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 187 __LINE__, u_errorName(errcode), u_errorName(status));};} 188 189 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 190 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} 191 192 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 193 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 194 195 // expected: const char * , restricted to invariant characters. 196 // actual: const UnicodeString & 197 #define REGEX_ASSERT_UNISTR(expected, actual) { \ 198 if (UnicodeString(expected, -1, US_INV) != (actual)) { \ 199 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ 200 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} 201 202 203 static UBool testUTextEqual(UText *uta, UText *utb) { 204 UChar32 ca = 0; 205 UChar32 cb = 0; 206 utext_setNativeIndex(uta, 0); 207 utext_setNativeIndex(utb, 0); 208 do { 209 ca = utext_next32(uta); 210 cb = utext_next32(utb); 211 if (ca != cb) { 212 break; 213 } 214 } while (ca != U_SENTINEL); 215 return ca == cb; 216 } 217 218 219 /** 220 * @param expected expected text in UTF-8 (not platform) codepage 221 */ 222 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 223 UErrorCode status = U_ZERO_ERROR; 224 UText expectedText = UTEXT_INITIALIZER; 225 utext_openUTF8(&expectedText, expected, -1, &status); 226 if(U_FAILURE(status)) { 227 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 228 return; 229 
} 230 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 231 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 232 return; 233 } 234 utext_setNativeIndex(actual, 0); 235 if (!testUTextEqual(&expectedText, actual)) { 236 char buf[201 /*21*/]; 237 char expectedBuf[201]; 238 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); 239 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); 240 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 241 } 242 utext_close(&expectedText); 243 } 244 /** 245 * @param expected invariant (platform local text) input 246 */ 247 248 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 249 UErrorCode status = U_ZERO_ERROR; 250 UText expectedText = UTEXT_INITIALIZER; 251 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 252 if(U_FAILURE(status)) { 253 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 254 return; 255 } 256 utext_setNativeIndex(actual, 0); 257 if (!testUTextEqual(&expectedText, actual)) { 258 char buf[201 /*21*/]; 259 char expectedBuf[201]; 260 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); 261 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); 262 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 263 } 264 utext_close(&expectedText); 265 } 266 267 /** 268 * Assumes utf-8 input 269 */ 270 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 271 /** 272 * Assumes 
Invariant input 273 */ 274 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 275 276 /** 277 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 278 * passed into utext_openUTF8. An error will be given if 279 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 280 */ 281 282 #define INV_BUFSIZ 2048 /* increase this if too small */ 283 284 static int64_t inv_next=0; 285 286 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY 287 static char inv_buf[INV_BUFSIZ]; 288 #endif 289 290 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 291 if(length==-1) length=strlen(inv); 292 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 293 inv_next+=length; 294 return utext_openUTF8(ut, inv, length, status); 295 #else 296 if(inv_next+length+1>INV_BUFSIZ) { 297 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 298 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 299 *status = U_MEMORY_ALLOCATION_ERROR; 300 return NULL; 301 } 302 303 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 304 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 305 inv_next+=length; 306 307 #if 0 308 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 309 #endif 310 311 return utext_openUTF8(ut, (const char*)buf, length, status); 312 #endif 313 } 314 315 316 //--------------------------------------------------------------------------- 317 // 318 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 319 // for the LookingAt() and Match() functions. 320 // 321 // usage: 322 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 323 // 324 // The expected results are UBool - TRUE or FALSE. 325 // The input text is unescaped. The pattern is not. 
326 // 327 // 328 //--------------------------------------------------------------------------- 329 330 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 331 332 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 333 const UnicodeString pattern(pat, -1, US_INV); 334 const UnicodeString inputText(text, -1, US_INV); 335 UErrorCode status = U_ZERO_ERROR; 336 UParseError pe; 337 RegexPattern *REPattern = NULL; 338 RegexMatcher *REMatcher = NULL; 339 UBool retVal = TRUE; 340 341 UnicodeString patString(pat, -1, US_INV); 342 REPattern = RegexPattern::compile(patString, 0, pe, status); 343 if (U_FAILURE(status)) { 344 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 345 line, u_errorName(status)); 346 return FALSE; 347 } 348 if (line==376) { REPattern->dumpPattern();} 349 350 UnicodeString inputString(inputText); 351 UnicodeString unEscapedInput = inputString.unescape(); 352 REMatcher = REPattern->matcher(unEscapedInput, status); 353 if (U_FAILURE(status)) { 354 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 355 line, u_errorName(status)); 356 return FALSE; 357 } 358 359 UBool actualmatch; 360 actualmatch = REMatcher->lookingAt(status); 361 if (U_FAILURE(status)) { 362 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 363 line, u_errorName(status)); 364 retVal = FALSE; 365 } 366 if (actualmatch != looking) { 367 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 368 retVal = FALSE; 369 } 370 371 status = U_ZERO_ERROR; 372 actualmatch = REMatcher->matches(status); 373 if (U_FAILURE(status)) { 374 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 375 line, u_errorName(status)); 376 retVal = FALSE; 377 } 378 if (actualmatch != match) { 379 errln("RegexTest: wrong return from matches() at line %d.\n", line); 380 retVal = FALSE; 381 } 382 383 if (retVal == FALSE) { 384 REPattern->dumpPattern(); 385 } 386 387 delete REPattern; 388 delete REMatcher; 389 return retVal; 390 } 391 392 393 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 394 UText pattern = UTEXT_INITIALIZER; 395 int32_t inputUTF8Length; 396 char *textChars = NULL; 397 UText inputText = UTEXT_INITIALIZER; 398 UErrorCode status = U_ZERO_ERROR; 399 UParseError pe; 400 RegexPattern *REPattern = NULL; 401 RegexMatcher *REMatcher = NULL; 402 UBool retVal = TRUE; 403 404 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 405 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 406 if (U_FAILURE(status)) { 407 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 408 line, u_errorName(status)); 409 return FALSE; 410 } 411 412 UnicodeString inputString(text, -1, US_INV); 413 UnicodeString unEscapedInput = inputString.unescape(); 414 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 415 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 416 417 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 418 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 419 // UTF-8 does not allow unpaired surrogates, so this could actually happen 420 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 421 return TRUE; // not a failure of the Regex engine 422 } 423 status = U_ZERO_ERROR; // buffer overflow 424 textChars = new char[inputUTF8Length+1]; 425 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 426 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 427 428 REMatcher = &REPattern->matcher(status)->reset(&inputText); 429 if (U_FAILURE(status)) { 430 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 431 line, u_errorName(status)); 432 return FALSE; 433 } 434 435 UBool actualmatch; 436 actualmatch = REMatcher->lookingAt(status); 437 if (U_FAILURE(status)) { 438 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 439 line, u_errorName(status)); 440 retVal = FALSE; 441 } 442 if (actualmatch != looking) { 443 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 444 retVal = FALSE; 445 } 446 447 status = U_ZERO_ERROR; 448 actualmatch = REMatcher->matches(status); 449 if (U_FAILURE(status)) { 450 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 451 line, u_errorName(status)); 452 retVal = FALSE; 453 } 454 if (actualmatch != match) { 455 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 456 retVal = FALSE; 457 } 458 459 if (retVal == FALSE) { 460 REPattern->dumpPattern(); 461 } 462 463 delete REPattern; 464 delete REMatcher; 465 utext_close(&inputText); 466 utext_close(&pattern); 467 delete[] textChars; 468 return retVal; 469 } 470 471 472 473 //--------------------------------------------------------------------------- 474 // 475 // REGEX_ERR Macro + invocation function to simplify writing tests 476 // regex tests for incorrect patterns 477 // 478 // usage: 479 // REGEX_ERR("pattern", expected error line, column, expected status); 480 // 481 //--------------------------------------------------------------------------- 482 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 483 484 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 485 UErrorCode expectedStatus, int32_t line) { 486 UnicodeString pattern(pat); 487 488 UErrorCode status = U_ZERO_ERROR; 489 UParseError pe; 490 RegexPattern *callerPattern = NULL; 491 492 // 493 // Compile the caller's pattern 494 // 495 UnicodeString patString(pat); 496 callerPattern = RegexPattern::compile(patString, 0, pe, status); 497 if (status != expectedStatus) { 498 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 499 } else { 500 if (status != U_ZERO_ERROR) { 501 if (pe.line != errLine || pe.offset != errCol) { 502 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 503 line, errLine, errCol, pe.line, pe.offset); 504 } 505 } 506 } 507 508 delete callerPattern; 509 510 // 511 // Compile again, using a UTF-8-based UText 512 // 513 UText patternText = UTEXT_INITIALIZER; 514 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 515 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 516 if (status != expectedStatus) { 517 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 518 } else { 519 if (status != U_ZERO_ERROR) { 520 if (pe.line != errLine || pe.offset != errCol) { 521 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 522 line, errLine, errCol, pe.line, pe.offset); 523 } 524 } 525 } 526 527 delete callerPattern; 528 utext_close(&patternText); 529 } 530 531 532 533 //--------------------------------------------------------------------------- 534 // 535 // Basic Check for basic functionality of regex pattern matching. 536 // Avoid the use of REGEX_FIND test macro, which has 537 // substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------

// Each REGEX_TESTLM(pattern, input, lookingAt-expected, matches-expected)
// below expands to one doRegexLMTest() call and one doRegexLMTestUTF8() call,
// so every case is run through both of those helpers.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//      UTextBasic   Check for quirks that are specific to the UText
//                   implementation.
//
//---------------------------------------------------------------------------

// Builds a RegexMatcher from a UTF-8 UText pattern, resets it onto a UTF-8
// UText input, and checks that inputText() reports the same text, both after
// the first reset and after resetting the matcher onto its own inputText().
//
// NOTE(review): REGEX_CHECK_STATUS returns from the function on failure, so
// any failure path skips the utext_close() calls at the bottom.
void RegexTest::UTextBasic() {
    // Spelled as explicit code values rather than a string literal, as the
    // buffer is handed to utext_openUTF8.
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    UErrorCode status = U_ZERO_ERROR;
    UText pattern = UTEXT_INITIALIZER;
    utext_openUTF8(&pattern, str_abc, -1, &status);
    RegexMatcher matcher(&pattern, 0, status);
    REGEX_CHECK_STATUS;

    UText input = UTEXT_INITIALIZER;
    utext_openUTF8(&input, str_abc, -1, &status);
    REGEX_CHECK_STATUS;
    matcher.reset(&input);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    // Resetting the matcher onto its own input text must leave the text intact.
    matcher.reset(matcher.inputText());
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    utext_close(&pattern);
    utext_close(&input);
}


//---------------------------------------------------------------------------
//
//      API_Match   Test that the API for class RegexMatcher
//                  is present and nominally working, but excluding functions
//                  implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
    UParseError         pe;
    UErrorCode          status=U_ZERO_ERROR;
    int32_t             flags = 0;

    //
    // Debug - slide failing test cases early
    //
#if 0
    {
    }
    return;
#endif

    //
    // Simple pattern compilation
    //
    {
        UnicodeString       re("abc");
        RegexPattern        *pat2;
        pat2 = RegexPattern::compile(re, flags, pe, status);
        REGEX_CHECK_STATUS;

        UnicodeString inStr1 = "abcdef this is a test";
        UnicodeString instr2 = "not abc";
        UnicodeString empty  = "";


        //
        // Matcher creation and reset.
755 // 756 RegexMatcher *m1 = pat2->matcher(inStr1, status); 757 REGEX_CHECK_STATUS; 758 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 759 REGEX_ASSERT(m1->input() == inStr1); 760 m1->reset(instr2); 761 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 762 REGEX_ASSERT(m1->input() == instr2); 763 m1->reset(inStr1); 764 REGEX_ASSERT(m1->input() == inStr1); 765 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 766 m1->reset(empty); 767 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 768 REGEX_ASSERT(m1->input() == empty); 769 REGEX_ASSERT(&m1->pattern() == pat2); 770 771 // 772 // reset(pos, status) 773 // 774 m1->reset(inStr1); 775 m1->reset(4, status); 776 REGEX_CHECK_STATUS; 777 REGEX_ASSERT(m1->input() == inStr1); 778 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 779 780 m1->reset(-1, status); 781 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 782 status = U_ZERO_ERROR; 783 784 m1->reset(0, status); 785 REGEX_CHECK_STATUS; 786 status = U_ZERO_ERROR; 787 788 int32_t len = m1->input().length(); 789 m1->reset(len-1, status); 790 REGEX_CHECK_STATUS; 791 status = U_ZERO_ERROR; 792 793 m1->reset(len, status); 794 REGEX_CHECK_STATUS; 795 status = U_ZERO_ERROR; 796 797 m1->reset(len+1, status); 798 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 799 status = U_ZERO_ERROR; 800 801 // 802 // match(pos, status) 803 // 804 m1->reset(instr2); 805 REGEX_ASSERT(m1->matches(4, status) == TRUE); 806 m1->reset(); 807 REGEX_ASSERT(m1->matches(3, status) == FALSE); 808 m1->reset(); 809 REGEX_ASSERT(m1->matches(5, status) == FALSE); 810 REGEX_ASSERT(m1->matches(4, status) == TRUE); 811 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 812 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 813 814 // Match() at end of string should fail, but should not 815 // be an error. 816 status = U_ZERO_ERROR; 817 len = m1->input().length(); 818 REGEX_ASSERT(m1->matches(len, status) == FALSE); 819 REGEX_CHECK_STATUS; 820 821 // Match beyond end of string should fail with an error. 
822 status = U_ZERO_ERROR; 823 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 824 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 825 826 // Successful match at end of string. 827 { 828 status = U_ZERO_ERROR; 829 RegexMatcher m("A?", 0, status); // will match zero length string. 830 REGEX_CHECK_STATUS; 831 m.reset(inStr1); 832 len = inStr1.length(); 833 REGEX_ASSERT(m.matches(len, status) == TRUE); 834 REGEX_CHECK_STATUS; 835 m.reset(empty); 836 REGEX_ASSERT(m.matches(0, status) == TRUE); 837 REGEX_CHECK_STATUS; 838 } 839 840 841 // 842 // lookingAt(pos, status) 843 // 844 status = U_ZERO_ERROR; 845 m1->reset(instr2); // "not abc" 846 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 847 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 848 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 849 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 850 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 851 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 852 status = U_ZERO_ERROR; 853 len = m1->input().length(); 854 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 855 REGEX_CHECK_STATUS; 856 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 858 859 delete m1; 860 delete pat2; 861 } 862 863 864 // 865 // Capture Group. 
866 // RegexMatcher::start(); 867 // RegexMatcher::end(); 868 // RegexMatcher::groupCount(); 869 // 870 { 871 int32_t flags=0; 872 UParseError pe; 873 UErrorCode status=U_ZERO_ERROR; 874 875 UnicodeString re("01(23(45)67)(.*)"); 876 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 877 REGEX_CHECK_STATUS; 878 UnicodeString data = "0123456789"; 879 880 RegexMatcher *matcher = pat->matcher(data, status); 881 REGEX_CHECK_STATUS; 882 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 883 static const int32_t matchStarts[] = {0, 2, 4, 8}; 884 static const int32_t matchEnds[] = {10, 8, 6, 10}; 885 int32_t i; 886 for (i=0; i<4; i++) { 887 int32_t actualStart = matcher->start(i, status); 888 REGEX_CHECK_STATUS; 889 if (actualStart != matchStarts[i]) { 890 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 891 __LINE__, i, matchStarts[i], actualStart); 892 } 893 int32_t actualEnd = matcher->end(i, status); 894 REGEX_CHECK_STATUS; 895 if (actualEnd != matchEnds[i]) { 896 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 897 __LINE__, i, matchEnds[i], actualEnd); 898 } 899 } 900 901 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 902 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 903 904 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 905 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 906 matcher->reset(); 907 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 908 909 matcher->lookingAt(status); 910 REGEX_ASSERT(matcher->group(status) == "0123456789"); 911 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 912 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 913 REGEX_ASSERT(matcher->group(2, status) == "45" ); 914 REGEX_ASSERT(matcher->group(3, status) == "89" ); 915 REGEX_CHECK_STATUS; 916 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 917 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 918 matcher->reset(); 919 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 920 921 delete matcher; 922 delete pat; 923 924 } 925 926 // 927 // find 928 // 929 { 930 int32_t flags=0; 931 UParseError pe; 932 UErrorCode status=U_ZERO_ERROR; 933 934 UnicodeString re("abc"); 935 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 936 REGEX_CHECK_STATUS; 937 UnicodeString data = ".abc..abc...abc.."; 938 // 012345678901234567 939 940 RegexMatcher *matcher = pat->matcher(data, status); 941 REGEX_CHECK_STATUS; 942 REGEX_ASSERT(matcher->find()); 943 REGEX_ASSERT(matcher->start(status) == 1); 944 REGEX_ASSERT(matcher->find()); 945 REGEX_ASSERT(matcher->start(status) == 6); 946 REGEX_ASSERT(matcher->find()); 947 REGEX_ASSERT(matcher->start(status) == 12); 948 REGEX_ASSERT(matcher->find() == FALSE); 949 REGEX_ASSERT(matcher->find() == FALSE); 950 951 matcher->reset(); 952 REGEX_ASSERT(matcher->find()); 953 REGEX_ASSERT(matcher->start(status) == 1); 954 955 
REGEX_ASSERT(matcher->find(0, status)); 956 REGEX_ASSERT(matcher->start(status) == 1); 957 REGEX_ASSERT(matcher->find(1, status)); 958 REGEX_ASSERT(matcher->start(status) == 1); 959 REGEX_ASSERT(matcher->find(2, status)); 960 REGEX_ASSERT(matcher->start(status) == 6); 961 REGEX_ASSERT(matcher->find(12, status)); 962 REGEX_ASSERT(matcher->start(status) == 12); 963 REGEX_ASSERT(matcher->find(13, status) == FALSE); 964 REGEX_ASSERT(matcher->find(16, status) == FALSE); 965 REGEX_ASSERT(matcher->find(17, status) == FALSE); 966 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 967 968 status = U_ZERO_ERROR; 969 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 970 status = U_ZERO_ERROR; 971 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 972 973 REGEX_ASSERT(matcher->groupCount() == 0); 974 975 delete matcher; 976 delete pat; 977 } 978 979 980 // 981 // find, with \G in pattern (true if at the end of a previous match). 982 // 983 { 984 int32_t flags=0; 985 UParseError pe; 986 UErrorCode status=U_ZERO_ERROR; 987 988 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 989 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 990 REGEX_CHECK_STATUS; 991 UnicodeString data = ".abcabc.abc.."; 992 // 012345678901234567 993 994 RegexMatcher *matcher = pat->matcher(data, status); 995 REGEX_CHECK_STATUS; 996 REGEX_ASSERT(matcher->find()); 997 REGEX_ASSERT(matcher->start(status) == 0); 998 REGEX_ASSERT(matcher->start(1, status) == -1); 999 REGEX_ASSERT(matcher->start(2, status) == 1); 1000 1001 REGEX_ASSERT(matcher->find()); 1002 REGEX_ASSERT(matcher->start(status) == 4); 1003 REGEX_ASSERT(matcher->start(1, status) == 4); 1004 REGEX_ASSERT(matcher->start(2, status) == -1); 1005 REGEX_CHECK_STATUS; 1006 1007 delete matcher; 1008 delete pat; 1009 } 1010 1011 // 1012 // find with zero length matches, match position should bump ahead 1013 // to prevent loops. 
1014 // 1015 { 1016 int32_t i; 1017 UErrorCode status=U_ZERO_ERROR; 1018 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1019 // using an always-true look-ahead. 1020 REGEX_CHECK_STATUS; 1021 UnicodeString s(" "); 1022 m.reset(s); 1023 for (i=0; ; i++) { 1024 if (m.find() == FALSE) { 1025 break; 1026 } 1027 REGEX_ASSERT(m.start(status) == i); 1028 REGEX_ASSERT(m.end(status) == i); 1029 } 1030 REGEX_ASSERT(i==5); 1031 1032 // Check that the bump goes over surrogate pairs OK 1033 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1034 s = s.unescape(); 1035 m.reset(s); 1036 for (i=0; ; i+=2) { 1037 if (m.find() == FALSE) { 1038 break; 1039 } 1040 REGEX_ASSERT(m.start(status) == i); 1041 REGEX_ASSERT(m.end(status) == i); 1042 } 1043 REGEX_ASSERT(i==10); 1044 } 1045 { 1046 // find() loop breaking test. 1047 // with pattern of /.?/, should see a series of one char matches, then a single 1048 // match of zero length at the end of the input string. 1049 int32_t i; 1050 UErrorCode status=U_ZERO_ERROR; 1051 RegexMatcher m(".?", 0, status); 1052 REGEX_CHECK_STATUS; 1053 UnicodeString s(" "); 1054 m.reset(s); 1055 for (i=0; ; i++) { 1056 if (m.find() == FALSE) { 1057 break; 1058 } 1059 REGEX_ASSERT(m.start(status) == i); 1060 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1061 } 1062 REGEX_ASSERT(i==5); 1063 } 1064 1065 1066 // 1067 // Matchers with no input string behave as if they had an empty input string. 
1068 // 1069 1070 { 1071 UErrorCode status = U_ZERO_ERROR; 1072 RegexMatcher m(".?", 0, status); 1073 REGEX_CHECK_STATUS; 1074 REGEX_ASSERT(m.find()); 1075 REGEX_ASSERT(m.start(status) == 0); 1076 REGEX_ASSERT(m.input() == ""); 1077 } 1078 { 1079 UErrorCode status = U_ZERO_ERROR; 1080 RegexPattern *p = RegexPattern::compile(".", 0, status); 1081 RegexMatcher *m = p->matcher(status); 1082 REGEX_CHECK_STATUS; 1083 1084 REGEX_ASSERT(m->find() == FALSE); 1085 REGEX_ASSERT(m->input() == ""); 1086 delete m; 1087 delete p; 1088 } 1089 1090 // 1091 // Regions 1092 // 1093 { 1094 UErrorCode status = U_ZERO_ERROR; 1095 UnicodeString testString("This is test data"); 1096 RegexMatcher m(".*", testString, 0, status); 1097 REGEX_CHECK_STATUS; 1098 REGEX_ASSERT(m.regionStart() == 0); 1099 REGEX_ASSERT(m.regionEnd() == testString.length()); 1100 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1101 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1102 1103 m.region(2,4, status); 1104 REGEX_CHECK_STATUS; 1105 REGEX_ASSERT(m.matches(status)); 1106 REGEX_ASSERT(m.start(status)==2); 1107 REGEX_ASSERT(m.end(status)==4); 1108 REGEX_CHECK_STATUS; 1109 1110 m.reset(); 1111 REGEX_ASSERT(m.regionStart() == 0); 1112 REGEX_ASSERT(m.regionEnd() == testString.length()); 1113 1114 UnicodeString shorterString("short"); 1115 m.reset(shorterString); 1116 REGEX_ASSERT(m.regionStart() == 0); 1117 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1118 1119 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1120 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1121 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1122 REGEX_ASSERT(&m == &m.reset()); 1123 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1124 1125 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1126 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1127 REGEX_ASSERT(&m == &m.reset()); 1128 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1129 1130 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1131 REGEX_ASSERT(&m == 
&m.useTransparentBounds(TRUE)); 1132 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1133 REGEX_ASSERT(&m == &m.reset()); 1134 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1135 1136 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1137 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1138 REGEX_ASSERT(&m == &m.reset()); 1139 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1140 1141 } 1142 1143 // 1144 // hitEnd() and requireEnd() 1145 // 1146 { 1147 UErrorCode status = U_ZERO_ERROR; 1148 UnicodeString testString("aabb"); 1149 RegexMatcher m1(".*", testString, 0, status); 1150 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1151 REGEX_ASSERT(m1.hitEnd() == TRUE); 1152 REGEX_ASSERT(m1.requireEnd() == FALSE); 1153 REGEX_CHECK_STATUS; 1154 1155 status = U_ZERO_ERROR; 1156 RegexMatcher m2("a*", testString, 0, status); 1157 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1158 REGEX_ASSERT(m2.hitEnd() == FALSE); 1159 REGEX_ASSERT(m2.requireEnd() == FALSE); 1160 REGEX_CHECK_STATUS; 1161 1162 status = U_ZERO_ERROR; 1163 RegexMatcher m3(".*$", testString, 0, status); 1164 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1165 REGEX_ASSERT(m3.hitEnd() == TRUE); 1166 REGEX_ASSERT(m3.requireEnd() == TRUE); 1167 REGEX_CHECK_STATUS; 1168 } 1169 1170 1171 // 1172 // Compilation error on reset with UChar * 1173 // These were a hazard that people were stumbling over with runtime errors. 1174 // Changed them to compiler errors by adding private methods that more closely 1175 // matched the incorrect use of the functions. 1176 // 1177 #if 0 1178 { 1179 UErrorCode status = U_ZERO_ERROR; 1180 UChar ucharString[20]; 1181 RegexMatcher m(".", 0, status); 1182 m.reset(ucharString); // should not compile. 1183 1184 RegexPattern *p = RegexPattern::compile(".", 0, status); 1185 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1186 1187 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1188 } 1189 #endif 1190 1191 // 1192 // Time Outs. 
1193 // Note: These tests will need to be changed when the regexp engine is 1194 // able to detect and cut short the exponential time behavior on 1195 // this type of match. 1196 // 1197 { 1198 UErrorCode status = U_ZERO_ERROR; 1199 // Enough 'a's in the string to cause the match to time out. 1200 // (Each on additonal 'a' doubles the time) 1201 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1202 RegexMatcher matcher("(a+)+b", testString, 0, status); 1203 REGEX_CHECK_STATUS; 1204 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1205 matcher.setTimeLimit(100, status); 1206 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1207 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1208 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1209 } 1210 { 1211 UErrorCode status = U_ZERO_ERROR; 1212 // Few enough 'a's to slip in under the time limit. 1213 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1214 RegexMatcher matcher("(a+)+b", testString, 0, status); 1215 REGEX_CHECK_STATUS; 1216 matcher.setTimeLimit(100, status); 1217 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1218 REGEX_CHECK_STATUS; 1219 } 1220 1221 // 1222 // Stack Limits 1223 // 1224 { 1225 UErrorCode status = U_ZERO_ERROR; 1226 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1227 1228 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1229 // of the '+', and makes the stack frames larger. 
1230 RegexMatcher matcher("(A)+A$", testString, 0, status); 1231 1232 // With the default stack, this match should fail to run 1233 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1234 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1235 1236 // With unlimited stack, it should run 1237 status = U_ZERO_ERROR; 1238 matcher.setStackLimit(0, status); 1239 REGEX_CHECK_STATUS; 1240 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1241 REGEX_CHECK_STATUS; 1242 REGEX_ASSERT(matcher.getStackLimit() == 0); 1243 1244 // With a limited stack, it the match should fail 1245 status = U_ZERO_ERROR; 1246 matcher.setStackLimit(10000, status); 1247 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1248 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1249 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1250 } 1251 1252 // A pattern that doesn't save state should work with 1253 // a minimal sized stack 1254 { 1255 UErrorCode status = U_ZERO_ERROR; 1256 UnicodeString testString = "abc"; 1257 RegexMatcher matcher("abc", testString, 0, status); 1258 REGEX_CHECK_STATUS; 1259 matcher.setStackLimit(30, status); 1260 REGEX_CHECK_STATUS; 1261 REGEX_ASSERT(matcher.matches(status) == TRUE); 1262 REGEX_CHECK_STATUS; 1263 REGEX_ASSERT(matcher.getStackLimit() == 30); 1264 1265 // Negative stack sizes should fail 1266 status = U_ZERO_ERROR; 1267 matcher.setStackLimit(1000, status); 1268 REGEX_CHECK_STATUS; 1269 matcher.setStackLimit(-1, status); 1270 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1271 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1272 } 1273 1274 1275 } 1276 1277 1278 1279 1280 1281 1282 //--------------------------------------------------------------------------- 1283 // 1284 // API_Replace API test for class RegexMatcher, testing the 1285 // Replace family of functions. 
1286 // 1287 //--------------------------------------------------------------------------- 1288 void RegexTest::API_Replace() { 1289 // 1290 // Replace 1291 // 1292 int32_t flags=0; 1293 UParseError pe; 1294 UErrorCode status=U_ZERO_ERROR; 1295 1296 UnicodeString re("abc"); 1297 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1298 REGEX_CHECK_STATUS; 1299 UnicodeString data = ".abc..abc...abc.."; 1300 // 012345678901234567 1301 RegexMatcher *matcher = pat->matcher(data, status); 1302 1303 // 1304 // Plain vanilla matches. 1305 // 1306 UnicodeString dest; 1307 dest = matcher->replaceFirst("yz", status); 1308 REGEX_CHECK_STATUS; 1309 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1310 1311 dest = matcher->replaceAll("yz", status); 1312 REGEX_CHECK_STATUS; 1313 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1314 1315 // 1316 // Plain vanilla non-matches. 1317 // 1318 UnicodeString d2 = ".abx..abx...abx.."; 1319 matcher->reset(d2); 1320 dest = matcher->replaceFirst("yz", status); 1321 REGEX_CHECK_STATUS; 1322 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1323 1324 dest = matcher->replaceAll("yz", status); 1325 REGEX_CHECK_STATUS; 1326 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1327 1328 // 1329 // Empty source string 1330 // 1331 UnicodeString d3 = ""; 1332 matcher->reset(d3); 1333 dest = matcher->replaceFirst("yz", status); 1334 REGEX_CHECK_STATUS; 1335 REGEX_ASSERT(dest == ""); 1336 1337 dest = matcher->replaceAll("yz", status); 1338 REGEX_CHECK_STATUS; 1339 REGEX_ASSERT(dest == ""); 1340 1341 // 1342 // Empty substitution string 1343 // 1344 matcher->reset(data); // ".abc..abc...abc.." 
1345 dest = matcher->replaceFirst("", status); 1346 REGEX_CHECK_STATUS; 1347 REGEX_ASSERT(dest == "...abc...abc.."); 1348 1349 dest = matcher->replaceAll("", status); 1350 REGEX_CHECK_STATUS; 1351 REGEX_ASSERT(dest == "........"); 1352 1353 // 1354 // match whole string 1355 // 1356 UnicodeString d4 = "abc"; 1357 matcher->reset(d4); 1358 dest = matcher->replaceFirst("xyz", status); 1359 REGEX_CHECK_STATUS; 1360 REGEX_ASSERT(dest == "xyz"); 1361 1362 dest = matcher->replaceAll("xyz", status); 1363 REGEX_CHECK_STATUS; 1364 REGEX_ASSERT(dest == "xyz"); 1365 1366 // 1367 // Capture Group, simple case 1368 // 1369 UnicodeString re2("a(..)"); 1370 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1371 REGEX_CHECK_STATUS; 1372 UnicodeString d5 = "abcdefg"; 1373 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1374 REGEX_CHECK_STATUS; 1375 dest = matcher2->replaceFirst("$1$1", status); 1376 REGEX_CHECK_STATUS; 1377 REGEX_ASSERT(dest == "bcbcdefg"); 1378 1379 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1380 REGEX_CHECK_STATUS; 1381 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1382 1383 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1384 REGEX_ASSERT(U_FAILURE(status)); 1385 status = U_ZERO_ERROR; 1386 1387 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1388 replacement = replacement.unescape(); 1389 dest = matcher2->replaceFirst(replacement, status); 1390 REGEX_CHECK_STATUS; 1391 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1392 1393 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1394 1395 1396 // 1397 // Replacement String with \u hex escapes 1398 // 1399 { 1400 UnicodeString src = "abc 1 abc 2 abc 3"; 1401 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1402 matcher->reset(src); 1403 UnicodeString result = matcher->replaceAll(substitute, 
status); 1404 REGEX_CHECK_STATUS; 1405 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1406 } 1407 { 1408 UnicodeString src = "abc !"; 1409 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1410 matcher->reset(src); 1411 UnicodeString result = matcher->replaceAll(substitute, status); 1412 REGEX_CHECK_STATUS; 1413 UnicodeString expected = UnicodeString("--"); 1414 expected.append((UChar32)0x10000); 1415 expected.append("-- !"); 1416 REGEX_ASSERT(result == expected); 1417 } 1418 // TODO: need more through testing of capture substitutions. 1419 1420 // Bug 4057 1421 // 1422 { 1423 status = U_ZERO_ERROR; 1424 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1425 RegexMatcher m("ss(.*?)ee", 0, status); 1426 REGEX_CHECK_STATUS; 1427 UnicodeString result; 1428 1429 // Multiple finds do NOT bump up the previous appendReplacement postion. 1430 m.reset(s); 1431 m.find(); 1432 m.find(); 1433 m.appendReplacement(result, "ooh", status); 1434 REGEX_CHECK_STATUS; 1435 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1436 1437 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1438 status = U_ZERO_ERROR; 1439 result.truncate(0); 1440 m.reset(10, status); 1441 m.find(); 1442 m.find(); 1443 m.appendReplacement(result, "ooh", status); 1444 REGEX_CHECK_STATUS; 1445 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1446 1447 // find() at interior of string, appendReplacemnt still starts at beginning. 
1448 status = U_ZERO_ERROR; 1449 result.truncate(0); 1450 m.reset(); 1451 m.find(10, status); 1452 m.find(); 1453 m.appendReplacement(result, "ooh", status); 1454 REGEX_CHECK_STATUS; 1455 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1456 1457 m.appendTail(result); 1458 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1459 1460 } 1461 1462 delete matcher2; 1463 delete pat2; 1464 delete matcher; 1465 delete pat; 1466 } 1467 1468 1469 //--------------------------------------------------------------------------- 1470 // 1471 // API_Pattern Test that the API for class RegexPattern is 1472 // present and nominally working. 1473 // 1474 //--------------------------------------------------------------------------- 1475 void RegexTest::API_Pattern() { 1476 RegexPattern pata; // Test default constructor to not crash. 1477 RegexPattern patb; 1478 1479 REGEX_ASSERT(pata == patb); 1480 REGEX_ASSERT(pata == pata); 1481 1482 UnicodeString re1("abc[a-l][m-z]"); 1483 UnicodeString re2("def"); 1484 UErrorCode status = U_ZERO_ERROR; 1485 UParseError pe; 1486 1487 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1488 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1489 REGEX_CHECK_STATUS; 1490 REGEX_ASSERT(*pat1 == *pat1); 1491 REGEX_ASSERT(*pat1 != pata); 1492 1493 // Assign 1494 patb = *pat1; 1495 REGEX_ASSERT(patb == *pat1); 1496 1497 // Copy Construct 1498 RegexPattern patc(*pat1); 1499 REGEX_ASSERT(patc == *pat1); 1500 REGEX_ASSERT(patb == patc); 1501 REGEX_ASSERT(pat1 != pat2); 1502 patb = *pat2; 1503 REGEX_ASSERT(patb != patc); 1504 REGEX_ASSERT(patb == *pat2); 1505 1506 // Compile with no flags. 
1507 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1508 REGEX_ASSERT(*pat1a == *pat1); 1509 1510 REGEX_ASSERT(pat1a->flags() == 0); 1511 1512 // Compile with different flags should be not equal 1513 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1514 REGEX_CHECK_STATUS; 1515 1516 REGEX_ASSERT(*pat1b != *pat1a); 1517 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1518 REGEX_ASSERT(pat1a->flags() == 0); 1519 delete pat1b; 1520 1521 // clone 1522 RegexPattern *pat1c = pat1->clone(); 1523 REGEX_ASSERT(*pat1c == *pat1); 1524 REGEX_ASSERT(*pat1c != *pat2); 1525 1526 delete pat1c; 1527 delete pat1a; 1528 delete pat1; 1529 delete pat2; 1530 1531 1532 // 1533 // Verify that a matcher created from a cloned pattern works. 1534 // (Jitterbug 3423) 1535 // 1536 { 1537 UErrorCode status = U_ZERO_ERROR; 1538 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1539 RegexPattern *pClone = pSource->clone(); 1540 delete pSource; 1541 RegexMatcher *mFromClone = pClone->matcher(status); 1542 REGEX_CHECK_STATUS; 1543 UnicodeString s = "Hello World"; 1544 mFromClone->reset(s); 1545 REGEX_ASSERT(mFromClone->find() == TRUE); 1546 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1547 REGEX_ASSERT(mFromClone->find() == TRUE); 1548 REGEX_ASSERT(mFromClone->group(status) == "World"); 1549 REGEX_ASSERT(mFromClone->find() == FALSE); 1550 delete mFromClone; 1551 delete pClone; 1552 } 1553 1554 // 1555 // matches convenience API 1556 // 1557 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1558 REGEX_CHECK_STATUS; 1559 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1560 REGEX_CHECK_STATUS; 1561 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1562 REGEX_CHECK_STATUS; 1563 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1564 REGEX_CHECK_STATUS; 1565 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1566 REGEX_CHECK_STATUS; 1567 status = U_INDEX_OUTOFBOUNDS_ERROR; 1568 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1569 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1570 1571 1572 // 1573 // Split() 1574 // 1575 status = U_ZERO_ERROR; 1576 pat1 = RegexPattern::compile(" +", pe, status); 1577 REGEX_CHECK_STATUS; 1578 UnicodeString fields[10]; 1579 1580 int32_t n; 1581 n = pat1->split("Now is the time", fields, 10, status); 1582 REGEX_CHECK_STATUS; 1583 REGEX_ASSERT(n==4); 1584 REGEX_ASSERT(fields[0]=="Now"); 1585 REGEX_ASSERT(fields[1]=="is"); 1586 REGEX_ASSERT(fields[2]=="the"); 1587 REGEX_ASSERT(fields[3]=="time"); 1588 REGEX_ASSERT(fields[4]==""); 1589 1590 n = pat1->split("Now is the time", fields, 2, status); 1591 REGEX_CHECK_STATUS; 1592 REGEX_ASSERT(n==2); 1593 REGEX_ASSERT(fields[0]=="Now"); 1594 REGEX_ASSERT(fields[1]=="is the time"); 1595 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1596 1597 fields[1] = "*"; 1598 status = U_ZERO_ERROR; 1599 n = pat1->split("Now is the time", fields, 1, status); 1600 REGEX_CHECK_STATUS; 1601 REGEX_ASSERT(n==1); 1602 REGEX_ASSERT(fields[0]=="Now is the time"); 1603 REGEX_ASSERT(fields[1]=="*"); 1604 status = U_ZERO_ERROR; 1605 1606 n = pat1->split(" Now is the time ", fields, 10, status); 1607 REGEX_CHECK_STATUS; 1608 REGEX_ASSERT(n==6); 1609 REGEX_ASSERT(fields[0]==""); 1610 REGEX_ASSERT(fields[1]=="Now"); 1611 REGEX_ASSERT(fields[2]=="is"); 1612 REGEX_ASSERT(fields[3]=="the"); 1613 REGEX_ASSERT(fields[4]=="time"); 1614 REGEX_ASSERT(fields[5]==""); 1615 1616 n = pat1->split(" ", fields, 10, status); 1617 REGEX_CHECK_STATUS; 1618 REGEX_ASSERT(n==2); 1619 REGEX_ASSERT(fields[0]==""); 1620 REGEX_ASSERT(fields[1]==""); 1621 1622 fields[0] = "foo"; 1623 n = pat1->split("", fields, 10, status); 1624 REGEX_CHECK_STATUS; 1625 REGEX_ASSERT(n==0); 1626 REGEX_ASSERT(fields[0]=="foo"); 1627 1628 delete 
pat1; 1629 1630 // split, with a pattern with (capture) 1631 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1632 REGEX_CHECK_STATUS; 1633 1634 status = U_ZERO_ERROR; 1635 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1636 REGEX_CHECK_STATUS; 1637 REGEX_ASSERT(n==7); 1638 REGEX_ASSERT(fields[0]==""); 1639 REGEX_ASSERT(fields[1]=="a"); 1640 REGEX_ASSERT(fields[2]=="Now is "); 1641 REGEX_ASSERT(fields[3]=="b"); 1642 REGEX_ASSERT(fields[4]=="the time"); 1643 REGEX_ASSERT(fields[5]=="c"); 1644 REGEX_ASSERT(fields[6]==""); 1645 REGEX_ASSERT(status==U_ZERO_ERROR); 1646 1647 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1648 REGEX_CHECK_STATUS; 1649 REGEX_ASSERT(n==7); 1650 REGEX_ASSERT(fields[0]==" "); 1651 REGEX_ASSERT(fields[1]=="a"); 1652 REGEX_ASSERT(fields[2]=="Now is "); 1653 REGEX_ASSERT(fields[3]=="b"); 1654 REGEX_ASSERT(fields[4]=="the time"); 1655 REGEX_ASSERT(fields[5]=="c"); 1656 REGEX_ASSERT(fields[6]==""); 1657 1658 status = U_ZERO_ERROR; 1659 fields[6] = "foo"; 1660 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1661 REGEX_CHECK_STATUS; 1662 REGEX_ASSERT(n==6); 1663 REGEX_ASSERT(fields[0]==" "); 1664 REGEX_ASSERT(fields[1]=="a"); 1665 REGEX_ASSERT(fields[2]=="Now is "); 1666 REGEX_ASSERT(fields[3]=="b"); 1667 REGEX_ASSERT(fields[4]=="the time"); 1668 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 
1669 REGEX_ASSERT(fields[6]=="foo"); 1670 1671 status = U_ZERO_ERROR; 1672 fields[5] = "foo"; 1673 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1674 REGEX_CHECK_STATUS; 1675 REGEX_ASSERT(n==5); 1676 REGEX_ASSERT(fields[0]==" "); 1677 REGEX_ASSERT(fields[1]=="a"); 1678 REGEX_ASSERT(fields[2]=="Now is "); 1679 REGEX_ASSERT(fields[3]=="b"); 1680 REGEX_ASSERT(fields[4]=="the time<c>"); 1681 REGEX_ASSERT(fields[5]=="foo"); 1682 1683 status = U_ZERO_ERROR; 1684 fields[5] = "foo"; 1685 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1686 REGEX_CHECK_STATUS; 1687 REGEX_ASSERT(n==5); 1688 REGEX_ASSERT(fields[0]==" "); 1689 REGEX_ASSERT(fields[1]=="a"); 1690 REGEX_ASSERT(fields[2]=="Now is "); 1691 REGEX_ASSERT(fields[3]=="b"); 1692 REGEX_ASSERT(fields[4]=="the time"); 1693 REGEX_ASSERT(fields[5]=="foo"); 1694 1695 status = U_ZERO_ERROR; 1696 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1697 REGEX_CHECK_STATUS; 1698 REGEX_ASSERT(n==4); 1699 REGEX_ASSERT(fields[0]==" "); 1700 REGEX_ASSERT(fields[1]=="a"); 1701 REGEX_ASSERT(fields[2]=="Now is "); 1702 REGEX_ASSERT(fields[3]=="the time<c>"); 1703 status = U_ZERO_ERROR; 1704 delete pat1; 1705 1706 pat1 = RegexPattern::compile("([-,])", pe, status); 1707 REGEX_CHECK_STATUS; 1708 n = pat1->split("1-10,20", fields, 10, status); 1709 REGEX_CHECK_STATUS; 1710 REGEX_ASSERT(n==5); 1711 REGEX_ASSERT(fields[0]=="1"); 1712 REGEX_ASSERT(fields[1]=="-"); 1713 REGEX_ASSERT(fields[2]=="10"); 1714 REGEX_ASSERT(fields[3]==","); 1715 REGEX_ASSERT(fields[4]=="20"); 1716 delete pat1; 1717 1718 // Test split of string with empty trailing fields 1719 pat1 = RegexPattern::compile(",", pe, status); 1720 REGEX_CHECK_STATUS; 1721 n = pat1->split("a,b,c,", fields, 10, status); 1722 REGEX_CHECK_STATUS; 1723 REGEX_ASSERT(n==4); 1724 REGEX_ASSERT(fields[0]=="a"); 1725 REGEX_ASSERT(fields[1]=="b"); 1726 REGEX_ASSERT(fields[2]=="c"); 1727 REGEX_ASSERT(fields[3]==""); 1728 1729 n = pat1->split("a,,,", 
fields, 10, status); 1730 REGEX_CHECK_STATUS; 1731 REGEX_ASSERT(n==4); 1732 REGEX_ASSERT(fields[0]=="a"); 1733 REGEX_ASSERT(fields[1]==""); 1734 REGEX_ASSERT(fields[2]==""); 1735 REGEX_ASSERT(fields[3]==""); 1736 delete pat1; 1737 1738 // Split Separator with zero length match. 1739 pat1 = RegexPattern::compile(":?", pe, status); 1740 REGEX_CHECK_STATUS; 1741 n = pat1->split("abc", fields, 10, status); 1742 REGEX_CHECK_STATUS; 1743 REGEX_ASSERT(n==5); 1744 REGEX_ASSERT(fields[0]==""); 1745 REGEX_ASSERT(fields[1]=="a"); 1746 REGEX_ASSERT(fields[2]=="b"); 1747 REGEX_ASSERT(fields[3]=="c"); 1748 REGEX_ASSERT(fields[4]==""); 1749 1750 delete pat1; 1751 1752 // 1753 // RegexPattern::pattern() 1754 // 1755 pat1 = new RegexPattern(); 1756 REGEX_ASSERT(pat1->pattern() == ""); 1757 delete pat1; 1758 1759 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1760 REGEX_CHECK_STATUS; 1761 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1762 delete pat1; 1763 1764 1765 // 1766 // classID functions 1767 // 1768 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1769 REGEX_CHECK_STATUS; 1770 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1771 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1772 UnicodeString Hello("Hello, world."); 1773 RegexMatcher *m = pat1->matcher(Hello, status); 1774 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1775 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1776 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1777 delete m; 1778 delete pat1; 1779 1780 } 1781 1782 //--------------------------------------------------------------------------- 1783 // 1784 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1785 // is present and working, but excluding functions 1786 // implementing replace operations. 
1787 // 1788 //--------------------------------------------------------------------------- 1789 void RegexTest::API_Match_UTF8() { 1790 UParseError pe; 1791 UErrorCode status=U_ZERO_ERROR; 1792 int32_t flags = 0; 1793 1794 // 1795 // Debug - slide failing test cases early 1796 // 1797 #if 0 1798 { 1799 } 1800 return; 1801 #endif 1802 1803 // 1804 // Simple pattern compilation 1805 // 1806 { 1807 UText re = UTEXT_INITIALIZER; 1808 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1809 REGEX_VERBOSE_TEXT(&re); 1810 RegexPattern *pat2; 1811 pat2 = RegexPattern::compile(&re, flags, pe, status); 1812 REGEX_CHECK_STATUS; 1813 1814 UText input1 = UTEXT_INITIALIZER; 1815 UText input2 = UTEXT_INITIALIZER; 1816 UText empty = UTEXT_INITIALIZER; 1817 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1818 REGEX_VERBOSE_TEXT(&input1); 1819 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1820 REGEX_VERBOSE_TEXT(&input2); 1821 utext_openUChars(&empty, NULL, 0, &status); 1822 1823 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1824 int32_t input2Len = strlen("not abc"); 1825 1826 1827 // 1828 // Matcher creation and reset. 
1829 // 1830 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1831 REGEX_CHECK_STATUS; 1832 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1833 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1834 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1835 m1->reset(&input2); 1836 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1837 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1838 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1839 m1->reset(&input1); 1840 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1841 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1842 m1->reset(&empty); 1843 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1844 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1845 1846 // 1847 // reset(pos, status) 1848 // 1849 m1->reset(&input1); 1850 m1->reset(4, status); 1851 REGEX_CHECK_STATUS; 1852 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1853 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1854 1855 m1->reset(-1, status); 1856 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1857 status = U_ZERO_ERROR; 1858 1859 m1->reset(0, status); 1860 REGEX_CHECK_STATUS; 1861 status = U_ZERO_ERROR; 1862 1863 m1->reset(input1Len-1, status); 1864 REGEX_CHECK_STATUS; 1865 status = U_ZERO_ERROR; 1866 1867 m1->reset(input1Len, status); 1868 REGEX_CHECK_STATUS; 1869 status = U_ZERO_ERROR; 1870 1871 m1->reset(input1Len+1, status); 1872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1873 status = U_ZERO_ERROR; 1874 1875 // 1876 // match(pos, status) 1877 // 1878 m1->reset(&input2); 1879 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1880 m1->reset(); 1881 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1882 m1->reset(); 1883 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1884 REGEX_ASSERT(m1->matches(4, status) == 
TRUE); 1885 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1887 1888 // Match() at end of string should fail, but should not 1889 // be an error. 1890 status = U_ZERO_ERROR; 1891 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1892 REGEX_CHECK_STATUS; 1893 1894 // Match beyond end of string should fail with an error. 1895 status = U_ZERO_ERROR; 1896 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1897 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1898 1899 // Successful match at end of string. 1900 { 1901 status = U_ZERO_ERROR; 1902 RegexMatcher m("A?", 0, status); // will match zero length string. 1903 REGEX_CHECK_STATUS; 1904 m.reset(&input1); 1905 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1906 REGEX_CHECK_STATUS; 1907 m.reset(&empty); 1908 REGEX_ASSERT(m.matches(0, status) == TRUE); 1909 REGEX_CHECK_STATUS; 1910 } 1911 1912 1913 // 1914 // lookingAt(pos, status) 1915 // 1916 status = U_ZERO_ERROR; 1917 m1->reset(&input2); // "not abc" 1918 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1919 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1920 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1921 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1922 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1923 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1924 status = U_ZERO_ERROR; 1925 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1926 REGEX_CHECK_STATUS; 1927 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1928 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1929 1930 delete m1; 1931 delete pat2; 1932 1933 utext_close(&re); 1934 utext_close(&input1); 1935 utext_close(&input2); 1936 utext_close(&empty); 1937 } 1938 1939 1940 // 1941 // Capture Group. 
1942 // RegexMatcher::start(); 1943 // RegexMatcher::end(); 1944 // RegexMatcher::groupCount(); 1945 // 1946 { 1947 int32_t flags=0; 1948 UParseError pe; 1949 UErrorCode status=U_ZERO_ERROR; 1950 UText re=UTEXT_INITIALIZER; 1951 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 1952 utext_openUTF8(&re, str_01234567_pat, -1, &status); 1953 1954 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1955 REGEX_CHECK_STATUS; 1956 1957 UText input = UTEXT_INITIALIZER; 1958 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1959 utext_openUTF8(&input, str_0123456789, -1, &status); 1960 1961 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 1962 REGEX_CHECK_STATUS; 1963 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 1964 static const int32_t matchStarts[] = {0, 2, 4, 8}; 1965 static const int32_t matchEnds[] = {10, 8, 6, 10}; 1966 int32_t i; 1967 for (i=0; i<4; i++) { 1968 int32_t actualStart = matcher->start(i, status); 1969 REGEX_CHECK_STATUS; 1970 if (actualStart != matchStarts[i]) { 1971 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 1972 __FILE__, __LINE__, i, matchStarts[i], actualStart); 1973 } 1974 int32_t actualEnd = matcher->end(i, status); 1975 REGEX_CHECK_STATUS; 1976 if (actualEnd != matchEnds[i]) { 1977 errln("RegexTest failure at %s:%d index %d. 
Expected %d, got %d\n", 1978 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 1979 } 1980 } 1981 1982 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 1983 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 1984 1985 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1986 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 1987 matcher->reset(); 1988 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 1989 1990 matcher->lookingAt(status); 1991 1992 UnicodeString dest; 1993 UText destText = UTEXT_INITIALIZER; 1994 utext_openUnicodeString(&destText, &dest, &status); 1995 UText *result; 1996 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1997 // Test shallow-clone API 1998 int64_t group_len; 1999 result = matcher->group((UText *)NULL, group_len, status); 2000 REGEX_CHECK_STATUS; 2001 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2002 utext_close(result); 2003 result = matcher->group(0, &destText, group_len, status); 2004 REGEX_CHECK_STATUS; 2005 REGEX_ASSERT(result == &destText); 2006 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2007 // destText is now immutable, reopen it 2008 utext_close(&destText); 2009 utext_openUnicodeString(&destText, &dest, &status); 2010 2011 int64_t length; 2012 result = matcher->group(0, NULL, length, status); 2013 REGEX_CHECK_STATUS; 2014 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2015 utext_close(result); 2016 result = matcher->group(0, &destText, length, status); 2017 REGEX_CHECK_STATUS; 2018 REGEX_ASSERT(result == &destText); 2019 REGEX_ASSERT(utext_getNativeIndex(result) == 0); 2020 REGEX_ASSERT(length == 10); 2021 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2022 2023 // Capture Group 1 == "234567" 2024 result = matcher->group(1, NULL, length, status); 2025 REGEX_CHECK_STATUS; 2026 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2027 REGEX_ASSERT(length 
== 6); 2028 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2029 utext_close(result); 2030 2031 result = matcher->group(1, &destText, length, status); 2032 REGEX_CHECK_STATUS; 2033 REGEX_ASSERT(result == &destText); 2034 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2035 REGEX_ASSERT(length == 6); 2036 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2037 utext_close(result); 2038 2039 // Capture Group 2 == "45" 2040 result = matcher->group(2, NULL, length, status); 2041 REGEX_CHECK_STATUS; 2042 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2043 REGEX_ASSERT(length == 2); 2044 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2045 utext_close(result); 2046 2047 result = matcher->group(2, &destText, length, status); 2048 REGEX_CHECK_STATUS; 2049 REGEX_ASSERT(result == &destText); 2050 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2051 REGEX_ASSERT(length == 2); 2052 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2053 utext_close(result); 2054 2055 // Capture Group 3 == "89" 2056 result = matcher->group(3, NULL, length, status); 2057 REGEX_CHECK_STATUS; 2058 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2059 REGEX_ASSERT(length == 2); 2060 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2061 utext_close(result); 2062 2063 result = matcher->group(3, &destText, length, status); 2064 REGEX_CHECK_STATUS; 2065 REGEX_ASSERT(result == &destText); 2066 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2067 REGEX_ASSERT(length == 2); 2068 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2069 utext_close(result); 2070 2071 // Capture Group number out of range. 
2072 status = U_ZERO_ERROR; 2073 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2074 status = U_ZERO_ERROR; 2075 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2076 status = U_ZERO_ERROR; 2077 matcher->reset(); 2078 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2079 2080 delete matcher; 2081 delete pat; 2082 2083 utext_close(&destText); 2084 utext_close(&input); 2085 utext_close(&re); 2086 } 2087 2088 // 2089 // find 2090 // 2091 { 2092 int32_t flags=0; 2093 UParseError pe; 2094 UErrorCode status=U_ZERO_ERROR; 2095 UText re=UTEXT_INITIALIZER; 2096 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2097 utext_openUTF8(&re, str_abc, -1, &status); 2098 2099 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2100 REGEX_CHECK_STATUS; 2101 UText input = UTEXT_INITIALIZER; 2102 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. 
*/ 2103 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2104 // 012345678901234567 2105 2106 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2107 REGEX_CHECK_STATUS; 2108 REGEX_ASSERT(matcher->find()); 2109 REGEX_ASSERT(matcher->start(status) == 1); 2110 REGEX_ASSERT(matcher->find()); 2111 REGEX_ASSERT(matcher->start(status) == 6); 2112 REGEX_ASSERT(matcher->find()); 2113 REGEX_ASSERT(matcher->start(status) == 12); 2114 REGEX_ASSERT(matcher->find() == FALSE); 2115 REGEX_ASSERT(matcher->find() == FALSE); 2116 2117 matcher->reset(); 2118 REGEX_ASSERT(matcher->find()); 2119 REGEX_ASSERT(matcher->start(status) == 1); 2120 2121 REGEX_ASSERT(matcher->find(0, status)); 2122 REGEX_ASSERT(matcher->start(status) == 1); 2123 REGEX_ASSERT(matcher->find(1, status)); 2124 REGEX_ASSERT(matcher->start(status) == 1); 2125 REGEX_ASSERT(matcher->find(2, status)); 2126 REGEX_ASSERT(matcher->start(status) == 6); 2127 REGEX_ASSERT(matcher->find(12, status)); 2128 REGEX_ASSERT(matcher->start(status) == 12); 2129 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2130 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2131 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2132 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2133 2134 status = U_ZERO_ERROR; 2135 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2136 status = U_ZERO_ERROR; 2137 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2138 2139 REGEX_ASSERT(matcher->groupCount() == 0); 2140 2141 delete matcher; 2142 delete pat; 2143 2144 utext_close(&input); 2145 utext_close(&re); 2146 } 2147 2148 2149 // 2150 // find, with \G in pattern (true if at the end of a previous match). 
2151 // 2152 { 2153 int32_t flags=0; 2154 UParseError pe; 2155 UErrorCode status=U_ZERO_ERROR; 2156 UText re=UTEXT_INITIALIZER; 2157 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2158 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2159 2160 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2161 2162 REGEX_CHECK_STATUS; 2163 UText input = UTEXT_INITIALIZER; 2164 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2165 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2166 // 012345678901234567 2167 2168 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2169 REGEX_CHECK_STATUS; 2170 REGEX_ASSERT(matcher->find()); 2171 REGEX_ASSERT(matcher->start(status) == 0); 2172 REGEX_ASSERT(matcher->start(1, status) == -1); 2173 REGEX_ASSERT(matcher->start(2, status) == 1); 2174 2175 REGEX_ASSERT(matcher->find()); 2176 REGEX_ASSERT(matcher->start(status) == 4); 2177 REGEX_ASSERT(matcher->start(1, status) == 4); 2178 REGEX_ASSERT(matcher->start(2, status) == -1); 2179 REGEX_CHECK_STATUS; 2180 2181 delete matcher; 2182 delete pat; 2183 2184 utext_close(&input); 2185 utext_close(&re); 2186 } 2187 2188 // 2189 // find with zero length matches, match position should bump ahead 2190 // to prevent loops. 2191 // 2192 { 2193 int32_t i; 2194 UErrorCode status=U_ZERO_ERROR; 2195 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2196 // using an always-true look-ahead. 
2197 REGEX_CHECK_STATUS; 2198 UText s = UTEXT_INITIALIZER; 2199 utext_openUTF8(&s, " ", -1, &status); 2200 m.reset(&s); 2201 for (i=0; ; i++) { 2202 if (m.find() == FALSE) { 2203 break; 2204 } 2205 REGEX_ASSERT(m.start(status) == i); 2206 REGEX_ASSERT(m.end(status) == i); 2207 } 2208 REGEX_ASSERT(i==5); 2209 2210 // Check that the bump goes over characters outside the BMP OK 2211 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2212 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2213 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2214 m.reset(&s); 2215 for (i=0; ; i+=4) { 2216 if (m.find() == FALSE) { 2217 break; 2218 } 2219 REGEX_ASSERT(m.start(status) == i); 2220 REGEX_ASSERT(m.end(status) == i); 2221 } 2222 REGEX_ASSERT(i==20); 2223 2224 utext_close(&s); 2225 } 2226 { 2227 // find() loop breaking test. 2228 // with pattern of /.?/, should see a series of one char matches, then a single 2229 // match of zero length at the end of the input string. 2230 int32_t i; 2231 UErrorCode status=U_ZERO_ERROR; 2232 RegexMatcher m(".?", 0, status); 2233 REGEX_CHECK_STATUS; 2234 UText s = UTEXT_INITIALIZER; 2235 utext_openUTF8(&s, " ", -1, &status); 2236 m.reset(&s); 2237 for (i=0; ; i++) { 2238 if (m.find() == FALSE) { 2239 break; 2240 } 2241 REGEX_ASSERT(m.start(status) == i); 2242 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2243 } 2244 REGEX_ASSERT(i==5); 2245 2246 utext_close(&s); 2247 } 2248 2249 2250 // 2251 // Matchers with no input string behave as if they had an empty input string. 
2252 // 2253 2254 { 2255 UErrorCode status = U_ZERO_ERROR; 2256 RegexMatcher m(".?", 0, status); 2257 REGEX_CHECK_STATUS; 2258 REGEX_ASSERT(m.find()); 2259 REGEX_ASSERT(m.start(status) == 0); 2260 REGEX_ASSERT(m.input() == ""); 2261 } 2262 { 2263 UErrorCode status = U_ZERO_ERROR; 2264 RegexPattern *p = RegexPattern::compile(".", 0, status); 2265 RegexMatcher *m = p->matcher(status); 2266 REGEX_CHECK_STATUS; 2267 2268 REGEX_ASSERT(m->find() == FALSE); 2269 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2270 delete m; 2271 delete p; 2272 } 2273 2274 // 2275 // Regions 2276 // 2277 { 2278 UErrorCode status = U_ZERO_ERROR; 2279 UText testPattern = UTEXT_INITIALIZER; 2280 UText testText = UTEXT_INITIALIZER; 2281 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2282 REGEX_VERBOSE_TEXT(&testPattern); 2283 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2284 REGEX_VERBOSE_TEXT(&testText); 2285 2286 RegexMatcher m(&testPattern, &testText, 0, status); 2287 REGEX_CHECK_STATUS; 2288 REGEX_ASSERT(m.regionStart() == 0); 2289 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2290 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2291 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2292 2293 m.region(2,4, status); 2294 REGEX_CHECK_STATUS; 2295 REGEX_ASSERT(m.matches(status)); 2296 REGEX_ASSERT(m.start(status)==2); 2297 REGEX_ASSERT(m.end(status)==4); 2298 REGEX_CHECK_STATUS; 2299 2300 m.reset(); 2301 REGEX_ASSERT(m.regionStart() == 0); 2302 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2303 2304 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2305 REGEX_VERBOSE_TEXT(&testText); 2306 m.reset(&testText); 2307 REGEX_ASSERT(m.regionStart() == 0); 2308 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2309 2310 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2311 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2312 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2313 
REGEX_ASSERT(&m == &m.reset()); 2314 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2315 2316 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2317 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2318 REGEX_ASSERT(&m == &m.reset()); 2319 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2320 2321 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2322 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2323 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2324 REGEX_ASSERT(&m == &m.reset()); 2325 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2326 2327 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2328 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2329 REGEX_ASSERT(&m == &m.reset()); 2330 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2331 2332 utext_close(&testText); 2333 utext_close(&testPattern); 2334 } 2335 2336 // 2337 // hitEnd() and requireEnd() 2338 // 2339 { 2340 UErrorCode status = U_ZERO_ERROR; 2341 UText testPattern = UTEXT_INITIALIZER; 2342 UText testText = UTEXT_INITIALIZER; 2343 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2344 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2345 utext_openUTF8(&testPattern, str_, -1, &status); 2346 utext_openUTF8(&testText, str_aabb, -1, &status); 2347 2348 RegexMatcher m1(&testPattern, &testText, 0, status); 2349 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2350 REGEX_ASSERT(m1.hitEnd() == TRUE); 2351 REGEX_ASSERT(m1.requireEnd() == FALSE); 2352 REGEX_CHECK_STATUS; 2353 2354 status = U_ZERO_ERROR; 2355 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2356 utext_openUTF8(&testPattern, str_a, -1, &status); 2357 RegexMatcher m2(&testPattern, &testText, 0, status); 2358 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2359 REGEX_ASSERT(m2.hitEnd() == FALSE); 2360 REGEX_ASSERT(m2.requireEnd() == FALSE); 2361 REGEX_CHECK_STATUS; 2362 2363 status = U_ZERO_ERROR; 2364 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2365 utext_openUTF8(&testPattern, str_dotstardollar, -1, 
&status);
        RegexMatcher m3(&testPattern, &testText, 0, status);
        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
        REGEX_ASSERT(m3.hitEnd() == TRUE);
        REGEX_ASSERT(m3.requireEnd() == TRUE);
        REGEX_CHECK_STATUS;

        utext_close(&testText);
        utext_close(&testPattern);
    }
}


//---------------------------------------------------------------------------
//
//    API_Replace_UTF8   API test for class RegexMatcher, testing the
//                       Replace family of functions.
//
//    Exercises the UText overloads of replaceFirst() and replaceAll(), and
//    (in the "Bug 4057" section) appendReplacement() / appendTail().
//
//    Almost every replace call is made twice:
//      - with a NULL destination; the UText that comes back is checked and
//        then released with utext_close();
//      - with &destText as the destination; the call is expected to hand
//        back that same pointer (asserted with result == &destText).
//
//    Pattern, input, replacement and expected text are spelled out as
//    explicit byte values, with the intended characters in the trailing
//    comment. Presumably this keeps the bytes ASCII/UTF-8 regardless of the
//    compilation codepage (see the ASCII/EBCDIC warning at the top of this
//    file) -- keep that form when adding cases.
//
//    The statements are order dependent: the matchers, destText and the
//    re-opened UText objects all carry state from one step to the next.
//
//---------------------------------------------------------------------------
void RegexTest::API_Replace_UTF8() {
    //
    //  Replace
    //
    int32_t flags=0;
    UParseError pe;
    UErrorCode status=U_ZERO_ERROR;

    UText re=UTEXT_INITIALIZER;
    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
    REGEX_VERBOSE_TEXT(&re);
    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
    REGEX_CHECK_STATUS;

    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
    //                                                                                        character index ruler: 012345678901234567
    UText dataText = UTEXT_INITIALIZER;
    utext_openUTF8(&dataText, data, -1, &status);
    REGEX_CHECK_STATUS;
    REGEX_VERBOSE_TEXT(&dataText);
    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);

    //
    //  Plain vanilla matches.
    //
    UnicodeString dest;
    UText destText = UTEXT_INITIALIZER;
    utext_openUnicodeString(&destText, &dest, &status);   // destText is a view onto 'dest'
    UText *result;

    UText replText = UTEXT_INITIALIZER;

    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
    utext_openUTF8(&replText, str_yz, -1, &status);
    REGEX_VERBOSE_TEXT(&replText);
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
    utext_close(result);

    // Replace the whole current content of destText with nothing, so the next
    // call writes into an empty destination. The same idiom recurs below.
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

    //
    //  Plain vanilla non-matches.
    //  The input contains no "abc", so the output is expected to equal the input.
    //
    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);   // re-open dataText on the new input
    matcher->reset(&dataText);

    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

    //
    // Empty source string
    //
    utext_openUTF8(&dataText, NULL, 0, &status);
    matcher->reset(&dataText);

    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8("", result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8("", result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8("", result);
    utext_close(result);
    // destText is not cleared here; the preceding replaceFirst() is asserted
    // to have left it empty.
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8("", result);

    //
    // Empty substitution string
    // Matches are replaced by nothing, i.e. deleted from the output.
    //
    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
    matcher->reset(&dataText);

    utext_openUTF8(&replText, NULL, 0, &status);
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

    //
    // match whole string
    //
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    utext_openUTF8(&dataText, str_abc, -1, &status);
    matcher->reset(&dataText);

    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
    utext_openUTF8(&replText, str_xyz, -1, &status);
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

    //
    // Capture Group, simple case
    // A second pattern/matcher pair (pat2 / matcher2) is used from here on;
    // the pattern a(..) has exactly one capture group.
    //
    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
    utext_openUTF8(&re, str_add, -1, &status);
    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
    REGEX_CHECK_STATUS;

    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
    REGEX_CHECK_STATUS;

    // "$1" in the replacement refers to capture group 1.
    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
    utext_openUTF8(&replText, str_11, -1, &status);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

    // A backslash-escaped '$' is expected to come out as a literal '$';
    // the unescaped "$1" is still substituted.
    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
    utext_openUTF8(&replText, str_v, -1, &status);
    REGEX_VERBOSE_TEXT(&replText);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

    // Group reference written with a supplementary-plane digit: the "xxxx"
    // placeholder bytes (indexes 22..25, right after the '$') are overwritten
    // below with F0 9D 9F 8F, the UTF-8 form of the character named in the
    // commented-out line. The expected output shows it acting as "$1".
    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
    //                                   012345678901234567890123456
    supplDigitChars[22] = 0xF0;
    supplDigitChars[23] = 0x9D;
    supplDigitChars[24] = 0x9F;
    supplDigitChars[25] = 0x8F;
    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);

    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

    // "$5" names a capture group the pattern does not have, so both forms of
    // the call are expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
    REGEX_ASSERT(result == &destText);
//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

    //
    //  Replacement String with \u hex escapes
    //  The replacement text contains the escape sequence as literal characters
    //  (backslash, 'u' or 'U', hex digits); the expected output contains the
    //  character it denotes.
    //
    {
        const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
        const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
        utext_openUTF8(&replText, str_u0043, -1, &status);
        matcher->reset(&dataText);

        result = matcher->replaceAll(&replText, NULL, status);
        REGEX_CHECK_STATUS;
        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
        utext_close(result);
        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
        result = matcher->replaceAll(&replText, &destText, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
    }
    {
        // Note: this str_abc shadows the function-level str_abc declared above.
        const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
        utext_openUTF8(&dataText, str_abc, -1, &status);
        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
        utext_openUTF8(&replText, str_U00010000, -1, &status);
        matcher->reset(&dataText);

        // The "xxxx" placeholder (indexes 2..5) is overwritten with F0 90 80 80,
        // the four-byte UTF-8 form of the supplementary character named below.
        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
        //                          0123456789
        expected[2] = 0xF0;
        expected[3] = 0x90;
        expected[4] = 0x80;
        expected[5] = 0x80;

        result = matcher->replaceAll(&replText, NULL, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
        utext_close(result);
        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
        result = matcher->replaceAll(&replText, &destText, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
    }
    // TODO:  need more thorough testing of capture substitutions.

    // Bug 4057
    //   appendReplacement() after several find() calls, after reset(index) and
    //   after find(index): in each case the appended text is expected to start
    //   from the beginning of the input.
    //
    {
        status = U_ZERO_ERROR;
        const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
        const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
        const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
        utext_openUTF8(&re, str_ssee, -1, &status);
        utext_openUTF8(&dataText, str_blah, -1, &status);
        utext_openUTF8(&replText, str_ooh, -1, &status);

        RegexMatcher m(&re, 0, status);
        REGEX_CHECK_STATUS;

        // Note: within this block 'result' is a UnicodeString that shadows the
        // function-level 'UText *result'; resultText is a UText view onto it.
        UnicodeString result;
        UText resultText = UTEXT_INITIALIZER;
        utext_openUnicodeString(&resultText, &result, &status);

        // Multiple finds do NOT bump up the previous appendReplacement position.
        m.reset(&dataText);
        m.find();
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);

        // After a reset into the interior of a string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        result.truncate(0);
        utext_openUnicodeString(&resultText, &result, &status);   // re-open on the now-empty string
        m.reset(10, status);
        m.find();
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);

        // find() at interior of string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        result.truncate(0);
        utext_openUnicodeString(&resultText, &result, &status);
        m.reset();
        m.find(10, status);
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);

        // appendTail() is expected to add the remaining input (" fin") after the last match.
        m.appendTail(&resultText, status);
        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);

        utext_close(&resultText);
    }

    // Release the matchers and patterns created above, then the UText objects
    // opened by this function.
    delete matcher2;
    delete pat2;
    delete matcher;
    delete pat;

    utext_close(&dataText);
    utext_close(&replText);
    utext_close(&destText);
    utext_close(&re);
}


//---------------------------------------------------------------------------
//
//    API_Pattern_UTF8  Test that the API for class RegexPattern is
//                      present and nominally working.
//
//---------------------------------------------------------------------------
void RegexTest::API_Pattern_UTF8() {
    RegexPattern pata;    // Test default constructor to not crash.
    RegexPattern patb;

    REGEX_ASSERT(pata == patb);
    REGEX_ASSERT(pata == pata);

    UText re1 = UTEXT_INITIALIZER;
    UText re2 = UTEXT_INITIALIZER;
    UErrorCode status = U_ZERO_ERROR;
    UParseError pe;

    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
    utext_openUTF8(&re1, str_abcalmz, -1, &status);
    utext_openUTF8(&re2, str_def, -1, &status);

    RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
    RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(*pat1 == *pat1);
    REGEX_ASSERT(*pat1 != pata);

    // Assign
    patb = *pat1;
    REGEX_ASSERT(patb == *pat1);

    // Copy Construct
    RegexPattern patc(*pat1);
    REGEX_ASSERT(patc == *pat1);
    REGEX_ASSERT(patb == patc);
    // NOTE(review): the next assert compares the two pointers, not the
    // patterns they point to -- confirm whether *pat1 != *pat2 was intended.
    REGEX_ASSERT(pat1 != pat2);
    patb = *pat2;
    REGEX_ASSERT(patb != patc);
    REGEX_ASSERT(patb == *pat2);

    // Compile with no flags.
2792 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2793 REGEX_ASSERT(*pat1a == *pat1); 2794 2795 REGEX_ASSERT(pat1a->flags() == 0); 2796 2797 // Compile with different flags should be not equal 2798 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2799 REGEX_CHECK_STATUS; 2800 2801 REGEX_ASSERT(*pat1b != *pat1a); 2802 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2803 REGEX_ASSERT(pat1a->flags() == 0); 2804 delete pat1b; 2805 2806 // clone 2807 RegexPattern *pat1c = pat1->clone(); 2808 REGEX_ASSERT(*pat1c == *pat1); 2809 REGEX_ASSERT(*pat1c != *pat2); 2810 2811 delete pat1c; 2812 delete pat1a; 2813 delete pat1; 2814 delete pat2; 2815 2816 utext_close(&re1); 2817 utext_close(&re2); 2818 2819 2820 // 2821 // Verify that a matcher created from a cloned pattern works. 2822 // (Jitterbug 3423) 2823 // 2824 { 2825 UErrorCode status = U_ZERO_ERROR; 2826 UText pattern = UTEXT_INITIALIZER; 2827 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2828 utext_openUTF8(&pattern, str_pL, -1, &status); 2829 2830 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2831 RegexPattern *pClone = pSource->clone(); 2832 delete pSource; 2833 RegexMatcher *mFromClone = pClone->matcher(status); 2834 REGEX_CHECK_STATUS; 2835 2836 UText input = UTEXT_INITIALIZER; 2837 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2838 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2839 mFromClone->reset(&input); 2840 REGEX_ASSERT(mFromClone->find() == TRUE); 2841 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2842 REGEX_ASSERT(mFromClone->find() == TRUE); 2843 REGEX_ASSERT(mFromClone->group(status) == "World"); 2844 REGEX_ASSERT(mFromClone->find() == FALSE); 2845 delete mFromClone; 2846 delete pClone; 2847 2848 utext_close(&input); 2849 utext_close(&pattern); 2850 } 2851 2852 // 2853 // matches convenience API 
2854 // 2855 { 2856 UErrorCode status = U_ZERO_ERROR; 2857 UText pattern = UTEXT_INITIALIZER; 2858 UText input = UTEXT_INITIALIZER; 2859 2860 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2861 utext_openUTF8(&input, str_randominput, -1, &status); 2862 2863 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2864 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2865 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2866 REGEX_CHECK_STATUS; 2867 2868 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2869 utext_openUTF8(&pattern, str_abc, -1, &status); 2870 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2871 REGEX_CHECK_STATUS; 2872 2873 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2874 utext_openUTF8(&pattern, str_nput, -1, &status); 2875 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2876 REGEX_CHECK_STATUS; 2877 2878 utext_openUTF8(&pattern, str_randominput, -1, &status); 2879 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2880 REGEX_CHECK_STATUS; 2881 2882 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2883 utext_openUTF8(&pattern, str_u, -1, &status); 2884 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2885 REGEX_CHECK_STATUS; 2886 2887 utext_openUTF8(&input, str_abc, -1, &status); 2888 utext_openUTF8(&pattern, str_abc, -1, &status); 2889 status = U_INDEX_OUTOFBOUNDS_ERROR; 2890 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2891 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2892 2893 utext_close(&input); 2894 utext_close(&pattern); 2895 } 2896 2897 2898 // 2899 // Split() 2900 // 2901 status = U_ZERO_ERROR; 2902 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2903 utext_openUTF8(&re1, str_spaceplus, 
-1, &status); 2904 pat1 = RegexPattern::compile(&re1, pe, status); 2905 REGEX_CHECK_STATUS; 2906 UnicodeString fields[10]; 2907 2908 int32_t n; 2909 n = pat1->split("Now is the time", fields, 10, status); 2910 REGEX_CHECK_STATUS; 2911 REGEX_ASSERT(n==4); 2912 REGEX_ASSERT(fields[0]=="Now"); 2913 REGEX_ASSERT(fields[1]=="is"); 2914 REGEX_ASSERT(fields[2]=="the"); 2915 REGEX_ASSERT(fields[3]=="time"); 2916 REGEX_ASSERT(fields[4]==""); 2917 2918 n = pat1->split("Now is the time", fields, 2, status); 2919 REGEX_CHECK_STATUS; 2920 REGEX_ASSERT(n==2); 2921 REGEX_ASSERT(fields[0]=="Now"); 2922 REGEX_ASSERT(fields[1]=="is the time"); 2923 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2924 2925 fields[1] = "*"; 2926 status = U_ZERO_ERROR; 2927 n = pat1->split("Now is the time", fields, 1, status); 2928 REGEX_CHECK_STATUS; 2929 REGEX_ASSERT(n==1); 2930 REGEX_ASSERT(fields[0]=="Now is the time"); 2931 REGEX_ASSERT(fields[1]=="*"); 2932 status = U_ZERO_ERROR; 2933 2934 n = pat1->split(" Now is the time ", fields, 10, status); 2935 REGEX_CHECK_STATUS; 2936 REGEX_ASSERT(n==6); 2937 REGEX_ASSERT(fields[0]==""); 2938 REGEX_ASSERT(fields[1]=="Now"); 2939 REGEX_ASSERT(fields[2]=="is"); 2940 REGEX_ASSERT(fields[3]=="the"); 2941 REGEX_ASSERT(fields[4]=="time"); 2942 REGEX_ASSERT(fields[5]==""); 2943 REGEX_ASSERT(fields[6]==""); 2944 2945 fields[2] = "*"; 2946 n = pat1->split(" ", fields, 10, status); 2947 REGEX_CHECK_STATUS; 2948 REGEX_ASSERT(n==2); 2949 REGEX_ASSERT(fields[0]==""); 2950 REGEX_ASSERT(fields[1]==""); 2951 REGEX_ASSERT(fields[2]=="*"); 2952 2953 fields[0] = "foo"; 2954 n = pat1->split("", fields, 10, status); 2955 REGEX_CHECK_STATUS; 2956 REGEX_ASSERT(n==0); 2957 REGEX_ASSERT(fields[0]=="foo"); 2958 2959 delete pat1; 2960 2961 // split, with a pattern with (capture) 2962 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 2963 pat1 = RegexPattern::compile(&re1, pe, status); 2964 REGEX_CHECK_STATUS; 2965 2966 status = U_ZERO_ERROR; 2967 
fields[6] = fields[7] = "*"; 2968 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 2969 REGEX_CHECK_STATUS; 2970 REGEX_ASSERT(n==7); 2971 REGEX_ASSERT(fields[0]==""); 2972 REGEX_ASSERT(fields[1]=="a"); 2973 REGEX_ASSERT(fields[2]=="Now is "); 2974 REGEX_ASSERT(fields[3]=="b"); 2975 REGEX_ASSERT(fields[4]=="the time"); 2976 REGEX_ASSERT(fields[5]=="c"); 2977 REGEX_ASSERT(fields[6]==""); 2978 REGEX_ASSERT(fields[7]=="*"); 2979 REGEX_ASSERT(status==U_ZERO_ERROR); 2980 2981 fields[6] = fields[7] = "*"; 2982 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 2983 REGEX_CHECK_STATUS; 2984 REGEX_ASSERT(n==7); 2985 REGEX_ASSERT(fields[0]==" "); 2986 REGEX_ASSERT(fields[1]=="a"); 2987 REGEX_ASSERT(fields[2]=="Now is "); 2988 REGEX_ASSERT(fields[3]=="b"); 2989 REGEX_ASSERT(fields[4]=="the time"); 2990 REGEX_ASSERT(fields[5]=="c"); 2991 REGEX_ASSERT(fields[6]==""); 2992 REGEX_ASSERT(fields[7]=="*"); 2993 2994 status = U_ZERO_ERROR; 2995 fields[6] = "foo"; 2996 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 2997 REGEX_CHECK_STATUS; 2998 REGEX_ASSERT(n==6); 2999 REGEX_ASSERT(fields[0]==" "); 3000 REGEX_ASSERT(fields[1]=="a"); 3001 REGEX_ASSERT(fields[2]=="Now is "); 3002 REGEX_ASSERT(fields[3]=="b"); 3003 REGEX_ASSERT(fields[4]=="the time"); 3004 REGEX_ASSERT(fields[5]==" "); 3005 REGEX_ASSERT(fields[6]=="foo"); 3006 3007 status = U_ZERO_ERROR; 3008 fields[5] = "foo"; 3009 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3010 REGEX_CHECK_STATUS; 3011 REGEX_ASSERT(n==5); 3012 REGEX_ASSERT(fields[0]==" "); 3013 REGEX_ASSERT(fields[1]=="a"); 3014 REGEX_ASSERT(fields[2]=="Now is "); 3015 REGEX_ASSERT(fields[3]=="b"); 3016 REGEX_ASSERT(fields[4]=="the time<c>"); 3017 REGEX_ASSERT(fields[5]=="foo"); 3018 3019 status = U_ZERO_ERROR; 3020 fields[5] = "foo"; 3021 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3022 REGEX_CHECK_STATUS; 3023 REGEX_ASSERT(n==5); 3024 REGEX_ASSERT(fields[0]==" "); 3025 
REGEX_ASSERT(fields[1]=="a"); 3026 REGEX_ASSERT(fields[2]=="Now is "); 3027 REGEX_ASSERT(fields[3]=="b"); 3028 REGEX_ASSERT(fields[4]=="the time"); 3029 REGEX_ASSERT(fields[5]=="foo"); 3030 3031 status = U_ZERO_ERROR; 3032 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3033 REGEX_CHECK_STATUS; 3034 REGEX_ASSERT(n==4); 3035 REGEX_ASSERT(fields[0]==" "); 3036 REGEX_ASSERT(fields[1]=="a"); 3037 REGEX_ASSERT(fields[2]=="Now is "); 3038 REGEX_ASSERT(fields[3]=="the time<c>"); 3039 status = U_ZERO_ERROR; 3040 delete pat1; 3041 3042 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3043 pat1 = RegexPattern::compile(&re1, pe, status); 3044 REGEX_CHECK_STATUS; 3045 n = pat1->split("1-10,20", fields, 10, status); 3046 REGEX_CHECK_STATUS; 3047 REGEX_ASSERT(n==5); 3048 REGEX_ASSERT(fields[0]=="1"); 3049 REGEX_ASSERT(fields[1]=="-"); 3050 REGEX_ASSERT(fields[2]=="10"); 3051 REGEX_ASSERT(fields[3]==","); 3052 REGEX_ASSERT(fields[4]=="20"); 3053 delete pat1; 3054 3055 3056 // 3057 // split of a UText based string, with library allocating output UTexts. 
3058 // 3059 { 3060 status = U_ZERO_ERROR; 3061 RegexMatcher matcher(UnicodeString("(:)"), 0, status); 3062 UnicodeString stringToSplit("first:second:third"); 3063 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); 3064 REGEX_CHECK_STATUS; 3065 3066 UText *splits[10] = {NULL}; 3067 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); 3068 REGEX_CHECK_STATUS; 3069 REGEX_ASSERT(numFields == 5); 3070 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); 3071 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); 3072 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); 3073 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); 3074 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); 3075 REGEX_ASSERT(splits[5] == NULL); 3076 3077 for (int i=0; i<UPRV_LENGTHOF(splits); i++) { 3078 if (splits[i]) { 3079 utext_close(splits[i]); 3080 splits[i] = NULL; 3081 } 3082 } 3083 utext_close(textToSplit); 3084 } 3085 3086 3087 // 3088 // RegexPattern::pattern() and patternText() 3089 // 3090 pat1 = new RegexPattern(); 3091 REGEX_ASSERT(pat1->pattern() == ""); 3092 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3093 delete pat1; 3094 const char *helloWorldInvariant = "(Hello, world)*"; 3095 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3096 pat1 = RegexPattern::compile(&re1, pe, status); 3097 REGEX_CHECK_STATUS; 3098 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); 3099 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3100 delete pat1; 3101 3102 utext_close(&re1); 3103 } 3104 3105 3106 //--------------------------------------------------------------------------- 3107 // 3108 // Extended A more thorough check for features of regex patterns 3109 // The test cases are in a separate data file, 3110 // source/tests/testdata/regextst.txt 3111 // A description of the test data format is included in that file. 
3112 // 3113 //--------------------------------------------------------------------------- 3114 3115 const char * 3116 RegexTest::getPath(char buffer[2048], const char *filename) { 3117 UErrorCode status=U_ZERO_ERROR; 3118 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3119 if (U_FAILURE(status)) { 3120 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3121 return NULL; 3122 } 3123 3124 strcpy(buffer, testDataDirectory); 3125 strcat(buffer, filename); 3126 return buffer; 3127 } 3128 3129 void RegexTest::Extended() { 3130 char tdd[2048]; 3131 const char *srcPath; 3132 UErrorCode status = U_ZERO_ERROR; 3133 int32_t lineNum = 0; 3134 3135 // 3136 // Open and read the test data file. 3137 // 3138 srcPath=getPath(tdd, "regextst.txt"); 3139 if(srcPath==NULL) { 3140 return; /* something went wrong, error already output */ 3141 } 3142 3143 int32_t len; 3144 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3145 if (U_FAILURE(status)) { 3146 return; /* something went wrong, error already output */ 3147 } 3148 3149 // 3150 // Put the test data into a UnicodeString 3151 // 3152 UnicodeString testString(FALSE, testData, len); 3153 3154 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3155 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3156 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3157 3158 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3159 UnicodeString testPattern; // The pattern for test from the test file. 3160 UnicodeString testFlags; // the flags for a test. 
3161 UnicodeString matchString; // The marked up string to be used as input 3162 3163 if (U_FAILURE(status)){ 3164 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); 3165 delete [] testData; 3166 return; 3167 } 3168 3169 // 3170 // Loop over the test data file, once per line. 3171 // 3172 while (lineMat.find()) { 3173 lineNum++; 3174 if (U_FAILURE(status)) { 3175 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3176 } 3177 3178 status = U_ZERO_ERROR; 3179 UnicodeString testLine = lineMat.group(1, status); 3180 if (testLine.length() == 0) { 3181 continue; 3182 } 3183 3184 // 3185 // Parse the test line. Skip blank and comment only lines. 3186 // Separate out the three main fields - pattern, flags, target. 3187 // 3188 3189 commentMat.reset(testLine); 3190 if (commentMat.lookingAt(status)) { 3191 // This line is a comment, or blank. 3192 continue; 3193 } 3194 3195 // 3196 // Pull out the pattern field, remove it from the test file line. 3197 // 3198 quotedStuffMat.reset(testLine); 3199 if (quotedStuffMat.lookingAt(status)) { 3200 testPattern = quotedStuffMat.group(2, status); 3201 testLine.remove(0, quotedStuffMat.end(0, status)); 3202 } else { 3203 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3204 continue; 3205 } 3206 3207 3208 // 3209 // Pull out the flags from the test file line. 3210 // 3211 flagsMat.reset(testLine); 3212 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3213 testFlags = flagsMat.group(1, status); 3214 if (flagsMat.group(2, status).length() > 0) { 3215 errln("Bad Match flag at line %d. Scanning %c\n", 3216 lineNum, flagsMat.group(2, status).charAt(0)); 3217 continue; 3218 } 3219 testLine.remove(0, flagsMat.end(0, status)); 3220 3221 // 3222 // Pull out the match string, as a whole. 3223 // We'll process the <tags> later. 
3224 // 3225 quotedStuffMat.reset(testLine); 3226 if (quotedStuffMat.lookingAt(status)) { 3227 matchString = quotedStuffMat.group(2, status); 3228 testLine.remove(0, quotedStuffMat.end(0, status)); 3229 } else { 3230 errln("Bad match string at test file line %d", lineNum); 3231 continue; 3232 } 3233 3234 // 3235 // The only thing left from the input line should be an optional trailing comment. 3236 // 3237 commentMat.reset(testLine); 3238 if (commentMat.lookingAt(status) == FALSE) { 3239 errln("Line %d: unexpected characters at end of test line.", lineNum); 3240 continue; 3241 } 3242 3243 // 3244 // Run the test 3245 // 3246 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3247 } 3248 3249 delete [] testData; 3250 3251 } 3252 3253 3254 3255 //--------------------------------------------------------------------------- 3256 // 3257 // regex_find(pattern, flags, inputString, lineNumber) 3258 // 3259 // Function to run a single test from the Extended (data driven) tests. 3260 // See file test/testdata/regextst.txt for a description of the 3261 // pattern and inputString fields, and the allowed flags. 3262 // lineNumber is the source line in regextst.txt of the test. 3263 // 3264 //--------------------------------------------------------------------------- 3265 3266 3267 // Set a value into a UVector at position specified by a decimal number in 3268 // a UnicodeString. This is a utility function needed by the actual test function, 3269 // which follows. 
3270 static void set(UVector &vec, int32_t val, UnicodeString index) { 3271 UErrorCode status=U_ZERO_ERROR; 3272 int32_t idx = 0; 3273 for (int32_t i=0; i<index.length(); i++) { 3274 int32_t d=u_charDigitValue(index.charAt(i)); 3275 if (d<0) {return;} 3276 idx = idx*10 + d; 3277 } 3278 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3279 vec.setElementAt(val, idx); 3280 } 3281 3282 static void setInt(UVector &vec, int32_t val, int32_t idx) { 3283 UErrorCode status=U_ZERO_ERROR; 3284 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3285 vec.setElementAt(val, idx); 3286 } 3287 3288 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3289 { 3290 UBool couldFind = TRUE; 3291 UTEXT_SETNATIVEINDEX(utext, 0); 3292 int32_t i = 0; 3293 while (i < unistrOffset) { 3294 UChar32 c = UTEXT_NEXT32(utext); 3295 if (c != U_SENTINEL) { 3296 i += U16_LENGTH(c); 3297 } else { 3298 couldFind = FALSE; 3299 break; 3300 } 3301 } 3302 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3303 return couldFind; 3304 } 3305 3306 3307 void RegexTest::regex_find(const UnicodeString &pattern, 3308 const UnicodeString &flags, 3309 const UnicodeString &inputString, 3310 const char *srcPath, 3311 int32_t line) { 3312 UnicodeString unEscapedInput; 3313 UnicodeString deTaggedInput; 3314 3315 int32_t patternUTF8Length, inputUTF8Length; 3316 char *patternChars = NULL, *inputChars = NULL; 3317 UText patternText = UTEXT_INITIALIZER; 3318 UText inputText = UTEXT_INITIALIZER; 3319 UConverter *UTF8Converter = NULL; 3320 3321 UErrorCode status = U_ZERO_ERROR; 3322 UParseError pe; 3323 RegexPattern *parsePat = NULL; 3324 RegexMatcher *parseMatcher = NULL; 3325 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3326 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3327 UVector groupStarts(status); 3328 UVector groupEnds(status); 3329 UVector groupStartsUTF8(status); 3330 UVector groupEndsUTF8(status); 3331 UBool isMatch = FALSE, isUTF8Match = FALSE; 
3332 UBool failed = FALSE; 3333 int32_t numFinds; 3334 int32_t i; 3335 UBool useMatchesFunc = FALSE; 3336 UBool useLookingAtFunc = FALSE; 3337 int32_t regionStart = -1; 3338 int32_t regionEnd = -1; 3339 int32_t regionStartUTF8 = -1; 3340 int32_t regionEndUTF8 = -1; 3341 3342 3343 // 3344 // Compile the caller's pattern 3345 // 3346 uint32_t bflags = 0; 3347 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3348 bflags |= UREGEX_CASE_INSENSITIVE; 3349 } 3350 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3351 bflags |= UREGEX_COMMENTS; 3352 } 3353 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3354 bflags |= UREGEX_DOTALL; 3355 } 3356 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3357 bflags |= UREGEX_MULTILINE; 3358 } 3359 3360 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3361 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3362 } 3363 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3364 bflags |= UREGEX_UNIX_LINES; 3365 } 3366 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag 3367 bflags |= UREGEX_LITERAL; 3368 } 3369 3370 3371 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3372 if (status != U_ZERO_ERROR) { 3373 #if UCONFIG_NO_BREAK_ITERATION==1 3374 // 'v' test flag means that the test pattern should not compile if ICU was configured 3375 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3376 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3377 goto cleanupAndReturn; 3378 } 3379 #endif 3380 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3381 // Expected pattern compilation error. 3382 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3383 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3384 } 3385 goto cleanupAndReturn; 3386 } else { 3387 // Unexpected pattern compilation error. 
3388 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3389 goto cleanupAndReturn; 3390 } 3391 } 3392 3393 UTF8Converter = ucnv_open("UTF8", &status); 3394 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3395 3396 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3397 status = U_ZERO_ERROR; // buffer overflow 3398 patternChars = new char[patternUTF8Length+1]; 3399 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3400 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3401 3402 if (status == U_ZERO_ERROR) { 3403 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3404 3405 if (status != U_ZERO_ERROR) { 3406 #if UCONFIG_NO_BREAK_ITERATION==1 3407 // 'v' test flag means that the test pattern should not compile if ICU was configured 3408 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3409 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3410 goto cleanupAndReturn; 3411 } 3412 #endif 3413 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3414 // Expected pattern compilation error. 3415 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3416 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3417 } 3418 goto cleanupAndReturn; 3419 } else { 3420 // Unexpected pattern compilation error. 3421 errln("Line %d: error %s compiling pattern. 
(UTF8)", line, u_errorName(status)); 3422 goto cleanupAndReturn; 3423 } 3424 } 3425 } 3426 3427 if (UTF8Pattern == NULL) { 3428 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3429 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3430 status = U_ZERO_ERROR; 3431 } 3432 3433 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3434 callerPattern->dumpPattern(); 3435 } 3436 3437 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3438 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3439 goto cleanupAndReturn; 3440 } 3441 3442 3443 // 3444 // Number of times find() should be called on the test string, default to 1 3445 // 3446 numFinds = 1; 3447 for (i=2; i<=9; i++) { 3448 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3449 if (numFinds != 1) { 3450 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3451 goto cleanupAndReturn; 3452 } 3453 numFinds = i; 3454 } 3455 } 3456 3457 // 'M' flag. Use matches() instead of find() 3458 if (flags.indexOf((UChar)0x4d) >= 0) { 3459 useMatchesFunc = TRUE; 3460 } 3461 if (flags.indexOf((UChar)0x4c) >= 0) { 3462 useLookingAtFunc = TRUE; 3463 } 3464 3465 // 3466 // Find the tags in the input data, remove them, and record the group boundary 3467 // positions. 
3468 // 3469 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3470 REGEX_CHECK_STATUS_L(line); 3471 3472 unEscapedInput = inputString.unescape(); 3473 parseMatcher = parsePat->matcher(unEscapedInput, status); 3474 REGEX_CHECK_STATUS_L(line); 3475 while(parseMatcher->find()) { 3476 parseMatcher->appendReplacement(deTaggedInput, "", status); 3477 REGEX_CHECK_STATUS; 3478 UnicodeString groupNum = parseMatcher->group(2, status); 3479 if (groupNum == "r") { 3480 // <r> or </r>, a region specification within the string 3481 if (parseMatcher->group(1, status) == "/") { 3482 regionEnd = deTaggedInput.length(); 3483 } else { 3484 regionStart = deTaggedInput.length(); 3485 } 3486 } else { 3487 // <digits> or </digits>, a group match boundary tag. 3488 if (parseMatcher->group(1, status) == "/") { 3489 set(groupEnds, deTaggedInput.length(), groupNum); 3490 } else { 3491 set(groupStarts, deTaggedInput.length(), groupNum); 3492 } 3493 } 3494 } 3495 parseMatcher->appendTail(deTaggedInput); 3496 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3497 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3498 errln("mismatched <r> tags"); 3499 failed = TRUE; 3500 goto cleanupAndReturn; 3501 } 3502 3503 // 3504 // Configure the matcher according to the flags specified with this test. 
3505 // 3506 matcher = callerPattern->matcher(deTaggedInput, status); 3507 REGEX_CHECK_STATUS_L(line); 3508 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3509 matcher->setTrace(TRUE); 3510 } 3511 3512 if (UTF8Pattern != NULL) { 3513 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3514 status = U_ZERO_ERROR; // buffer overflow 3515 inputChars = new char[inputUTF8Length+1]; 3516 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3517 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3518 3519 if (status == U_ZERO_ERROR) { 3520 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3521 REGEX_CHECK_STATUS_L(line); 3522 } 3523 3524 if (UTF8Matcher == NULL) { 3525 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3526 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3527 status = U_ZERO_ERROR; 3528 } 3529 } 3530 3531 // 3532 // Generate native indices for UTF8 versions of region and capture group info 3533 // 3534 if (UTF8Matcher != NULL) { 3535 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3536 UTF8Matcher->setTrace(TRUE); 3537 } 3538 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3539 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3540 3541 // Fill out the native index UVector info. 3542 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3543 for (i=0; i<groupStarts.size(); i++) { 3544 int32_t start = groupStarts.elementAti(i); 3545 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3546 if (start >= 0) { 3547 int32_t startUTF8; 3548 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3549 errln("Error at line %d: could not find native index for group start %d. 
UTF16 index %d", line, i, start); 3550 failed = TRUE; 3551 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3552 } 3553 setInt(groupStartsUTF8, startUTF8, i); 3554 } 3555 3556 int32_t end = groupEnds.elementAti(i); 3557 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3558 if (end >= 0) { 3559 int32_t endUTF8; 3560 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3561 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3562 failed = TRUE; 3563 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3564 } 3565 setInt(groupEndsUTF8, endUTF8, i); 3566 } 3567 } 3568 } 3569 3570 if (regionStart>=0) { 3571 matcher->region(regionStart, regionEnd, status); 3572 REGEX_CHECK_STATUS_L(line); 3573 if (UTF8Matcher != NULL) { 3574 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3575 REGEX_CHECK_STATUS_L(line); 3576 } 3577 } 3578 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3579 matcher->useAnchoringBounds(FALSE); 3580 if (UTF8Matcher != NULL) { 3581 UTF8Matcher->useAnchoringBounds(FALSE); 3582 } 3583 } 3584 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3585 matcher->useTransparentBounds(TRUE); 3586 if (UTF8Matcher != NULL) { 3587 UTF8Matcher->useTransparentBounds(TRUE); 3588 } 3589 } 3590 3591 3592 3593 // 3594 // Do a find on the de-tagged input using the caller's pattern 3595 // TODO: error on count>1 and not find(). 3596 // error on both matches() and lookingAt(). 
3597 // 3598 for (i=0; i<numFinds; i++) { 3599 if (useMatchesFunc) { 3600 isMatch = matcher->matches(status); 3601 if (UTF8Matcher != NULL) { 3602 isUTF8Match = UTF8Matcher->matches(status); 3603 } 3604 } else if (useLookingAtFunc) { 3605 isMatch = matcher->lookingAt(status); 3606 if (UTF8Matcher != NULL) { 3607 isUTF8Match = UTF8Matcher->lookingAt(status); 3608 } 3609 } else { 3610 isMatch = matcher->find(); 3611 if (UTF8Matcher != NULL) { 3612 isUTF8Match = UTF8Matcher->find(); 3613 } 3614 } 3615 } 3616 matcher->setTrace(FALSE); 3617 if (UTF8Matcher) { 3618 UTF8Matcher->setTrace(FALSE); 3619 } 3620 if (U_FAILURE(status)) { 3621 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); 3622 } 3623 3624 // 3625 // Match up the groups from the find() with the groups from the tags 3626 // 3627 3628 // number of tags should match number of groups from find operation. 3629 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3630 // G option in test means that capture group data is not available in the 3631 // expected results, so the check needs to be suppressed. 3632 if (isMatch == FALSE && groupStarts.size() != 0) { 3633 dataerrln("Error at line %d: Match expected, but none found.", line); 3634 failed = TRUE; 3635 goto cleanupAndReturn; 3636 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3637 errln("Error at line %d: Match expected, but none found. 
(UTF8)", line); 3638 failed = TRUE; 3639 goto cleanupAndReturn; 3640 } 3641 if (isMatch && groupStarts.size() == 0) { 3642 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status)); 3643 failed = TRUE; 3644 } 3645 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) { 3646 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status)); 3647 failed = TRUE; 3648 } 3649 3650 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3651 // Only check for match / no match. Don't check capture groups. 3652 goto cleanupAndReturn; 3653 } 3654 3655 REGEX_CHECK_STATUS_L(line); 3656 for (i=0; i<=matcher->groupCount(); i++) { 3657 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 3658 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3659 if (matcher->start(i, status) != expectedStart) { 3660 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3661 line, i, expectedStart, matcher->start(i, status)); 3662 failed = TRUE; 3663 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3664 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3665 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3666 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3667 failed = TRUE; 3668 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3669 } 3670 3671 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3672 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3673 if (matcher->end(i, status) != expectedEnd) { 3674 errln("Error at line %d: incorrect end position for group %d. 
Expected %d, got %d", 3675 line, i, expectedEnd, matcher->end(i, status)); 3676 failed = TRUE; 3677 // Error on end position; keep going; real error is probably yet to come as group 3678 // end positions work from end of the input data towards the front. 3679 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3680 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3681 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3682 failed = TRUE; 3683 // Error on end position; keep going; real error is probably yet to come as group 3684 // end positions work from end of the input data towards the front. 3685 } 3686 } 3687 if ( matcher->groupCount()+1 < groupStarts.size()) { 3688 errln("Error at line %d: Expected %d capture groups, found %d.", 3689 line, groupStarts.size()-1, matcher->groupCount()); 3690 failed = TRUE; 3691 } 3692 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3693 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3694 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3695 failed = TRUE; 3696 } 3697 3698 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3699 matcher->requireEnd() == TRUE) { 3700 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3701 failed = TRUE; 3702 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3703 UTF8Matcher->requireEnd() == TRUE) { 3704 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3705 failed = TRUE; 3706 } 3707 3708 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3709 matcher->requireEnd() == FALSE) { 3710 errln("Error at line %d: requireEnd() returned FALSE. 
Expected TRUE", line); 3711 failed = TRUE; 3712 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3713 UTF8Matcher->requireEnd() == FALSE) { 3714 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3715 failed = TRUE; 3716 } 3717 3718 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3719 matcher->hitEnd() == TRUE) { 3720 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3721 failed = TRUE; 3722 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3723 UTF8Matcher->hitEnd() == TRUE) { 3724 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); 3725 failed = TRUE; 3726 } 3727 3728 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3729 matcher->hitEnd() == FALSE) { 3730 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3731 failed = TRUE; 3732 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3733 UTF8Matcher->hitEnd() == FALSE) { 3734 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3735 failed = TRUE; 3736 } 3737 3738 3739 cleanupAndReturn: 3740 if (failed) { 3741 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3742 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3743 // callerPattern->dump(); 3744 } 3745 delete parseMatcher; 3746 delete parsePat; 3747 delete UTF8Matcher; 3748 delete UTF8Pattern; 3749 delete matcher; 3750 delete callerPattern; 3751 3752 utext_close(&inputText); 3753 delete[] inputChars; 3754 utext_close(&patternText); 3755 delete[] patternChars; 3756 ucnv_close(UTF8Converter); 3757 } 3758 3759 3760 3761 3762 //--------------------------------------------------------------------------- 3763 // 3764 // Errors Check for error handling in patterns. 
3765 // 3766 //--------------------------------------------------------------------------- 3767 void RegexTest::Errors() { 3768 // \escape sequences that aren't implemented yet. 3769 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3770 3771 // Missing close parentheses 3772 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3773 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3774 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3775 3776 // Extra close paren 3777 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3778 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3779 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3780 3781 // Look-ahead, Look-behind 3782 // TODO: add tests for unbounded length look-behinds. 3783 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3784 3785 // Attempt to use non-default flags 3786 { 3787 UParseError pe; 3788 UErrorCode status = U_ZERO_ERROR; 3789 int32_t flags = UREGEX_CANON_EQ | 3790 UREGEX_COMMENTS | UREGEX_DOTALL | 3791 UREGEX_MULTILINE; 3792 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3793 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3794 delete pat1; 3795 } 3796 3797 3798 // Quantifiers are allowed only after something that can be quantified. 
3799 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3800 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3801 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3802 3803 // Mal-formed {min,max} quantifiers 3804 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3805 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3806 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3807 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3808 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3809 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3810 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3811 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3812 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3813 3814 // Ticket 5389 3815 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3816 3817 // Invalid Back Reference \0 3818 // For ICU 3.8 and earlier 3819 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3820 // 3821 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3822 3823 } 3824 3825 3826 //------------------------------------------------------------------------------- 3827 // 3828 // Read a text data file, convert it to UChars, and return the data 3829 // in one big UChar * buffer, which the caller must delete. 3830 // 3831 //-------------------------------------------------------------------------------- 3832 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3833 const char *defEncoding, UErrorCode &status) { 3834 UChar *retPtr = NULL; 3835 char *fileBuf = NULL; 3836 UConverter* conv = NULL; 3837 FILE *f = NULL; 3838 3839 ulen = 0; 3840 if (U_FAILURE(status)) { 3841 return retPtr; 3842 } 3843 3844 // 3845 // Open the file. 
3846 // 3847 f = fopen(fileName, "rb"); 3848 if (f == 0) { 3849 dataerrln("Error opening test data file %s\n", fileName); 3850 status = U_FILE_ACCESS_ERROR; 3851 return NULL; 3852 } 3853 // 3854 // Read it in 3855 // 3856 int32_t fileSize; 3857 int32_t amt_read; 3858 3859 fseek( f, 0, SEEK_END); 3860 fileSize = ftell(f); 3861 fileBuf = new char[fileSize]; 3862 fseek(f, 0, SEEK_SET); 3863 amt_read = fread(fileBuf, 1, fileSize, f); 3864 if (amt_read != fileSize || fileSize <= 0) { 3865 errln("Error reading test data file."); 3866 goto cleanUpAndReturn; 3867 } 3868 3869 // 3870 // Look for a Unicode Signature (BOM) on the data just read 3871 // 3872 int32_t signatureLength; 3873 const char * fileBufC; 3874 const char* encoding; 3875 3876 fileBufC = fileBuf; 3877 encoding = ucnv_detectUnicodeSignature( 3878 fileBuf, fileSize, &signatureLength, &status); 3879 if(encoding!=NULL ){ 3880 fileBufC += signatureLength; 3881 fileSize -= signatureLength; 3882 } else { 3883 encoding = defEncoding; 3884 if (strcmp(encoding, "utf-8") == 0) { 3885 errln("file %s is missing its BOM", fileName); 3886 } 3887 } 3888 3889 // 3890 // Open a converter to take the rule file to UTF-16 3891 // 3892 conv = ucnv_open(encoding, &status); 3893 if (U_FAILURE(status)) { 3894 goto cleanUpAndReturn; 3895 } 3896 3897 // 3898 // Convert the rules to UChar. 3899 // Preflight first to determine required buffer size. 3900 // 3901 ulen = ucnv_toUChars(conv, 3902 NULL, // dest, 3903 0, // destCapacity, 3904 fileBufC, 3905 fileSize, 3906 &status); 3907 if (status == U_BUFFER_OVERFLOW_ERROR) { 3908 // Buffer Overflow is expected from the preflight operation. 
3909 status = U_ZERO_ERROR; 3910 3911 retPtr = new UChar[ulen+1]; 3912 ucnv_toUChars(conv, 3913 retPtr, // dest, 3914 ulen+1, 3915 fileBufC, 3916 fileSize, 3917 &status); 3918 } 3919 3920 cleanUpAndReturn: 3921 fclose(f); 3922 delete[] fileBuf; 3923 ucnv_close(conv); 3924 if (U_FAILURE(status)) { 3925 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3926 delete []retPtr; 3927 retPtr = 0; 3928 ulen = 0; 3929 }; 3930 return retPtr; 3931 } 3932 3933 3934 //------------------------------------------------------------------------------- 3935 // 3936 // PerlTests - Run Perl's regular expression tests 3937 // The input file for this test is re_tests, the standard regular 3938 // expression test data distributed with the Perl source code. 3939 // 3940 // Here is Perl's description of the test data file: 3941 // 3942 // # The tests are in a separate file 't/op/re_tests'. 3943 // # Each line in that file is a separate test. 3944 // # There are five columns, separated by tabs. 3945 // # 3946 // # Column 1 contains the pattern, optionally enclosed in C<''>. 3947 // # Modifiers can be put after the closing C<'>. 3948 // # 3949 // # Column 2 contains the string to be matched. 3950 // # 3951 // # Column 3 contains the expected result: 3952 // # y expect a match 3953 // # n expect no match 3954 // # c expect an error 3955 // # B test exposes a known bug in Perl, should be skipped 3956 // # b test exposes a known bug in Perl, should be skipped if noamp 3957 // # 3958 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 3959 // # 3960 // # Column 4 contains a string, usually C<$&>. 3961 // # 3962 // # Column 5 contains the expected result of double-quote 3963 // # interpolating that string after the match, or start of error message. 3964 // # 3965 // # Column 6, if present, contains a reason why the test is skipped. 3966 // # This is printed with "skipped", for harness to pick up. 
3967 // # 3968 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 3969 // # 3970 // # If you want to add a regular expression test that can't be expressed 3971 // # in this format, don't add it here: put it in op/pat.t instead. 3972 // 3973 // For ICU, if field 3 contains an 'i', the test will be skipped. 3974 // The test exposes is some known incompatibility between ICU and Perl regexps. 3975 // (The i is in addition to whatever was there before.) 3976 // 3977 //------------------------------------------------------------------------------- 3978 void RegexTest::PerlTests() { 3979 char tdd[2048]; 3980 const char *srcPath; 3981 UErrorCode status = U_ZERO_ERROR; 3982 UParseError pe; 3983 3984 // 3985 // Open and read the test data file. 3986 // 3987 srcPath=getPath(tdd, "re_tests.txt"); 3988 if(srcPath==NULL) { 3989 return; /* something went wrong, error already output */ 3990 } 3991 3992 int32_t len; 3993 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3994 if (U_FAILURE(status)) { 3995 return; /* something went wrong, error already output */ 3996 } 3997 3998 // 3999 // Put the test data into a UnicodeString 4000 // 4001 UnicodeString testDataString(FALSE, testData, len); 4002 4003 // 4004 // Regex to break the input file into lines, and strip the new lines. 4005 // One line per match, capture group one is the desired data. 4006 // 4007 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4008 if (U_FAILURE(status)) { 4009 dataerrln("RegexPattern::compile() error"); 4010 return; 4011 } 4012 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4013 4014 // 4015 // Regex to split a test file line into fields. 4016 // There are six fields, separated by tabs. 4017 // 4018 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4019 4020 // 4021 // Regex to identify test patterns with flag settings, and to separate them. 
4022 // Test patterns with flags look like 'pattern'i 4023 // Test patterns without flags are not quoted: pattern 4024 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4025 // 4026 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4027 RegexMatcher* flagMat = flagPat->matcher(status); 4028 4029 // 4030 // The Perl tests reference several perl-isms, which are evaluated/substituted 4031 // in the test data. Not being perl, this must be done explicitly. Here 4032 // are string constants and REs for these constructs. 4033 // 4034 UnicodeString nulnulSrc("${nulnul}"); 4035 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4036 nulnul = nulnul.unescape(); 4037 4038 UnicodeString ffffSrc("${ffff}"); 4039 UnicodeString ffff("\\uffff", -1, US_INV); 4040 ffff = ffff.unescape(); 4041 4042 // regexp for $-[0], $+[2], etc. 4043 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4044 RegexMatcher *groupsMat = groupsPat->matcher(status); 4045 4046 // regexp for $0, $1, $2, etc. 4047 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4048 RegexMatcher *cgMat = cgPat->matcher(status); 4049 4050 4051 // 4052 // Main Loop for the Perl Tests, runs once per line from the 4053 // test data file. 4054 // 4055 int32_t lineNum = 0; 4056 int32_t skippedUnimplementedCount = 0; 4057 while (lineMat->find()) { 4058 lineNum++; 4059 4060 // 4061 // Get a line, break it into its fields, do the Perl 4062 // variable substitutions. 
4063 // 4064 UnicodeString line = lineMat->group(1, status); 4065 UnicodeString fields[7]; 4066 fieldPat->split(line, fields, 7, status); 4067 4068 flagMat->reset(fields[0]); 4069 flagMat->matches(status); 4070 UnicodeString pattern = flagMat->group(2, status); 4071 pattern.findAndReplace("${bang}", "!"); 4072 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4073 pattern.findAndReplace(ffffSrc, ffff); 4074 4075 // 4076 // Identify patterns that include match flag settings, 4077 // split off the flags, remove the extra quotes. 4078 // 4079 UnicodeString flagStr = flagMat->group(3, status); 4080 if (U_FAILURE(status)) { 4081 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4082 return; 4083 } 4084 int32_t flags = 0; 4085 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4086 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4087 const UChar UChar_m = 0x6d; 4088 const UChar UChar_x = 0x78; 4089 const UChar UChar_y = 0x79; 4090 if (flagStr.indexOf(UChar_i) != -1) { 4091 flags |= UREGEX_CASE_INSENSITIVE; 4092 } 4093 if (flagStr.indexOf(UChar_m) != -1) { 4094 flags |= UREGEX_MULTILINE; 4095 } 4096 if (flagStr.indexOf(UChar_x) != -1) { 4097 flags |= UREGEX_COMMENTS; 4098 } 4099 4100 // 4101 // Compile the test pattern. 4102 // 4103 status = U_ZERO_ERROR; 4104 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4105 if (status == U_REGEX_UNIMPLEMENTED) { 4106 // 4107 // Test of a feature that is planned for ICU, but not yet implemented. 4108 // skip the test. 4109 skippedUnimplementedCount++; 4110 delete testPat; 4111 status = U_ZERO_ERROR; 4112 continue; 4113 } 4114 4115 if (U_FAILURE(status)) { 4116 // Some tests are supposed to generate errors. 4117 // Only report an error for tests that are supposed to succeed. 
4118 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4119 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4120 { 4121 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4122 } 4123 status = U_ZERO_ERROR; 4124 delete testPat; 4125 continue; 4126 } 4127 4128 if (fields[2].indexOf(UChar_i) >= 0) { 4129 // ICU should skip this test. 4130 delete testPat; 4131 continue; 4132 } 4133 4134 if (fields[2].indexOf(UChar_c) >= 0) { 4135 // This pattern should have caused a compilation error, but didn't/ 4136 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4137 delete testPat; 4138 continue; 4139 } 4140 4141 // 4142 // replace the Perl variables that appear in some of the 4143 // match data strings. 4144 // 4145 UnicodeString matchString = fields[1]; 4146 matchString.findAndReplace(nulnulSrc, nulnul); 4147 matchString.findAndReplace(ffffSrc, ffff); 4148 4149 // Replace any \n in the match string with an actual new-line char. 4150 // Don't do full unescape, as this unescapes more than Perl does, which 4151 // causes other spurious failures in the tests. 4152 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4153 4154 4155 4156 // 4157 // Run the test, check for expected match/don't match result. 4158 // 4159 RegexMatcher *testMat = testPat->matcher(matchString, status); 4160 UBool found = testMat->find(); 4161 UBool expected = FALSE; 4162 if (fields[2].indexOf(UChar_y) >=0) { 4163 expected = TRUE; 4164 } 4165 if (expected != found) { 4166 errln("line %d: Expected %smatch, got %smatch", 4167 lineNum, expected?"":"no ", found?"":"no " ); 4168 continue; 4169 } 4170 4171 // Don't try to check expected results if there is no match. 
4172 // (Some have stuff in the expected fields) 4173 if (!found) { 4174 delete testMat; 4175 delete testPat; 4176 continue; 4177 } 4178 4179 // 4180 // Interpret the Perl expression from the fourth field of the data file, 4181 // building up an ICU string from the results of the ICU match. 4182 // The Perl expression will contain references to the results of 4183 // a regex match, including the matched string, capture group strings, 4184 // group starting and ending indicies, etc. 4185 // 4186 UnicodeString resultString; 4187 UnicodeString perlExpr = fields[3]; 4188 #if SUPPORT_MUTATING_INPUT_STRING 4189 groupsMat->reset(perlExpr); 4190 cgMat->reset(perlExpr); 4191 #endif 4192 4193 while (perlExpr.length() > 0) { 4194 #if !SUPPORT_MUTATING_INPUT_STRING 4195 // Perferred usage. Reset after any modification to input string. 4196 groupsMat->reset(perlExpr); 4197 cgMat->reset(perlExpr); 4198 #endif 4199 4200 if (perlExpr.startsWith("$&")) { 4201 resultString.append(testMat->group(status)); 4202 perlExpr.remove(0, 2); 4203 } 4204 4205 else if (groupsMat->lookingAt(status)) { 4206 // $-[0] $+[2] etc. 4207 UnicodeString digitString = groupsMat->group(2, status); 4208 int32_t t = 0; 4209 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4210 UnicodeString plusOrMinus = groupsMat->group(1, status); 4211 int32_t matchPosition; 4212 if (plusOrMinus.compare("+") == 0) { 4213 matchPosition = testMat->end(groupNum, status); 4214 } else { 4215 matchPosition = testMat->start(groupNum, status); 4216 } 4217 if (matchPosition != -1) { 4218 ICU_Utility::appendNumber(resultString, matchPosition); 4219 } 4220 perlExpr.remove(0, groupsMat->end(status)); 4221 } 4222 4223 else if (cgMat->lookingAt(status)) { 4224 // $1, $2, $3, etc. 
4225 UnicodeString digitString = cgMat->group(1, status); 4226 int32_t t = 0; 4227 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4228 if (U_SUCCESS(status)) { 4229 resultString.append(testMat->group(groupNum, status)); 4230 status = U_ZERO_ERROR; 4231 } 4232 perlExpr.remove(0, cgMat->end(status)); 4233 } 4234 4235 else if (perlExpr.startsWith("@-")) { 4236 int32_t i; 4237 for (i=0; i<=testMat->groupCount(); i++) { 4238 if (i>0) { 4239 resultString.append(" "); 4240 } 4241 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4242 } 4243 perlExpr.remove(0, 2); 4244 } 4245 4246 else if (perlExpr.startsWith("@+")) { 4247 int32_t i; 4248 for (i=0; i<=testMat->groupCount(); i++) { 4249 if (i>0) { 4250 resultString.append(" "); 4251 } 4252 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4253 } 4254 perlExpr.remove(0, 2); 4255 } 4256 4257 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4258 // or as an escaped sequence (e.g. \n) 4259 if (perlExpr.length() > 1) { 4260 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4261 } 4262 UChar c = perlExpr.charAt(0); 4263 switch (c) { 4264 case 'n': c = '\n'; break; 4265 // add any other escape sequences that show up in the test expected results. 4266 } 4267 resultString.append(c); 4268 perlExpr.remove(0, 1); 4269 } 4270 4271 else { 4272 // Any characters from the perl expression that we don't explicitly 4273 // recognize before here are assumed to be literals and copied 4274 // as-is to the expected results. 
4275 resultString.append(perlExpr.charAt(0)); 4276 perlExpr.remove(0, 1); 4277 } 4278 4279 if (U_FAILURE(status)) { 4280 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4281 break; 4282 } 4283 } 4284 4285 // 4286 // Expected Results Compare 4287 // 4288 UnicodeString expectedS(fields[4]); 4289 expectedS.findAndReplace(nulnulSrc, nulnul); 4290 expectedS.findAndReplace(ffffSrc, ffff); 4291 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4292 4293 4294 if (expectedS.compare(resultString) != 0) { 4295 err("Line %d: Incorrect perl expression results.", lineNum); 4296 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4297 } 4298 4299 delete testMat; 4300 delete testPat; 4301 } 4302 4303 // 4304 // All done. Clean up allocated stuff. 4305 // 4306 delete cgMat; 4307 delete cgPat; 4308 4309 delete groupsMat; 4310 delete groupsPat; 4311 4312 delete flagMat; 4313 delete flagPat; 4314 4315 delete lineMat; 4316 delete linePat; 4317 4318 delete fieldPat; 4319 delete [] testData; 4320 4321 4322 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4323 4324 } 4325 4326 4327 //------------------------------------------------------------------------------- 4328 // 4329 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4330 // (instead of using UnicodeStrings) to test the alternate engine. 4331 // The input file for this test is re_tests, the standard regular 4332 // expression test data distributed with the Perl source code. 4333 // See PerlTests() for more information. 
4334 // 4335 //------------------------------------------------------------------------------- 4336 void RegexTest::PerlTestsUTF8() { 4337 char tdd[2048]; 4338 const char *srcPath; 4339 UErrorCode status = U_ZERO_ERROR; 4340 UParseError pe; 4341 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4342 UText patternText = UTEXT_INITIALIZER; 4343 char *patternChars = NULL; 4344 int32_t patternLength; 4345 int32_t patternCapacity = 0; 4346 UText inputText = UTEXT_INITIALIZER; 4347 char *inputChars = NULL; 4348 int32_t inputLength; 4349 int32_t inputCapacity = 0; 4350 4351 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4352 4353 // 4354 // Open and read the test data file. 4355 // 4356 srcPath=getPath(tdd, "re_tests.txt"); 4357 if(srcPath==NULL) { 4358 return; /* something went wrong, error already output */ 4359 } 4360 4361 int32_t len; 4362 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4363 if (U_FAILURE(status)) { 4364 return; /* something went wrong, error already output */ 4365 } 4366 4367 // 4368 // Put the test data into a UnicodeString 4369 // 4370 UnicodeString testDataString(FALSE, testData, len); 4371 4372 // 4373 // Regex to break the input file into lines, and strip the new lines. 4374 // One line per match, capture group one is the desired data. 4375 // 4376 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4377 if (U_FAILURE(status)) { 4378 dataerrln("RegexPattern::compile() error"); 4379 return; 4380 } 4381 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4382 4383 // 4384 // Regex to split a test file line into fields. 4385 // There are six fields, separated by tabs. 4386 // 4387 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4388 4389 // 4390 // Regex to identify test patterns with flag settings, and to separate them. 
4391 // Test patterns with flags look like 'pattern'i 4392 // Test patterns without flags are not quoted: pattern 4393 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4394 // 4395 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4396 RegexMatcher* flagMat = flagPat->matcher(status); 4397 4398 // 4399 // The Perl tests reference several perl-isms, which are evaluated/substituted 4400 // in the test data. Not being perl, this must be done explicitly. Here 4401 // are string constants and REs for these constructs. 4402 // 4403 UnicodeString nulnulSrc("${nulnul}"); 4404 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4405 nulnul = nulnul.unescape(); 4406 4407 UnicodeString ffffSrc("${ffff}"); 4408 UnicodeString ffff("\\uffff", -1, US_INV); 4409 ffff = ffff.unescape(); 4410 4411 // regexp for $-[0], $+[2], etc. 4412 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4413 RegexMatcher *groupsMat = groupsPat->matcher(status); 4414 4415 // regexp for $0, $1, $2, etc. 4416 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4417 RegexMatcher *cgMat = cgPat->matcher(status); 4418 4419 4420 // 4421 // Main Loop for the Perl Tests, runs once per line from the 4422 // test data file. 4423 // 4424 int32_t lineNum = 0; 4425 int32_t skippedUnimplementedCount = 0; 4426 while (lineMat->find()) { 4427 lineNum++; 4428 4429 // 4430 // Get a line, break it into its fields, do the Perl 4431 // variable substitutions. 
4432 // 4433 UnicodeString line = lineMat->group(1, status); 4434 UnicodeString fields[7]; 4435 fieldPat->split(line, fields, 7, status); 4436 4437 flagMat->reset(fields[0]); 4438 flagMat->matches(status); 4439 UnicodeString pattern = flagMat->group(2, status); 4440 pattern.findAndReplace("${bang}", "!"); 4441 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4442 pattern.findAndReplace(ffffSrc, ffff); 4443 4444 // 4445 // Identify patterns that include match flag settings, 4446 // split off the flags, remove the extra quotes. 4447 // 4448 UnicodeString flagStr = flagMat->group(3, status); 4449 if (U_FAILURE(status)) { 4450 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4451 return; 4452 } 4453 int32_t flags = 0; 4454 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4455 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4456 const UChar UChar_m = 0x6d; 4457 const UChar UChar_x = 0x78; 4458 const UChar UChar_y = 0x79; 4459 if (flagStr.indexOf(UChar_i) != -1) { 4460 flags |= UREGEX_CASE_INSENSITIVE; 4461 } 4462 if (flagStr.indexOf(UChar_m) != -1) { 4463 flags |= UREGEX_MULTILINE; 4464 } 4465 if (flagStr.indexOf(UChar_x) != -1) { 4466 flags |= UREGEX_COMMENTS; 4467 } 4468 4469 // 4470 // Put the pattern in a UTF-8 UText 4471 // 4472 status = U_ZERO_ERROR; 4473 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4474 if (status == U_BUFFER_OVERFLOW_ERROR) { 4475 status = U_ZERO_ERROR; 4476 delete[] patternChars; 4477 patternCapacity = patternLength + 1; 4478 patternChars = new char[patternCapacity]; 4479 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4480 } 4481 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4482 4483 // 4484 // Compile the test pattern. 
4485 // 4486 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4487 if (status == U_REGEX_UNIMPLEMENTED) { 4488 // 4489 // Test of a feature that is planned for ICU, but not yet implemented. 4490 // skip the test. 4491 skippedUnimplementedCount++; 4492 delete testPat; 4493 status = U_ZERO_ERROR; 4494 continue; 4495 } 4496 4497 if (U_FAILURE(status)) { 4498 // Some tests are supposed to generate errors. 4499 // Only report an error for tests that are supposed to succeed. 4500 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4501 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4502 { 4503 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4504 } 4505 status = U_ZERO_ERROR; 4506 delete testPat; 4507 continue; 4508 } 4509 4510 if (fields[2].indexOf(UChar_i) >= 0) { 4511 // ICU should skip this test. 4512 delete testPat; 4513 continue; 4514 } 4515 4516 if (fields[2].indexOf(UChar_c) >= 0) { 4517 // This pattern should have caused a compilation error, but didn't/ 4518 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4519 delete testPat; 4520 continue; 4521 } 4522 4523 4524 // 4525 // replace the Perl variables that appear in some of the 4526 // match data strings. 4527 // 4528 UnicodeString matchString = fields[1]; 4529 matchString.findAndReplace(nulnulSrc, nulnul); 4530 matchString.findAndReplace(ffffSrc, ffff); 4531 4532 // Replace any \n in the match string with an actual new-line char. 4533 // Don't do full unescape, as this unescapes more than Perl does, which 4534 // causes other spurious failures in the tests. 
4535 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4536 4537 // 4538 // Put the input in a UTF-8 UText 4539 // 4540 status = U_ZERO_ERROR; 4541 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4542 if (status == U_BUFFER_OVERFLOW_ERROR) { 4543 status = U_ZERO_ERROR; 4544 delete[] inputChars; 4545 inputCapacity = inputLength + 1; 4546 inputChars = new char[inputCapacity]; 4547 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4548 } 4549 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4550 4551 // 4552 // Run the test, check for expected match/don't match result. 4553 // 4554 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4555 UBool found = testMat->find(); 4556 UBool expected = FALSE; 4557 if (fields[2].indexOf(UChar_y) >=0) { 4558 expected = TRUE; 4559 } 4560 if (expected != found) { 4561 errln("line %d: Expected %smatch, got %smatch", 4562 lineNum, expected?"":"no ", found?"":"no " ); 4563 continue; 4564 } 4565 4566 // Don't try to check expected results if there is no match. 4567 // (Some have stuff in the expected fields) 4568 if (!found) { 4569 delete testMat; 4570 delete testPat; 4571 continue; 4572 } 4573 4574 // 4575 // Interpret the Perl expression from the fourth field of the data file, 4576 // building up an ICU string from the results of the ICU match. 4577 // The Perl expression will contain references to the results of 4578 // a regex match, including the matched string, capture group strings, 4579 // group starting and ending indicies, etc. 
4580 // 4581 UnicodeString resultString; 4582 UnicodeString perlExpr = fields[3]; 4583 4584 while (perlExpr.length() > 0) { 4585 groupsMat->reset(perlExpr); 4586 cgMat->reset(perlExpr); 4587 4588 if (perlExpr.startsWith("$&")) { 4589 resultString.append(testMat->group(status)); 4590 perlExpr.remove(0, 2); 4591 } 4592 4593 else if (groupsMat->lookingAt(status)) { 4594 // $-[0] $+[2] etc. 4595 UnicodeString digitString = groupsMat->group(2, status); 4596 int32_t t = 0; 4597 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4598 UnicodeString plusOrMinus = groupsMat->group(1, status); 4599 int32_t matchPosition; 4600 if (plusOrMinus.compare("+") == 0) { 4601 matchPosition = testMat->end(groupNum, status); 4602 } else { 4603 matchPosition = testMat->start(groupNum, status); 4604 } 4605 if (matchPosition != -1) { 4606 ICU_Utility::appendNumber(resultString, matchPosition); 4607 } 4608 perlExpr.remove(0, groupsMat->end(status)); 4609 } 4610 4611 else if (cgMat->lookingAt(status)) { 4612 // $1, $2, $3, etc. 4613 UnicodeString digitString = cgMat->group(1, status); 4614 int32_t t = 0; 4615 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4616 if (U_SUCCESS(status)) { 4617 resultString.append(testMat->group(groupNum, status)); 4618 status = U_ZERO_ERROR; 4619 } 4620 perlExpr.remove(0, cgMat->end(status)); 4621 } 4622 4623 else if (perlExpr.startsWith("@-")) { 4624 int32_t i; 4625 for (i=0; i<=testMat->groupCount(); i++) { 4626 if (i>0) { 4627 resultString.append(" "); 4628 } 4629 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4630 } 4631 perlExpr.remove(0, 2); 4632 } 4633 4634 else if (perlExpr.startsWith("@+")) { 4635 int32_t i; 4636 for (i=0; i<=testMat->groupCount(); i++) { 4637 if (i>0) { 4638 resultString.append(" "); 4639 } 4640 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4641 } 4642 perlExpr.remove(0, 2); 4643 } 4644 4645 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4646 // or as an escaped sequence (e.g. \n) 4647 if (perlExpr.length() > 1) { 4648 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4649 } 4650 UChar c = perlExpr.charAt(0); 4651 switch (c) { 4652 case 'n': c = '\n'; break; 4653 // add any other escape sequences that show up in the test expected results. 4654 } 4655 resultString.append(c); 4656 perlExpr.remove(0, 1); 4657 } 4658 4659 else { 4660 // Any characters from the perl expression that we don't explicitly 4661 // recognize before here are assumed to be literals and copied 4662 // as-is to the expected results. 4663 resultString.append(perlExpr.charAt(0)); 4664 perlExpr.remove(0, 1); 4665 } 4666 4667 if (U_FAILURE(status)) { 4668 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4669 break; 4670 } 4671 } 4672 4673 // 4674 // Expected Results Compare 4675 // 4676 UnicodeString expectedS(fields[4]); 4677 expectedS.findAndReplace(nulnulSrc, nulnul); 4678 expectedS.findAndReplace(ffffSrc, ffff); 4679 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4680 4681 4682 if (expectedS.compare(resultString) != 0) { 4683 err("Line %d: Incorrect perl expression results.", lineNum); 4684 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4685 } 4686 4687 delete testMat; 4688 delete testPat; 4689 } 4690 4691 // 4692 // All done. Clean up allocated stuff. 
4693 // 4694 delete cgMat; 4695 delete cgPat; 4696 4697 delete groupsMat; 4698 delete groupsPat; 4699 4700 delete flagMat; 4701 delete flagPat; 4702 4703 delete lineMat; 4704 delete linePat; 4705 4706 delete fieldPat; 4707 delete [] testData; 4708 4709 utext_close(&patternText); 4710 utext_close(&inputText); 4711 4712 delete [] patternChars; 4713 delete [] inputChars; 4714 4715 4716 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4717 4718 } 4719 4720 4721 //-------------------------------------------------------------- 4722 // 4723 // Bug6149 Verify limits to heap expansion for backtrack stack. 4724 // Use this pattern, 4725 // "(a?){1,8000000}" 4726 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled. 4727 // This test is likely to be fragile, as further optimizations stop 4728 // more cases of pointless looping in the match engine. 4729 // 4730 //--------------------------------------------------------------- 4731 void RegexTest::Bug6149() { 4732 UnicodeString pattern("(a?){1,8000000}"); 4733 UnicodeString s("xyz"); 4734 uint32_t flags = 0; 4735 UErrorCode status = U_ZERO_ERROR; 4736 4737 RegexMatcher matcher(pattern, s, flags, status); 4738 UBool result = false; 4739 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4740 REGEX_ASSERT(result == FALSE); 4741 } 4742 4743 4744 // 4745 // Callbacks() Test the callback function. 4746 // When set, callbacks occur periodically during matching operations, 4747 // giving the application code the ability to abort the operation 4748 // before it's normal completion. 
4749 // 4750 4751 struct callBackContext { 4752 RegexTest *test; 4753 int32_t maxCalls; 4754 int32_t numCalls; 4755 int32_t lastSteps; 4756 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4757 }; 4758 4759 U_CDECL_BEGIN 4760 static UBool U_CALLCONV 4761 testCallBackFn(const void *context, int32_t steps) { 4762 callBackContext *info = (callBackContext *)context; 4763 if (info->lastSteps+1 != steps) { 4764 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 4765 } 4766 info->lastSteps = steps; 4767 info->numCalls++; 4768 return (info->numCalls < info->maxCalls); 4769 } 4770 U_CDECL_END 4771 4772 void RegexTest::Callbacks() { 4773 { 4774 // Getter returns NULLs if no callback has been set 4775 4776 // The variables that the getter will fill in. 4777 // Init to non-null values so that the action of the getter can be seen. 4778 const void *returnedContext = &returnedContext; 4779 URegexMatchCallback *returnedFn = &testCallBackFn; 4780 4781 UErrorCode status = U_ZERO_ERROR; 4782 RegexMatcher matcher("x", 0, status); 4783 REGEX_CHECK_STATUS; 4784 matcher.getMatchCallback(returnedFn, returnedContext, status); 4785 REGEX_CHECK_STATUS; 4786 REGEX_ASSERT(returnedFn == NULL); 4787 REGEX_ASSERT(returnedContext == NULL); 4788 } 4789 4790 { 4791 // Set and Get work 4792 callBackContext cbInfo = {this, 0, 0, 0}; 4793 const void *returnedContext; 4794 URegexMatchCallback *returnedFn; 4795 UErrorCode status = U_ZERO_ERROR; 4796 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 
4797 REGEX_CHECK_STATUS; 4798 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4799 REGEX_CHECK_STATUS; 4800 matcher.getMatchCallback(returnedFn, returnedContext, status); 4801 REGEX_CHECK_STATUS; 4802 REGEX_ASSERT(returnedFn == testCallBackFn); 4803 REGEX_ASSERT(returnedContext == &cbInfo); 4804 4805 // A short-running match shouldn't invoke the callback 4806 status = U_ZERO_ERROR; 4807 cbInfo.reset(1); 4808 UnicodeString s = "xxx"; 4809 matcher.reset(s); 4810 REGEX_ASSERT(matcher.matches(status)); 4811 REGEX_CHECK_STATUS; 4812 REGEX_ASSERT(cbInfo.numCalls == 0); 4813 4814 // A medium-length match that runs long enough to invoke the 4815 // callback, but not so long that the callback aborts it. 4816 status = U_ZERO_ERROR; 4817 cbInfo.reset(4); 4818 s = "aaaaaaaaaaaaaaaaaaab"; 4819 matcher.reset(s); 4820 REGEX_ASSERT(matcher.matches(status)==FALSE); 4821 REGEX_CHECK_STATUS; 4822 REGEX_ASSERT(cbInfo.numCalls > 0); 4823 4824 // A longer running match that the callback function will abort. 4825 status = U_ZERO_ERROR; 4826 cbInfo.reset(4); 4827 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4828 matcher.reset(s); 4829 REGEX_ASSERT(matcher.matches(status)==FALSE); 4830 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4831 REGEX_ASSERT(cbInfo.numCalls == 4); 4832 4833 // A longer running find that the callback function will abort. 4834 status = U_ZERO_ERROR; 4835 cbInfo.reset(4); 4836 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4837 matcher.reset(s); 4838 REGEX_ASSERT(matcher.find(status)==FALSE); 4839 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4840 REGEX_ASSERT(cbInfo.numCalls == 4); 4841 } 4842 4843 4844 } 4845 4846 4847 // 4848 // FindProgressCallbacks() Test the find "progress" callback function. 4849 // When set, the find progress callback will be invoked during a find operations 4850 // after each return from a match attempt, giving the application the opportunity 4851 // to terminate a long-running find operation before it's normal completion. 
4852 // 4853 4854 struct progressCallBackContext { 4855 RegexTest *test; 4856 int64_t lastIndex; 4857 int32_t maxCalls; 4858 int32_t numCalls; 4859 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; 4860 }; 4861 4862 // call-back function for find(). 4863 // Return TRUE to continue the find(). 4864 // Return FALSE to stop the find(). 4865 U_CDECL_BEGIN 4866 static UBool U_CALLCONV 4867 testProgressCallBackFn(const void *context, int64_t matchIndex) { 4868 progressCallBackContext *info = (progressCallBackContext *)context; 4869 info->numCalls++; 4870 info->lastIndex = matchIndex; 4871 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls); 4872 return (info->numCalls < info->maxCalls); 4873 } 4874 U_CDECL_END 4875 4876 void RegexTest::FindProgressCallbacks() { 4877 { 4878 // Getter returns NULLs if no callback has been set 4879 4880 // The variables that the getter will fill in. 4881 // Init to non-null values so that the action of the getter can be seen. 
4882 const void *returnedContext = &returnedContext; 4883 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; 4884 4885 UErrorCode status = U_ZERO_ERROR; 4886 RegexMatcher matcher("x", 0, status); 4887 REGEX_CHECK_STATUS; 4888 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4889 REGEX_CHECK_STATUS; 4890 REGEX_ASSERT(returnedFn == NULL); 4891 REGEX_ASSERT(returnedContext == NULL); 4892 } 4893 4894 { 4895 // Set and Get work 4896 progressCallBackContext cbInfo = {this, 0, 0, 0}; 4897 const void *returnedContext; 4898 URegexFindProgressCallback *returnedFn; 4899 UErrorCode status = U_ZERO_ERROR; 4900 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); 4901 REGEX_CHECK_STATUS; 4902 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); 4903 REGEX_CHECK_STATUS; 4904 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4905 REGEX_CHECK_STATUS; 4906 REGEX_ASSERT(returnedFn == testProgressCallBackFn); 4907 REGEX_ASSERT(returnedContext == &cbInfo); 4908 4909 // A find that matches on the initial position does NOT invoke the callback. 4910 status = U_ZERO_ERROR; 4911 cbInfo.reset(100); 4912 UnicodeString s = "aaxxx"; 4913 matcher.reset(s); 4914 #if 0 4915 matcher.setTrace(TRUE); 4916 #endif 4917 REGEX_ASSERT(matcher.find(0, status)); 4918 REGEX_CHECK_STATUS; 4919 REGEX_ASSERT(cbInfo.numCalls == 0); 4920 4921 // A medium running find() that causes matcher.find() to invoke our callback for each index, 4922 // but not so many times that we interrupt the operation. 
4923 status = U_ZERO_ERROR; 4924 s = "aaaaaaaaaaaaaaaaaaab"; 4925 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string 4926 matcher.reset(s); 4927 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4928 REGEX_CHECK_STATUS; 4929 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); 4930 4931 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. 4932 status = U_ZERO_ERROR; 4933 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; 4934 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string 4935 matcher.reset(s1); 4936 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4937 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4938 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); 4939 4940 // Now a match that will succeed, but after an interruption 4941 status = U_ZERO_ERROR; 4942 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; 4943 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string 4944 matcher.reset(s2); 4945 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4946 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4947 // Now retry the match from where left off 4948 cbInfo.maxCalls = 100; // No callback limit 4949 status = U_ZERO_ERROR; 4950 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); 4951 REGEX_CHECK_STATUS; 4952 } 4953 4954 4955 } 4956 4957 4958 //--------------------------------------------------------------------------- 4959 // 4960 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 4961 // UTexts. The pure-C implementation of UText 4962 // has no mutable backing stores, but we can 4963 // use UnicodeString here to test the functionality. 
4964 // 4965 //--------------------------------------------------------------------------- 4966 void RegexTest::PreAllocatedUTextCAPI () { 4967 UErrorCode status = U_ZERO_ERROR; 4968 URegularExpression *re; 4969 UText patternText = UTEXT_INITIALIZER; 4970 UnicodeString buffer; 4971 UText bufferText = UTEXT_INITIALIZER; 4972 4973 utext_openUnicodeString(&bufferText, &buffer, &status); 4974 4975 /* 4976 * getText() and getUText() 4977 */ 4978 { 4979 UText text1 = UTEXT_INITIALIZER; 4980 UText text2 = UTEXT_INITIALIZER; 4981 UChar text2Chars[20]; 4982 UText *resultText; 4983 4984 status = U_ZERO_ERROR; 4985 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 4986 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 4987 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 4988 utext_openUChars(&text2, text2Chars, -1, &status); 4989 4990 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 4991 re = uregex_openUText(&patternText, 0, NULL, &status); 4992 4993 /* First set a UText */ 4994 uregex_setUText(re, &text1, &status); 4995 resultText = uregex_getUText(re, &bufferText, &status); 4996 REGEX_CHECK_STATUS; 4997 REGEX_ASSERT(resultText == &bufferText); 4998 utext_setNativeIndex(resultText, 0); 4999 utext_setNativeIndex(&text1, 0); 5000 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5001 5002 resultText = uregex_getUText(re, &bufferText, &status); 5003 REGEX_CHECK_STATUS; 5004 REGEX_ASSERT(resultText == &bufferText); 5005 utext_setNativeIndex(resultText, 0); 5006 utext_setNativeIndex(&text1, 0); 5007 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5008 5009 /* Then set a UChar * */ 5010 uregex_setText(re, text2Chars, 7, &status); 5011 resultText = uregex_getUText(re, &bufferText, &status); 5012 REGEX_CHECK_STATUS; 5013 REGEX_ASSERT(resultText == &bufferText); 5014 utext_setNativeIndex(resultText, 0); 5015 utext_setNativeIndex(&text2, 0); 5016 REGEX_ASSERT(testUTextEqual(resultText, &text2)); 5017 5018 uregex_close(re); 5019 
utext_close(&text1); 5020 utext_close(&text2); 5021 } 5022 5023 /* 5024 * group() 5025 */ 5026 { 5027 UChar text1[80]; 5028 UText *actual; 5029 UBool result; 5030 int64_t length = 0; 5031 5032 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); 5033 // 012345678901234567890123456789012345678901234567 5034 // 0 1 2 3 4 5035 5036 status = U_ZERO_ERROR; 5037 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 5038 REGEX_CHECK_STATUS; 5039 5040 uregex_setText(re, text1, -1, &status); 5041 result = uregex_find(re, 0, &status); 5042 REGEX_ASSERT(result==TRUE); 5043 5044 /* Capture Group 0, the full match. Should succeed. "abc interior def" */ 5045 status = U_ZERO_ERROR; 5046 actual = uregex_groupUText(re, 0, &bufferText, &length, &status); 5047 REGEX_CHECK_STATUS; 5048 REGEX_ASSERT(actual == &bufferText); 5049 REGEX_ASSERT(utext_getNativeIndex(actual) == 6); 5050 REGEX_ASSERT(length == 16); 5051 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5052 5053 /* Capture group #1. Should succeed, matching " interior ". */ 5054 status = U_ZERO_ERROR; 5055 actual = uregex_groupUText(re, 1, &bufferText, &length, &status); 5056 REGEX_CHECK_STATUS; 5057 REGEX_ASSERT(actual == &bufferText); 5058 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " 5059 REGEX_ASSERT(length == 10); 5060 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5061 5062 /* Capture group out of range. Error. 
*/ 5063 status = U_ZERO_ERROR; 5064 actual = uregex_groupUText(re, 2, &bufferText, &length, &status); 5065 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5066 REGEX_ASSERT(actual == &bufferText); 5067 uregex_close(re); 5068 5069 } 5070 5071 /* 5072 * replaceFirst() 5073 */ 5074 { 5075 UChar text1[80]; 5076 UChar text2[80]; 5077 UText replText = UTEXT_INITIALIZER; 5078 UText *result; 5079 status = U_ZERO_ERROR; 5080 utext_openUnicodeString(&bufferText, &buffer, &status); 5081 5082 status = U_ZERO_ERROR; 5083 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); 5084 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); 5085 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5086 5087 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5088 REGEX_CHECK_STATUS; 5089 5090 /* Normal case, with match */ 5091 uregex_setText(re, text1, -1, &status); 5092 REGEX_CHECK_STATUS; 5093 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5094 REGEX_CHECK_STATUS; 5095 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5096 REGEX_CHECK_STATUS; 5097 REGEX_ASSERT(result == &bufferText); 5098 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5099 5100 /* No match. Text should copy to output with no changes. 
*/ 5101 uregex_setText(re, text2, -1, &status); 5102 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5103 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5104 REGEX_CHECK_STATUS; 5105 REGEX_ASSERT(result == &bufferText); 5106 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5107 5108 /* Unicode escapes */ 5109 uregex_setText(re, text1, -1, &status); 5110 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); 5111 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5112 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5113 REGEX_CHECK_STATUS; 5114 REGEX_ASSERT(result == &bufferText); 5115 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5116 5117 uregex_close(re); 5118 utext_close(&replText); 5119 } 5120 5121 5122 /* 5123 * replaceAll() 5124 */ 5125 { 5126 UChar text1[80]; 5127 UChar text2[80]; 5128 UText replText = UTEXT_INITIALIZER; 5129 UText *result; 5130 5131 status = U_ZERO_ERROR; 5132 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5133 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5134 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5135 5136 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5137 REGEX_CHECK_STATUS; 5138 5139 /* Normal case, with match */ 5140 uregex_setText(re, text1, -1, &status); 5141 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5142 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5143 REGEX_CHECK_STATUS; 5144 REGEX_ASSERT(result == &bufferText); 5145 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5146 5147 /* No match. Text should copy to output with no changes. 
*/ 5148 uregex_setText(re, text2, -1, &status); 5149 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5150 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5151 REGEX_CHECK_STATUS; 5152 REGEX_ASSERT(result == &bufferText); 5153 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5154 5155 uregex_close(re); 5156 utext_close(&replText); 5157 } 5158 5159 5160 /* 5161 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5162 * so we don't need to test it here. 5163 */ 5164 5165 utext_close(&bufferText); 5166 utext_close(&patternText); 5167 } 5168 5169 5170 //-------------------------------------------------------------- 5171 // 5172 // NamedCapture Check basic named capture group functionality 5173 // 5174 //-------------------------------------------------------------- 5175 void RegexTest::NamedCapture() { 5176 UErrorCode status = U_ZERO_ERROR; 5177 RegexPattern *pat = RegexPattern::compile(UnicodeString( 5178 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status); 5179 REGEX_CHECK_STATUS; 5180 int32_t group = pat->groupNumberFromName("five", -1, status); 5181 REGEX_CHECK_STATUS; 5182 REGEX_ASSERT(5 == group); 5183 group = pat->groupNumberFromName("three", -1, status); 5184 REGEX_CHECK_STATUS; 5185 REGEX_ASSERT(3 == group); 5186 5187 status = U_ZERO_ERROR; 5188 group = pat->groupNumberFromName(UnicodeString("six"), status); 5189 REGEX_CHECK_STATUS; 5190 REGEX_ASSERT(6 == group); 5191 5192 status = U_ZERO_ERROR; 5193 group = pat->groupNumberFromName(UnicodeString("nosuch"), status); 5194 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5195 5196 status = U_ZERO_ERROR; 5197 5198 // After copying a pattern, named capture should still work in the copy. 5199 RegexPattern *copiedPat = new RegexPattern(*pat); 5200 REGEX_ASSERT(*copiedPat == *pat); 5201 delete pat; pat = NULL; // Delete original, copy should have no references back to it. 
5202 5203 group = copiedPat->groupNumberFromName("five", -1, status); 5204 REGEX_CHECK_STATUS; 5205 REGEX_ASSERT(5 == group); 5206 group = copiedPat->groupNumberFromName("three", -1, status); 5207 REGEX_CHECK_STATUS; 5208 REGEX_ASSERT(3 == group); 5209 delete copiedPat; 5210 5211 // ReplaceAll with named capture group. 5212 status = U_ZERO_ERROR; 5213 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>"); 5214 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status); 5215 REGEX_CHECK_STATUS; 5216 // m.pattern().dumpPattern(); 5217 UnicodeString replacedText = m->replaceAll("'${mid}'", status); 5218 REGEX_CHECK_STATUS; 5219 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); 5220 delete m; 5221 5222 // ReplaceAll, allowed capture group numbers. 5223 text = UnicodeString("abcmxyz"); 5224 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status); 5225 REGEX_CHECK_STATUS; 5226 5227 status = U_ZERO_ERROR; 5228 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. 5229 REGEX_CHECK_STATUS; 5230 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText); 5231 5232 status = U_ZERO_ERROR; 5233 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. 5234 REGEX_CHECK_STATUS; 5235 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5236 5237 status = U_ZERO_ERROR; 5238 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. 5239 REGEX_CHECK_STATUS; 5240 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5241 5242 status = U_ZERO_ERROR; 5243 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. 
5244 REGEX_CHECK_STATUS; 5245 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText); 5246 5247 status = U_ZERO_ERROR; 5248 replacedText = m->replaceAll(UnicodeString("<$3>"), status); 5249 REGEX_CHECK_STATUS; 5250 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText); 5251 5252 status = U_ZERO_ERROR; 5253 replacedText = m->replaceAll(UnicodeString("<$4>"), status); 5254 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5255 5256 status = U_ZERO_ERROR; 5257 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, 5258 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. 5259 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText); 5260 5261 status = U_ZERO_ERROR; 5262 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits 5263 REGEX_CHECK_STATUS; // that push group num out of range. 5264 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1. 5265 5266 status = U_ZERO_ERROR; 5267 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); 5268 REGEX_CHECK_STATUS; 5269 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText); 5270 5271 status = U_ZERO_ERROR; 5272 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); 5273 REGEX_CHECK_STATUS; 5274 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); 5275 5276 status = U_ZERO_ERROR; 5277 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); 5278 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5279 5280 status = U_ZERO_ERROR; 5281 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); 5282 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5283 5284 status = U_ZERO_ERROR; 5285 replacedText = m->replaceAll(UnicodeString("<${one"), status); 5286 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5287 5288 status = U_ZERO_ERROR; 5289 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); 5290 
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5291 5292 delete m; 5293 5294 // Repeat the above replaceAll() tests using the plain C API, which 5295 // has a separate implementation internally. 5296 // TODO: factor out the test data. 5297 5298 status = U_ZERO_ERROR; 5299 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status); 5300 REGEX_CHECK_STATUS; 5301 text = UnicodeString("abcmxyz"); 5302 uregex_setText(re, text.getBuffer(), text.length(), &status); 5303 REGEX_CHECK_STATUS; 5304 5305 UChar resultBuf[100]; 5306 int32_t resultLength; 5307 UnicodeString repl; 5308 5309 status = U_ZERO_ERROR; 5310 repl = UnicodeString("<$0>"); 5311 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5312 REGEX_CHECK_STATUS; 5313 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength)); 5314 5315 status = U_ZERO_ERROR; 5316 repl = UnicodeString("<$1>"); 5317 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5318 REGEX_CHECK_STATUS; 5319 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5320 5321 status = U_ZERO_ERROR; 5322 repl = UnicodeString("<${one}>"); 5323 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5324 REGEX_CHECK_STATUS; 5325 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5326 5327 status = U_ZERO_ERROR; 5328 repl = UnicodeString("<$2>"); 5329 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5330 REGEX_CHECK_STATUS; 5331 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength)); 5332 5333 status = U_ZERO_ERROR; 5334 repl = UnicodeString("<$3>"); 5335 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), 
&status); 5336 REGEX_CHECK_STATUS; 5337 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength)); 5338 5339 status = U_ZERO_ERROR; 5340 repl = UnicodeString("<$4>"); 5341 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5342 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5343 5344 status = U_ZERO_ERROR; 5345 repl = UnicodeString("<$04>"); 5346 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5347 REGEX_CHECK_STATUS; 5348 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength)); 5349 5350 status = U_ZERO_ERROR; 5351 repl = UnicodeString("<$000016>"); 5352 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5353 REGEX_CHECK_STATUS; 5354 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength)); 5355 5356 status = U_ZERO_ERROR; 5357 repl = UnicodeString("<$3$2$1${one}>"); 5358 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5359 REGEX_CHECK_STATUS; 5360 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength)); 5361 5362 status = U_ZERO_ERROR; 5363 repl = UnicodeString("$3$2$1${one}"); 5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5365 REGEX_CHECK_STATUS; 5366 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); 5367 5368 status = U_ZERO_ERROR; 5369 repl = UnicodeString("<${noSuchName}>"); 5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5371 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5372 5373 status = U_ZERO_ERROR; 5374 repl = UnicodeString("<${invalid-name}>"); 5375 resultLength = uregex_replaceAll(re, 
repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5376 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5377 5378 status = U_ZERO_ERROR; 5379 repl = UnicodeString("<${one"); 5380 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5381 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5382 5383 status = U_ZERO_ERROR; 5384 repl = UnicodeString("$not a capture group"); 5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5386 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5387 5388 uregex_close(re); 5389 } 5390 5391 //-------------------------------------------------------------- 5392 // 5393 // NamedCaptureLimits Patterns with huge numbers of named capture groups. 5394 // The point is not so much what the exact limit is, 5395 // but that a largish number doesn't hit bad non-linear performance, 5396 // and that exceeding the limit fails cleanly. 5397 // 5398 //-------------------------------------------------------------- 5399 void RegexTest::NamedCaptureLimits() { 5400 if (quick) { 5401 logln("Skipping test. Runs in exhuastive mode only."); 5402 return; 5403 } 5404 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. 5405 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. 
5406 char nnbuf[100]; 5407 UnicodeString pattern; 5408 int32_t nn; 5409 5410 for (nn=1; nn<goodLimit; nn++) { 5411 sprintf(nnbuf, "(?<nn%d>)", nn); 5412 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5413 } 5414 UErrorCode status = U_ZERO_ERROR; 5415 RegexPattern *pat = RegexPattern::compile(pattern, 0, status); 5416 REGEX_CHECK_STATUS; 5417 for (nn=1; nn<goodLimit; nn++) { 5418 sprintf(nnbuf, "nn%d", nn); 5419 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status); 5420 REGEX_ASSERT(nn == groupNum); 5421 if (nn != groupNum) { 5422 break; 5423 } 5424 } 5425 delete pat; 5426 5427 pattern.remove(); 5428 for (nn=1; nn<failLimit; nn++) { 5429 sprintf(nnbuf, "(?<nn%d>)", nn); 5430 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5431 } 5432 status = U_ZERO_ERROR; 5433 pat = RegexPattern::compile(pattern, 0, status); 5434 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); 5435 delete pat; 5436 } 5437 5438 5439 //-------------------------------------------------------------- 5440 // 5441 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5442 // 5443 //--------------------------------------------------------------- 5444 void RegexTest::Bug7651() { 5445 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5446 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5447 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 
5448 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5449 UnicodeString s("#ff @abcd This is test"); 5450 RegexPattern *REPattern = NULL; 5451 RegexMatcher *REMatcher = NULL; 5452 UErrorCode status = U_ZERO_ERROR; 5453 UParseError pe; 5454 5455 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5456 REGEX_CHECK_STATUS; 5457 REMatcher = REPattern->matcher(s, status); 5458 REGEX_CHECK_STATUS; 5459 REGEX_ASSERT(REMatcher->find()); 5460 REGEX_ASSERT(REMatcher->start(status) == 0); 5461 delete REPattern; 5462 delete REMatcher; 5463 status = U_ZERO_ERROR; 5464 5465 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5466 REGEX_CHECK_STATUS; 5467 REMatcher = REPattern->matcher(s, status); 5468 REGEX_CHECK_STATUS; 5469 REGEX_ASSERT(REMatcher->find()); 5470 REGEX_ASSERT(REMatcher->start(status) == 0); 5471 delete REPattern; 5472 delete REMatcher; 5473 status = U_ZERO_ERROR; 5474 } 5475 5476 void RegexTest::Bug7740() { 5477 UErrorCode status = U_ZERO_ERROR; 5478 UnicodeString pattern = "(a)"; 5479 UnicodeString text = "abcdef"; 5480 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5481 REGEX_CHECK_STATUS; 5482 REGEX_ASSERT(m->lookingAt(status)); 5483 REGEX_CHECK_STATUS; 5484 status = U_ILLEGAL_ARGUMENT_ERROR; 5485 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5486 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5487 REGEX_ASSERT(s == ""); 5488 delete m; 5489 } 5490 5491 // Bug 8479: was crashing whith a Bogus UnicodeString as input. 
5492 5493 void RegexTest::Bug8479() { 5494 UErrorCode status = U_ZERO_ERROR; 5495 5496 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5497 REGEX_CHECK_STATUS; 5498 if (U_SUCCESS(status)) 5499 { 5500 UnicodeString str; 5501 str.setToBogus(); 5502 pMatcher->reset(str); 5503 status = U_ZERO_ERROR; 5504 pMatcher->matches(status); 5505 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5506 delete pMatcher; 5507 } 5508 } 5509 5510 5511 // Bug 7029 5512 void RegexTest::Bug7029() { 5513 UErrorCode status = U_ZERO_ERROR; 5514 5515 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5516 UnicodeString text = "abc.def"; 5517 UnicodeString splits[10]; 5518 REGEX_CHECK_STATUS; 5519 int32_t numFields = pMatcher->split(text, splits, 10, status); 5520 REGEX_CHECK_STATUS; 5521 REGEX_ASSERT(numFields == 8); 5522 delete pMatcher; 5523 } 5524 5525 // Bug 9283 5526 // This test is checking for the existance of any supplemental characters that case-fold 5527 // to a bmp character. 5528 // 5529 // At the time of this writing there are none. If any should appear in a subsequent release 5530 // of Unicode, the code in regular expressions compilation that determines the longest 5531 // posssible match for a literal string will need to be enhanced. 5532 // 5533 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() 5534 // for details on what to do in case of a failure of this test. 
5535 // 5536 void RegexTest::Bug9283() { 5537 #if !UCONFIG_NO_NORMALIZATION 5538 UErrorCode status = U_ZERO_ERROR; 5539 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); 5540 REGEX_CHECK_STATUS; 5541 int32_t index; 5542 UChar32 c; 5543 for (index=0; ; index++) { 5544 c = supplementalsWithCaseFolding.charAt(index); 5545 if (c == -1) { 5546 break; 5547 } 5548 UnicodeString cf = UnicodeString(c).foldCase(); 5549 REGEX_ASSERT(cf.length() >= 2); 5550 } 5551 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 5552 } 5553 5554 5555 void RegexTest::CheckInvBufSize() { 5556 if(inv_next>=INV_BUFSIZ) { 5557 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5558 __FILE__, INV_BUFSIZ, inv_next); 5559 } else { 5560 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5561 } 5562 } 5563 5564 5565 void RegexTest::Bug10459() { 5566 UErrorCode status = U_ZERO_ERROR; 5567 UnicodeString patternString("(txt)"); 5568 UnicodeString txtString("txt"); 5569 5570 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); 5571 REGEX_CHECK_STATUS; 5572 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); 5573 REGEX_CHECK_STATUS; 5574 5575 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); 5576 REGEX_CHECK_STATUS; 5577 5578 uregex_setUText(icu_re, utext_txt, &status); 5579 REGEX_CHECK_STATUS; 5580 5581 // The bug was that calling uregex_group() before doing a matching operation 5582 // was causing a segfault. Only for Regular Expressions created from UText. 5583 // It should set an U_REGEX_INVALID_STATE. 
5584 5585 UChar buf[100]; 5586 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); 5587 REGEX_ASSERT(status == U_REGEX_INVALID_STATE); 5588 REGEX_ASSERT(len == 0); 5589 5590 uregex_close(icu_re); 5591 utext_close(utext_pat); 5592 utext_close(utext_txt); 5593 } 5594 5595 void RegexTest::TestCaseInsensitiveStarters() { 5596 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't 5597 // become stale because of new Unicode characters. 5598 // If it is stale, rerun the generation tool 5599 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 5600 // and replace the embedded data in i18n/regexcmp.cpp 5601 5602 for (UChar32 cp=0; cp<=0x10ffff; cp++) { 5603 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { 5604 continue; 5605 } 5606 UnicodeSet s(cp, cp); 5607 s.closeOver(USET_CASE_INSENSITIVE); 5608 UnicodeSetIterator setIter(s); 5609 while (setIter.next()) { 5610 if (!setIter.isString()) { 5611 continue; 5612 } 5613 const UnicodeString &str = setIter.getString(); 5614 UChar32 firstChar = str.char32At(0); 5615 UnicodeSet starters; 5616 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); 5617 if (!starters.contains(cp)) { 5618 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); 5619 return; 5620 } 5621 } 5622 } 5623 } 5624 5625 5626 void RegexTest::TestBug11049() { 5627 // Original bug report: pattern with match start consisting of one of several individual characters, 5628 // and the text being matched ending with a supplementary character. find() would read past the 5629 // end of the input text when searching for potential match starting points. 5630 5631 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will 5632 // detect the bad read. 
5633 5634 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); 5635 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); 5636 5637 // Test again with a pattern starting with a single character, 5638 // which takes a different code path than starting with an OR expression, 5639 // but with similar logic. 5640 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); 5641 TestCase11049("C", "string matches at end C", TRUE, __LINE__); 5642 } 5643 5644 // Run a single test case from TestBug11049(). Internal function. 5645 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { 5646 UErrorCode status = U_ZERO_ERROR; 5647 UnicodeString patternString = UnicodeString(pattern).unescape(); 5648 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5649 5650 UnicodeString dataString = UnicodeString(data).unescape(); 5651 UChar *exactBuffer = new UChar[dataString.length()]; 5652 dataString.extract(exactBuffer, dataString.length(), status); 5653 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); 5654 5655 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status)); 5656 REGEX_CHECK_STATUS; 5657 matcher->reset(ut); 5658 UBool result = matcher->find(); 5659 if (result != expectMatch) { 5660 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5661 __FILE__, lineNumber, expectMatch, result, pattern, data); 5662 } 5663 5664 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see 5665 // off-by-one on find() with match at the last code point. 5666 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 5667 // because string.unescape() will only shrink it. 
5668 char * utf8Buffer = new char[uprv_strlen(data)+1]; 5669 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); 5670 REGEX_CHECK_STATUS; 5671 ut = utext_openUTF8(ut, utf8Buffer, -1, &status); 5672 REGEX_CHECK_STATUS; 5673 matcher->reset(ut); 5674 result = matcher->find(); 5675 if (result != expectMatch) { 5676 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5677 __FILE__, lineNumber, expectMatch, result, pattern, data); 5678 } 5679 delete [] utf8Buffer; 5680 5681 utext_close(ut); 5682 delete [] exactBuffer; 5683 } 5684 5685 5686 void RegexTest::TestBug11371() { 5687 if (quick) { 5688 logln("Skipping test. Runs in exhuastive mode only."); 5689 return; 5690 } 5691 UErrorCode status = U_ZERO_ERROR; 5692 UnicodeString patternString; 5693 5694 for (int i=0; i<8000000; i++) { 5695 patternString.append(UnicodeString("()")); 5696 } 5697 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5698 if (status != U_REGEX_PATTERN_TOO_BIG) { 5699 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5700 __FILE__, __LINE__, u_errorName(status)); 5701 } 5702 5703 status = U_ZERO_ERROR; 5704 patternString = "("; 5705 for (int i=0; i<20000000; i++) { 5706 patternString.append(UnicodeString("A++")); 5707 } 5708 patternString.append(UnicodeString("){0}B++")); 5709 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status)); 5710 if (status != U_REGEX_PATTERN_TOO_BIG) { 5711 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5712 __FILE__, __LINE__, u_errorName(status)); 5713 } 5714 5715 // Pattern with too much string data, such that string indexes overflow operand data field size 5716 // in compiled instruction. 
5717 status = U_ZERO_ERROR; 5718 patternString = ""; 5719 while (patternString.length() < 0x00ffffff) { 5720 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); 5721 } 5722 patternString.append(UnicodeString("X? trailing string")); 5723 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status)); 5724 if (status != U_REGEX_PATTERN_TOO_BIG) { 5725 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5726 __FILE__, __LINE__, u_errorName(status)); 5727 } 5728 } 5729 5730 void RegexTest::TestBug11480() { 5731 // C API, get capture group of a group that does not participate in the match. 5732 // (Returns a zero length string, with nul termination, 5733 // indistinguishable from a group with a zero length match.) 5734 5735 UErrorCode status = U_ZERO_ERROR; 5736 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); 5737 REGEX_CHECK_STATUS; 5738 UnicodeString text = UNICODE_STRING_SIMPLE("A"); 5739 uregex_setText(re, text.getBuffer(), text.length(), &status); 5740 REGEX_CHECK_STATUS; 5741 REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); 5742 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; 5743 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); 5744 REGEX_ASSERT(length == 0); 5745 REGEX_ASSERT(buf[0] == 13); 5746 REGEX_ASSERT(buf[1] == 0); 5747 REGEX_ASSERT(buf[2] == 13); 5748 uregex_close(re); 5749 5750 // UText C++ API, length of match is 0 for non-participating matches. 5751 UText ut = UTEXT_INITIALIZER; 5752 utext_openUnicodeString(&ut, &text, &status); 5753 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); 5754 REGEX_CHECK_STATUS; 5755 matcher.reset(&ut); 5756 REGEX_ASSERT(matcher.lookingAt(0, status)); 5757 5758 // UText C++ API, Capture group 1 matches "A", position 0, length 1. 
5759 int64_t groupLen = -666; 5760 UText group = UTEXT_INITIALIZER; 5761 matcher.group(1, &group, groupLen, status); 5762 REGEX_CHECK_STATUS; 5763 REGEX_ASSERT(groupLen == 1); 5764 REGEX_ASSERT(utext_getNativeIndex(&group) == 0); 5765 5766 // Capture group 2, the (B), does not participate in the match. 5767 matcher.group(2, &group, groupLen, status); 5768 REGEX_CHECK_STATUS; 5769 REGEX_ASSERT(groupLen == 0); 5770 REGEX_ASSERT(matcher.start(2, status) == -1); 5771 REGEX_CHECK_STATUS; 5772 } 5773 5774 void RegexTest::TestBug12884() { 5775 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts} 5776 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}"); 5777 UnicodeString text(u"hello"); 5778 UErrorCode status = U_ZERO_ERROR; 5779 RegexMatcher m(pattern, text, 0, status); 5780 REGEX_CHECK_STATUS; 5781 m.setTimeLimit(5, status); 5782 m.find(status); 5783 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 5784 5785 // Non-greedy loops. They take a different code path during matching. 5786 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?"); 5787 status = U_ZERO_ERROR; 5788 RegexMatcher ngM(ngPattern, text, 0, status); 5789 REGEX_CHECK_STATUS; 5790 ngM.setTimeLimit(5, status); 5791 ngM.find(status); 5792 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 5793 5794 // UText, wrapping non-UTF-16 text, also takes a different execution path. 5795 const char *text8 = u8"Qu es Unicode? Unicode proporciona un nmero nico para cada" 5796 "carcter, sin importar la plataforma, sin importar el programa," 5797 "sin importar el idioma."; 5798 status = U_ZERO_ERROR; 5799 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status)); 5800 REGEX_CHECK_STATUS; 5801 m.reset(ut.getAlias()); 5802 m.find(status); 5803 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 5804 5805 status = U_ZERO_ERROR; 5806 ngM.reset(ut.getAlias()); 5807 ngM.find(status); 5808 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 5809 } 5810 5811 // Bug 13631. 
A find() of a pattern with a zero length look-behind assertions 5812 // can cause a read past the end of the input text. 5813 // The failure is seen when running this test with Clang's Addresss Sanitizer. 5814 5815 void RegexTest::TestBug13631() { 5816 const UChar *pats[] = { u"(?<!^)", 5817 u"(?<=^)", 5818 nullptr 5819 }; 5820 for (const UChar **pat=pats; *pat; ++pat) { 5821 UErrorCode status = U_ZERO_ERROR; 5822 UnicodeString upat(*pat); 5823 RegexMatcher matcher(upat, 0, status); 5824 const UChar s =u'a'; 5825 UText *ut = utext_openUChars(nullptr, &s, 1, &status); 5826 REGEX_CHECK_STATUS; 5827 matcher.reset(ut); 5828 while (matcher.find()) { 5829 } 5830 utext_close(ut); 5831 } 5832 } 5833 5834 5835 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5836