1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 // 8 // regextst.cpp 9 // 10 // ICU Regular Expressions test, part of intltest. 11 // 12 13 /* 14 NOTE!! 15 16 PLEASE be careful about ASCII assumptions in this test. 17 This test is one of the worst repeat offenders. 18 If you have questions, contact someone on the ICU PMC 19 who has access to an EBCDIC system. 20 21 */ 22 23 #include "intltest.h" 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 25 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 30 #include "unicode/localpointer.h" 31 #include "unicode/regex.h" 32 #include "unicode/uchar.h" 33 #include "unicode/ucnv.h" 34 #include "unicode/uniset.h" 35 #include "unicode/uregex.h" 36 #include "unicode/usetiter.h" 37 #include "unicode/ustring.h" 38 #include "unicode/utext.h" 39 40 #include "regextst.h" 41 #include "regexcmp.h" 42 #include "uvector.h" 43 #include "util.h" 44 #include "cmemory.h" 45 #include "cstring.h" 46 #include "uinvchar.h" 47 48 #define SUPPORT_MUTATING_INPUT_STRING 0 49 50 //--------------------------------------------------------------------------- 51 // 52 // Test class boilerplate 53 // 54 //--------------------------------------------------------------------------- 55 RegexTest::RegexTest() 56 { 57 } 58 59 60 RegexTest::~RegexTest() 61 { 62 } 63 64 65 66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 67 { 68 if (exec) logln("TestSuite RegexTest: "); 69 switch (index) { 70 71 case 0: name = "Basic"; 72 if (exec) Basic(); 73 break; 74 case 1: name = "API_Match"; 75 if (exec) API_Match(); 76 break; 77 case 2: name = "API_Replace"; 78 if (exec) API_Replace(); 79 break; 80 case 3: name = "API_Pattern"; 81 if (exec) API_Pattern(); 82 break; 83 case 4: 84 #if !UCONFIG_NO_FILE_IO 85 name = "Extended"; 86 if (exec) Extended(); 87 #else 88 name = "skip"; 89 #endif 90 break; 91 case 5: name = "Errors"; 92 if (exec) Errors(); 93 break; 94 case 6: name = "PerlTests"; 95 if (exec) PerlTests(); 96 break; 97 case 7: name = "Callbacks"; 98 if (exec) Callbacks(); 99 break; 100 case 8: name = "FindProgressCallbacks"; 101 if (exec) FindProgressCallbacks(); 102 break; 103 case 9: name = "Bug 6149"; 104 if (exec) Bug6149(); 105 break; 106 case 10: name = "UTextBasic"; 107 if (exec) UTextBasic(); 108 break; 109 case 11: name = "API_Match_UTF8"; 110 if (exec) API_Match_UTF8(); 111 break; 112 case 12: name = "API_Replace_UTF8"; 113 if (exec) API_Replace_UTF8(); 114 break; 115 case 13: name = "API_Pattern_UTF8"; 116 if (exec) API_Pattern_UTF8(); 117 break; 118 case 14: name = "PerlTestsUTF8"; 119 if (exec) PerlTestsUTF8(); 120 break; 121 case 15: name = "PreAllocatedUTextCAPI"; 122 if (exec) PreAllocatedUTextCAPI(); 123 break; 124 case 16: name = "Bug 7651"; 125 if (exec) Bug7651(); 126 break; 127 case 17: name = "Bug 7740"; 128 if (exec) Bug7740(); 129 break; 130 case 18: name = "Bug 8479"; 131 if (exec) Bug8479(); 132 break; 133 case 19: name = "Bug 7029"; 134 if (exec) Bug7029(); 135 break; 136 case 20: name = "CheckInvBufSize"; 137 if (exec) CheckInvBufSize(); 138 break; 139 case 21: name = "Bug 9283"; 140 if (exec) Bug9283(); 141 break; 142 case 22: name = "Bug10459"; 143 if (exec) Bug10459(); 144 break; 145 case 23: name = "TestCaseInsensitiveStarters"; 146 if (exec) TestCaseInsensitiveStarters(); 147 break; 148 case 24: name = "TestBug11049"; 149 if (exec) TestBug11049(); 150 break; 151 case 25: name = "TestBug11371"; 152 if (exec) TestBug11371(); 153 break; 154 case 26: name = "TestBug11480"; 155 if (exec) TestBug11480(); 156 break; 157 case 27: name = "NamedCapture"; 158 if (exec) NamedCapture(); 159 break; 160 case 28: name = "NamedCaptureLimits"; 161 if (exec) NamedCaptureLimits(); 162 break; 163 default: name = ""; 164 break; //needed to end loop 165 } 166 } 167 168 169 170 /** 171 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 172 * into ASCII. 173 * @see utext_openUTF8 174 */ 175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 176 177 //--------------------------------------------------------------------------- 178 // 179 // Error Checking / Reporting macros used in all of the tests. 180 // 181 //--------------------------------------------------------------------------- 182 183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 184 int64_t oldIndex = utext_getNativeIndex(text); 185 utext_setNativeIndex(text, 0); 186 char *bufPtr = buf; 187 UChar32 c = utext_next32From(text, 0); 188 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 189 if (0x000020<=c && c<0x00007e) { 190 *bufPtr = c; 191 } else { 192 #if 0 193 sprintf(bufPtr,"U+%04X", c); 194 bufPtr+= strlen(bufPtr)-1; 195 #else 196 *bufPtr = '%'; 197 #endif 198 } 199 bufPtr++; 200 c = UTEXT_NEXT32(text); 201 } 202 *bufPtr = 0; 203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 204 char *ebuf = (char*)malloc(bufLen); 205 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 206 uprv_strncpy(buf, ebuf, bufLen); 207 free((void*)ebuf); 208 #endif 209 utext_setNativeIndex(text, oldIndex); 210 } 211 212 213 static char ASSERT_BUF[1024]; 214 215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { 216 if(message.length()==0) { 217 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 218 } else { 219 UnicodeString buf; 220 IntlTest::prettify(message,buf); 221 if(buf.length()==0) { 222 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 223 } else { 224 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 225 if(ASSERT_BUF[0]==0) { 226 ASSERT_BUF[0]=0; 227 for(int32_t i=0;i<buf.length();i++) { 228 UChar ch = buf[i]; 229 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 230 } 231 } 232 } 233 } 234 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 235 return ASSERT_BUF; 236 } 237 238 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 239 240 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ 241 __FILE__, __LINE__, u_errorName(status)); return;}} 242 243 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 244 245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 246 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 247 __LINE__, u_errorName(errcode), u_errorName(status));};} 248 249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 250 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} 251 252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 253 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 254 255 // expected: const char * , restricted to invariant characters. 256 // actual: const UnicodeString & 257 #define REGEX_ASSERT_UNISTR(expected, actual) { \ 258 if (UnicodeString(expected, -1, US_INV) != (actual)) { \ 259 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ 260 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} 261 262 263 static UBool testUTextEqual(UText *uta, UText *utb) { 264 UChar32 ca = 0; 265 UChar32 cb = 0; 266 utext_setNativeIndex(uta, 0); 267 utext_setNativeIndex(utb, 0); 268 do { 269 ca = utext_next32(uta); 270 cb = utext_next32(utb); 271 if (ca != cb) { 272 break; 273 } 274 } while (ca != U_SENTINEL); 275 return ca == cb; 276 } 277 278 279 /** 280 * @param expected expected text in UTF-8 (not platform) codepage 281 */ 282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 283 UErrorCode status = U_ZERO_ERROR; 284 UText expectedText = UTEXT_INITIALIZER; 285 utext_openUTF8(&expectedText, expected, -1, &status); 286 if(U_FAILURE(status)) { 287 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 288 return; 289 } 290 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 291 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 292 return; 293 } 294 utext_setNativeIndex(actual, 0); 295 if (!testUTextEqual(&expectedText, actual)) { 296 char buf[201 /*21*/]; 297 char expectedBuf[201]; 298 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 299 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 300 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 301 } 302 utext_close(&expectedText); 303 } 304 /** 305 * @param expected invariant (platform local text) input 306 */ 307 308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 309 UErrorCode status = U_ZERO_ERROR; 310 UText expectedText = UTEXT_INITIALIZER; 311 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 312 if(U_FAILURE(status)) { 313 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 314 return; 315 } 316 utext_setNativeIndex(actual, 0); 317 if (!testUTextEqual(&expectedText, actual)) { 318 char buf[201 /*21*/]; 319 char expectedBuf[201]; 320 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 321 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 322 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 323 } 324 utext_close(&expectedText); 325 } 326 327 /** 328 * Assumes utf-8 input 329 */ 330 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 331 /** 332 * Assumes Invariant input 333 */ 334 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 335 336 /** 337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 338 * passed into utext_openUTF8. An error will be given if 339 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 340 */ 341 342 #define INV_BUFSIZ 2048 /* increase this if too small */ 343 344 static int64_t inv_next=0; 345 346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY 347 static char inv_buf[INV_BUFSIZ]; 348 #endif 349 350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 351 if(length==-1) length=strlen(inv); 352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 353 inv_next+=length; 354 return utext_openUTF8(ut, inv, length, status); 355 #else 356 if(inv_next+length+1>INV_BUFSIZ) { 357 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 358 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 359 *status = U_MEMORY_ALLOCATION_ERROR; 360 return NULL; 361 } 362 363 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 364 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 365 inv_next+=length; 366 367 #if 0 368 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 369 #endif 370 371 return utext_openUTF8(ut, (const char*)buf, length, status); 372 #endif 373 } 374 375 376 //--------------------------------------------------------------------------- 377 // 378 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 379 // for the LookingAt() and Match() functions. 380 // 381 // usage: 382 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 383 // 384 // The expected results are UBool - TRUE or FALSE. 385 // The input text is unescaped. The pattern is not. 386 // 387 // 388 //--------------------------------------------------------------------------- 389 390 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 391 392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 393 const UnicodeString pattern(pat, -1, US_INV); 394 const UnicodeString inputText(text, -1, US_INV); 395 UErrorCode status = U_ZERO_ERROR; 396 UParseError pe; 397 RegexPattern *REPattern = NULL; 398 RegexMatcher *REMatcher = NULL; 399 UBool retVal = TRUE; 400 401 UnicodeString patString(pat, -1, US_INV); 402 REPattern = RegexPattern::compile(patString, 0, pe, status); 403 if (U_FAILURE(status)) { 404 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 405 line, u_errorName(status)); 406 return FALSE; 407 } 408 if (line==376) { REPattern->dumpPattern();} 409 410 UnicodeString inputString(inputText); 411 UnicodeString unEscapedInput = inputString.unescape(); 412 REMatcher = REPattern->matcher(unEscapedInput, status); 413 if (U_FAILURE(status)) { 414 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 415 line, u_errorName(status)); 416 return FALSE; 417 } 418 419 UBool actualmatch; 420 actualmatch = REMatcher->lookingAt(status); 421 if (U_FAILURE(status)) { 422 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 423 line, u_errorName(status)); 424 retVal = FALSE; 425 } 426 if (actualmatch != looking) { 427 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 428 retVal = FALSE; 429 } 430 431 status = U_ZERO_ERROR; 432 actualmatch = REMatcher->matches(status); 433 if (U_FAILURE(status)) { 434 errln("RegexTest failure in matches() at line %d. Status = %s\n", 435 line, u_errorName(status)); 436 retVal = FALSE; 437 } 438 if (actualmatch != match) { 439 errln("RegexTest: wrong return from matches() at line %d.\n", line); 440 retVal = FALSE; 441 } 442 443 if (retVal == FALSE) { 444 REPattern->dumpPattern(); 445 } 446 447 delete REPattern; 448 delete REMatcher; 449 return retVal; 450 } 451 452 453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 454 UText pattern = UTEXT_INITIALIZER; 455 int32_t inputUTF8Length; 456 char *textChars = NULL; 457 UText inputText = UTEXT_INITIALIZER; 458 UErrorCode status = U_ZERO_ERROR; 459 UParseError pe; 460 RegexPattern *REPattern = NULL; 461 RegexMatcher *REMatcher = NULL; 462 UBool retVal = TRUE; 463 464 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 465 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 466 if (U_FAILURE(status)) { 467 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 468 line, u_errorName(status)); 469 return FALSE; 470 } 471 472 UnicodeString inputString(text, -1, US_INV); 473 UnicodeString unEscapedInput = inputString.unescape(); 474 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 475 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 476 477 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 478 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 479 // UTF-8 does not allow unpaired surrogates, so this could actually happen 480 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status)); 481 return TRUE; // not a failure of the Regex engine 482 } 483 status = U_ZERO_ERROR; // buffer overflow 484 textChars = new char[inputUTF8Length+1]; 485 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 486 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 487 488 REMatcher = &REPattern->matcher(status)->reset(&inputText); 489 if (U_FAILURE(status)) { 490 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 491 line, u_errorName(status)); 492 return FALSE; 493 } 494 495 UBool actualmatch; 496 actualmatch = REMatcher->lookingAt(status); 497 if (U_FAILURE(status)) { 498 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 499 line, u_errorName(status)); 500 retVal = FALSE; 501 } 502 if (actualmatch != looking) { 503 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 504 retVal = FALSE; 505 } 506 507 status = U_ZERO_ERROR; 508 actualmatch = REMatcher->matches(status); 509 if (U_FAILURE(status)) { 510 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n", 511 line, u_errorName(status)); 512 retVal = FALSE; 513 } 514 if (actualmatch != match) { 515 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 516 retVal = FALSE; 517 } 518 519 if (retVal == FALSE) { 520 REPattern->dumpPattern(); 521 } 522 523 delete REPattern; 524 delete REMatcher; 525 utext_close(&inputText); 526 utext_close(&pattern); 527 delete[] textChars; 528 return retVal; 529 } 530 531 532 533 //--------------------------------------------------------------------------- 534 // 535 // REGEX_ERR Macro + invocation function to simplify writing tests 536 // regex tests for incorrect patterns 537 // 538 // usage: 539 // REGEX_ERR("pattern", expected error line, column, expected status); 540 // 541 //--------------------------------------------------------------------------- 542 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 543 544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 545 UErrorCode expectedStatus, int32_t line) { 546 UnicodeString pattern(pat); 547 548 UErrorCode status = U_ZERO_ERROR; 549 UParseError pe; 550 RegexPattern *callerPattern = NULL; 551 552 // 553 // Compile the caller's pattern 554 // 555 UnicodeString patString(pat); 556 callerPattern = RegexPattern::compile(patString, 0, pe, status); 557 if (status != expectedStatus) { 558 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 559 } else { 560 if (status != U_ZERO_ERROR) { 561 if (pe.line != errLine || pe.offset != errCol) { 562 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 563 line, errLine, errCol, pe.line, pe.offset); 564 } 565 } 566 } 567 568 delete callerPattern; 569 570 // 571 // Compile again, using a UTF-8-based UText 572 // 573 UText patternText = UTEXT_INITIALIZER; 574 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 575 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 576 if (status != expectedStatus) { 577 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 578 } else { 579 if (status != U_ZERO_ERROR) { 580 if (pe.line != errLine || pe.offset != errCol) { 581 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 582 line, errLine, errCol, pe.line, pe.offset); 583 } 584 } 585 } 586 587 delete callerPattern; 588 utext_close(&patternText); 589 } 590 591 592 593 //--------------------------------------------------------------------------- 594 // 595 // Basic Check for basic functionality of regex pattern matching. 596 // Avoid the use of REGEX_FIND test macro, which has 597 // substantial dependencies on basic Regex functionality. 598 // 599 //--------------------------------------------------------------------------- 600 void RegexTest::Basic() { 601 602 603 // 604 // Debug - slide failing test cases early 605 // 606 #if 0 607 { 608 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE); 609 UParseError pe; 610 UErrorCode status = U_ZERO_ERROR; 611 RegexPattern *pattern; 612 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status); 613 pattern->dumpPattern(); 614 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status); 615 UBool result = m->find(); 616 printf("result = %d\n", result); 617 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd"); 618 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); 619 } 620 exit(1); 621 #endif 622 623 624 // 625 // Pattern with parentheses 626 // 627 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE); 628 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE); 629 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE); 630 631 // 632 // Patterns with * 633 // 634 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE); 635 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE); 636 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE); 637 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE); 638 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE); 639 640 REGEX_TESTLM("a*", "", TRUE, TRUE); 641 REGEX_TESTLM("a*", "b", TRUE, FALSE); 642 643 644 // 645 // Patterns with "." 646 // 647 REGEX_TESTLM(".", "abc", TRUE, FALSE); 648 REGEX_TESTLM("...", "abc", TRUE, TRUE); 649 REGEX_TESTLM("....", "abc", FALSE, FALSE); 650 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE); 651 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE); 652 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE); 653 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE); 654 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE); 655 656 // 657 // Patterns with * applied to chars at end of literal string 658 // 659 REGEX_TESTLM("abc*", "ab", TRUE, TRUE); 660 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE); 661 662 // 663 // Supplemental chars match as single chars, not a pair of surrogates. 664 // 665 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE); 666 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE); 667 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE); 668 669 670 // 671 // UnicodeSets in the pattern 672 // 673 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE); 674 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE); 675 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE); 676 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 677 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 678 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE); 679 680 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE); 681 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE); 682 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE); 683 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. 684 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); 685 686 // 687 // OR operator in patterns 688 // 689 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE); 690 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE); 691 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE); 692 REGEX_TESTLM("a|b", "b", TRUE, TRUE); 693 694 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE); 695 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE); 696 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE); 697 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE); 698 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE); 699 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE); 700 701 // 702 // + 703 // 704 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE); 705 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE); 706 REGEX_TESTLM("b+", "", FALSE, FALSE); 707 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE); 708 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE); 709 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE); 710 711 // 712 // ? 713 // 714 REGEX_TESTLM("ab?", "ab", TRUE, TRUE); 715 REGEX_TESTLM("ab?", "a", TRUE, TRUE); 716 REGEX_TESTLM("ab?", "ac", TRUE, FALSE); 717 REGEX_TESTLM("ab?", "abb", TRUE, FALSE); 718 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE); 719 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE); 720 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE); 721 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); 722 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); 723 724 // 725 // Escape sequences that become single literal chars, handled internally 726 // by ICU's Unescape. 727 // 728 729 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. 730 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL 731 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L 732 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape 733 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed 734 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line 735 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR 736 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab 737 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); 738 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); 739 740 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input 741 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input 742 743 // Escape of special chars in patterns 744 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); 745 } 746 747 748 //--------------------------------------------------------------------------- 749 // 750 // UTextBasic Check for quirks that are specific to the UText 751 // implementation. 752 // 753 //--------------------------------------------------------------------------- 754 void RegexTest::UTextBasic() { 755 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 756 UErrorCode status = U_ZERO_ERROR; 757 UText pattern = UTEXT_INITIALIZER; 758 utext_openUTF8(&pattern, str_abc, -1, &status); 759 RegexMatcher matcher(&pattern, 0, status); 760 REGEX_CHECK_STATUS; 761 762 UText input = UTEXT_INITIALIZER; 763 utext_openUTF8(&input, str_abc, -1, &status); 764 REGEX_CHECK_STATUS; 765 matcher.reset(&input); 766 REGEX_CHECK_STATUS; 767 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 768 769 matcher.reset(matcher.inputText()); 770 REGEX_CHECK_STATUS; 771 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 772 773 utext_close(&pattern); 774 utext_close(&input); 775 } 776 777 778 //--------------------------------------------------------------------------- 779 // 780 // API_Match Test that the API for class RegexMatcher 781 // is present and nominally working, but excluding functions 782 // implementing replace operations. 783 // 784 //--------------------------------------------------------------------------- 785 void RegexTest::API_Match() { 786 UParseError pe; 787 UErrorCode status=U_ZERO_ERROR; 788 int32_t flags = 0; 789 790 // 791 // Debug - slide failing test cases early 792 // 793 #if 0 794 { 795 } 796 return; 797 #endif 798 799 // 800 // Simple pattern compilation 801 // 802 { 803 UnicodeString re("abc"); 804 RegexPattern *pat2; 805 pat2 = RegexPattern::compile(re, flags, pe, status); 806 REGEX_CHECK_STATUS; 807 808 UnicodeString inStr1 = "abcdef this is a test"; 809 UnicodeString instr2 = "not abc"; 810 UnicodeString empty = ""; 811 812 813 // 814 // Matcher creation and reset. 815 // 816 RegexMatcher *m1 = pat2->matcher(inStr1, status); 817 REGEX_CHECK_STATUS; 818 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 819 REGEX_ASSERT(m1->input() == inStr1); 820 m1->reset(instr2); 821 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 822 REGEX_ASSERT(m1->input() == instr2); 823 m1->reset(inStr1); 824 REGEX_ASSERT(m1->input() == inStr1); 825 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 826 m1->reset(empty); 827 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 828 REGEX_ASSERT(m1->input() == empty); 829 REGEX_ASSERT(&m1->pattern() == pat2); 830 831 // 832 // reset(pos, status) 833 // 834 m1->reset(inStr1); 835 m1->reset(4, status); 836 REGEX_CHECK_STATUS; 837 REGEX_ASSERT(m1->input() == inStr1); 838 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 839 840 m1->reset(-1, status); 841 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 842 status = U_ZERO_ERROR; 843 844 m1->reset(0, status); 845 REGEX_CHECK_STATUS; 846 status = U_ZERO_ERROR; 847 848 int32_t len = m1->input().length(); 849 m1->reset(len-1, status); 850 REGEX_CHECK_STATUS; 851 status = U_ZERO_ERROR; 852 853 m1->reset(len, status); 854 REGEX_CHECK_STATUS; 855 status = U_ZERO_ERROR; 856 857 m1->reset(len+1, status); 858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 859 status = U_ZERO_ERROR; 860 861 // 862 // match(pos, status) 863 // 864 m1->reset(instr2); 865 REGEX_ASSERT(m1->matches(4, status) == TRUE); 866 m1->reset(); 867 REGEX_ASSERT(m1->matches(3, status) == FALSE); 868 m1->reset(); 869 REGEX_ASSERT(m1->matches(5, status) == FALSE); 870 REGEX_ASSERT(m1->matches(4, status) == TRUE); 871 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 873 874 // Match() at end of string should fail, but should not 875 // be an error. 876 status = U_ZERO_ERROR; 877 len = m1->input().length(); 878 REGEX_ASSERT(m1->matches(len, status) == FALSE); 879 REGEX_CHECK_STATUS; 880 881 // Match beyond end of string should fail with an error. 882 status = U_ZERO_ERROR; 883 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 885 886 // Successful match at end of string. 887 { 888 status = U_ZERO_ERROR; 889 RegexMatcher m("A?", 0, status); // will match zero length string. 890 REGEX_CHECK_STATUS; 891 m.reset(inStr1); 892 len = inStr1.length(); 893 REGEX_ASSERT(m.matches(len, status) == TRUE); 894 REGEX_CHECK_STATUS; 895 m.reset(empty); 896 REGEX_ASSERT(m.matches(0, status) == TRUE); 897 REGEX_CHECK_STATUS; 898 } 899 900 901 // 902 // lookingAt(pos, status) 903 // 904 status = U_ZERO_ERROR; 905 m1->reset(instr2); // "not abc" 906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 907 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 908 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 909 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 910 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 911 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 912 status = U_ZERO_ERROR; 913 len = m1->input().length(); 914 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 915 REGEX_CHECK_STATUS; 916 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 917 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 918 919 delete m1; 920 delete pat2; 921 } 922 923 924 // 925 // Capture Group. 926 // RegexMatcher::start(); 927 // RegexMatcher::end(); 928 // RegexMatcher::groupCount(); 929 // 930 { 931 int32_t flags=0; 932 UParseError pe; 933 UErrorCode status=U_ZERO_ERROR; 934 935 UnicodeString re("01(23(45)67)(.*)"); 936 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 937 REGEX_CHECK_STATUS; 938 UnicodeString data = "0123456789"; 939 940 RegexMatcher *matcher = pat->matcher(data, status); 941 REGEX_CHECK_STATUS; 942 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 943 static const int32_t matchStarts[] = {0, 2, 4, 8}; 944 static const int32_t matchEnds[] = {10, 8, 6, 10}; 945 int32_t i; 946 for (i=0; i<4; i++) { 947 int32_t actualStart = matcher->start(i, status); 948 REGEX_CHECK_STATUS; 949 if (actualStart != matchStarts[i]) { 950 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 951 __LINE__, i, matchStarts[i], actualStart); 952 } 953 int32_t actualEnd = matcher->end(i, status); 954 REGEX_CHECK_STATUS; 955 if (actualEnd != matchEnds[i]) { 956 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n", 957 __LINE__, i, matchEnds[i], actualEnd); 958 } 959 } 960 961 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 962 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 963 964 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 965 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 966 matcher->reset(); 967 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 968 969 matcher->lookingAt(status); 970 REGEX_ASSERT(matcher->group(status) == "0123456789"); 971 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 972 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 973 REGEX_ASSERT(matcher->group(2, status) == "45" ); 974 REGEX_ASSERT(matcher->group(3, status) == "89" ); 975 REGEX_CHECK_STATUS; 976 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 977 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 978 matcher->reset(); 979 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 980 981 delete matcher; 982 delete pat; 983 984 } 985 986 // 987 // find 988 // 989 { 990 int32_t flags=0; 991 UParseError pe; 992 UErrorCode status=U_ZERO_ERROR; 993 994 UnicodeString re("abc"); 995 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 996 REGEX_CHECK_STATUS; 997 UnicodeString data = ".abc..abc...abc.."; 998 // 012345678901234567 999 1000 RegexMatcher *matcher = pat->matcher(data, status); 1001 REGEX_CHECK_STATUS; 1002 REGEX_ASSERT(matcher->find()); 1003 REGEX_ASSERT(matcher->start(status) == 1); 1004 REGEX_ASSERT(matcher->find()); 1005 REGEX_ASSERT(matcher->start(status) == 6); 1006 REGEX_ASSERT(matcher->find()); 1007 REGEX_ASSERT(matcher->start(status) == 12); 1008 REGEX_ASSERT(matcher->find() == FALSE); 1009 REGEX_ASSERT(matcher->find() == FALSE); 1010 1011 matcher->reset(); 1012 REGEX_ASSERT(matcher->find()); 1013 REGEX_ASSERT(matcher->start(status) == 1); 1014 1015 REGEX_ASSERT(matcher->find(0, status)); 1016 REGEX_ASSERT(matcher->start(status) == 1); 1017 REGEX_ASSERT(matcher->find(1, status)); 1018 REGEX_ASSERT(matcher->start(status) == 1); 1019 REGEX_ASSERT(matcher->find(2, status)); 1020 REGEX_ASSERT(matcher->start(status) == 6); 1021 REGEX_ASSERT(matcher->find(12, status)); 1022 REGEX_ASSERT(matcher->start(status) == 12); 1023 REGEX_ASSERT(matcher->find(13, status) == FALSE); 1024 REGEX_ASSERT(matcher->find(16, status) == FALSE); 1025 REGEX_ASSERT(matcher->find(17, status) == FALSE); 1026 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1027 1028 status = U_ZERO_ERROR; 1029 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1030 status = U_ZERO_ERROR; 1031 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 1032 1033 REGEX_ASSERT(matcher->groupCount() == 0); 1034 1035 delete matcher; 1036 delete pat; 1037 } 1038 1039 1040 // 1041 // find, with \G in pattern (true if at the end of a previous match). 1042 // 1043 { 1044 int32_t flags=0; 1045 UParseError pe; 1046 UErrorCode status=U_ZERO_ERROR; 1047 1048 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 1049 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1050 REGEX_CHECK_STATUS; 1051 UnicodeString data = ".abcabc.abc.."; 1052 // 012345678901234567 1053 1054 RegexMatcher *matcher = pat->matcher(data, status); 1055 REGEX_CHECK_STATUS; 1056 REGEX_ASSERT(matcher->find()); 1057 REGEX_ASSERT(matcher->start(status) == 0); 1058 REGEX_ASSERT(matcher->start(1, status) == -1); 1059 REGEX_ASSERT(matcher->start(2, status) == 1); 1060 1061 REGEX_ASSERT(matcher->find()); 1062 REGEX_ASSERT(matcher->start(status) == 4); 1063 REGEX_ASSERT(matcher->start(1, status) == 4); 1064 REGEX_ASSERT(matcher->start(2, status) == -1); 1065 REGEX_CHECK_STATUS; 1066 1067 delete matcher; 1068 delete pat; 1069 } 1070 1071 // 1072 // find with zero length matches, match position should bump ahead 1073 // to prevent loops. 1074 // 1075 { 1076 int32_t i; 1077 UErrorCode status=U_ZERO_ERROR; 1078 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1079 // using an always-true look-ahead. 1080 REGEX_CHECK_STATUS; 1081 UnicodeString s(" "); 1082 m.reset(s); 1083 for (i=0; ; i++) { 1084 if (m.find() == FALSE) { 1085 break; 1086 } 1087 REGEX_ASSERT(m.start(status) == i); 1088 REGEX_ASSERT(m.end(status) == i); 1089 } 1090 REGEX_ASSERT(i==5); 1091 1092 // Check that the bump goes over surrogate pairs OK 1093 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1094 s = s.unescape(); 1095 m.reset(s); 1096 for (i=0; ; i+=2) { 1097 if (m.find() == FALSE) { 1098 break; 1099 } 1100 REGEX_ASSERT(m.start(status) == i); 1101 REGEX_ASSERT(m.end(status) == i); 1102 } 1103 REGEX_ASSERT(i==10); 1104 } 1105 { 1106 // find() loop breaking test. 1107 // with pattern of /.?/, should see a series of one char matches, then a single 1108 // match of zero length at the end of the input string. 1109 int32_t i; 1110 UErrorCode status=U_ZERO_ERROR; 1111 RegexMatcher m(".?", 0, status); 1112 REGEX_CHECK_STATUS; 1113 UnicodeString s(" "); 1114 m.reset(s); 1115 for (i=0; ; i++) { 1116 if (m.find() == FALSE) { 1117 break; 1118 } 1119 REGEX_ASSERT(m.start(status) == i); 1120 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1121 } 1122 REGEX_ASSERT(i==5); 1123 } 1124 1125 1126 // 1127 // Matchers with no input string behave as if they had an empty input string. 1128 // 1129 1130 { 1131 UErrorCode status = U_ZERO_ERROR; 1132 RegexMatcher m(".?", 0, status); 1133 REGEX_CHECK_STATUS; 1134 REGEX_ASSERT(m.find()); 1135 REGEX_ASSERT(m.start(status) == 0); 1136 REGEX_ASSERT(m.input() == ""); 1137 } 1138 { 1139 UErrorCode status = U_ZERO_ERROR; 1140 RegexPattern *p = RegexPattern::compile(".", 0, status); 1141 RegexMatcher *m = p->matcher(status); 1142 REGEX_CHECK_STATUS; 1143 1144 REGEX_ASSERT(m->find() == FALSE); 1145 REGEX_ASSERT(m->input() == ""); 1146 delete m; 1147 delete p; 1148 } 1149 1150 // 1151 // Regions 1152 // 1153 { 1154 UErrorCode status = U_ZERO_ERROR; 1155 UnicodeString testString("This is test data"); 1156 RegexMatcher m(".*", testString, 0, status); 1157 REGEX_CHECK_STATUS; 1158 REGEX_ASSERT(m.regionStart() == 0); 1159 REGEX_ASSERT(m.regionEnd() == testString.length()); 1160 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1162 1163 m.region(2,4, status); 1164 REGEX_CHECK_STATUS; 1165 REGEX_ASSERT(m.matches(status)); 1166 REGEX_ASSERT(m.start(status)==2); 1167 REGEX_ASSERT(m.end(status)==4); 1168 REGEX_CHECK_STATUS; 1169 1170 m.reset(); 1171 REGEX_ASSERT(m.regionStart() == 0); 1172 REGEX_ASSERT(m.regionEnd() == testString.length()); 1173 1174 UnicodeString shorterString("short"); 1175 m.reset(shorterString); 1176 REGEX_ASSERT(m.regionStart() == 0); 1177 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1178 1179 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1180 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1181 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1182 REGEX_ASSERT(&m == &m.reset()); 1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1184 1185 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1186 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1187 REGEX_ASSERT(&m == &m.reset()); 1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1189 1190 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1191 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 1192 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1193 REGEX_ASSERT(&m == &m.reset()); 1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1195 1196 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1197 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1198 REGEX_ASSERT(&m == &m.reset()); 1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1200 1201 } 1202 1203 // 1204 // hitEnd() and requireEnd() 1205 // 1206 { 1207 UErrorCode status = U_ZERO_ERROR; 1208 UnicodeString testString("aabb"); 1209 RegexMatcher m1(".*", testString, 0, status); 1210 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1211 REGEX_ASSERT(m1.hitEnd() == TRUE); 1212 REGEX_ASSERT(m1.requireEnd() == FALSE); 1213 REGEX_CHECK_STATUS; 1214 1215 status = U_ZERO_ERROR; 1216 RegexMatcher m2("a*", testString, 0, status); 1217 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1218 REGEX_ASSERT(m2.hitEnd() == FALSE); 1219 REGEX_ASSERT(m2.requireEnd() == FALSE); 1220 REGEX_CHECK_STATUS; 1221 1222 status = U_ZERO_ERROR; 1223 RegexMatcher m3(".*$", testString, 0, status); 1224 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1225 REGEX_ASSERT(m3.hitEnd() == TRUE); 1226 REGEX_ASSERT(m3.requireEnd() == TRUE); 1227 REGEX_CHECK_STATUS; 1228 } 1229 1230 1231 // 1232 // Compilation error on reset with UChar * 1233 // These were a hazard that people were stumbling over with runtime errors. 1234 // Changed them to compiler errors by adding private methods that more closely 1235 // matched the incorrect use of the functions. 1236 // 1237 #if 0 1238 { 1239 UErrorCode status = U_ZERO_ERROR; 1240 UChar ucharString[20]; 1241 RegexMatcher m(".", 0, status); 1242 m.reset(ucharString); // should not compile. 1243 1244 RegexPattern *p = RegexPattern::compile(".", 0, status); 1245 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1246 1247 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1248 } 1249 #endif 1250 1251 // 1252 // Time Outs. 1253 // Note: These tests will need to be changed when the regexp engine is 1254 // able to detect and cut short the exponential time behavior on 1255 // this type of match. 1256 // 1257 { 1258 UErrorCode status = U_ZERO_ERROR; 1259 // Enough 'a's in the string to cause the match to time out. 1260 // (Each on additonal 'a' doubles the time) 1261 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1262 RegexMatcher matcher("(a+)+b", testString, 0, status); 1263 REGEX_CHECK_STATUS; 1264 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1265 matcher.setTimeLimit(100, status); 1266 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1268 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1269 } 1270 { 1271 UErrorCode status = U_ZERO_ERROR; 1272 // Few enough 'a's to slip in under the time limit. 1273 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1274 RegexMatcher matcher("(a+)+b", testString, 0, status); 1275 REGEX_CHECK_STATUS; 1276 matcher.setTimeLimit(100, status); 1277 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1278 REGEX_CHECK_STATUS; 1279 } 1280 1281 // 1282 // Stack Limits 1283 // 1284 { 1285 UErrorCode status = U_ZERO_ERROR; 1286 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1287 1288 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1289 // of the '+', and makes the stack frames larger. 1290 RegexMatcher matcher("(A)+A$", testString, 0, status); 1291 1292 // With the default stack, this match should fail to run 1293 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1294 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1295 1296 // With unlimited stack, it should run 1297 status = U_ZERO_ERROR; 1298 matcher.setStackLimit(0, status); 1299 REGEX_CHECK_STATUS; 1300 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1301 REGEX_CHECK_STATUS; 1302 REGEX_ASSERT(matcher.getStackLimit() == 0); 1303 1304 // With a limited stack, it the match should fail 1305 status = U_ZERO_ERROR; 1306 matcher.setStackLimit(10000, status); 1307 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1308 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1309 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1310 } 1311 1312 // A pattern that doesn't save state should work with 1313 // a minimal sized stack 1314 { 1315 UErrorCode status = U_ZERO_ERROR; 1316 UnicodeString testString = "abc"; 1317 RegexMatcher matcher("abc", testString, 0, status); 1318 REGEX_CHECK_STATUS; 1319 matcher.setStackLimit(30, status); 1320 REGEX_CHECK_STATUS; 1321 REGEX_ASSERT(matcher.matches(status) == TRUE); 1322 REGEX_CHECK_STATUS; 1323 REGEX_ASSERT(matcher.getStackLimit() == 30); 1324 1325 // Negative stack sizes should fail 1326 status = U_ZERO_ERROR; 1327 matcher.setStackLimit(1000, status); 1328 REGEX_CHECK_STATUS; 1329 matcher.setStackLimit(-1, status); 1330 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1331 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1332 } 1333 1334 1335 } 1336 1337 1338 1339 1340 1341 1342 //--------------------------------------------------------------------------- 1343 // 1344 // API_Replace API test for class RegexMatcher, testing the 1345 // Replace family of functions. 1346 // 1347 //--------------------------------------------------------------------------- 1348 void RegexTest::API_Replace() { 1349 // 1350 // Replace 1351 // 1352 int32_t flags=0; 1353 UParseError pe; 1354 UErrorCode status=U_ZERO_ERROR; 1355 1356 UnicodeString re("abc"); 1357 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1358 REGEX_CHECK_STATUS; 1359 UnicodeString data = ".abc..abc...abc.."; 1360 // 012345678901234567 1361 RegexMatcher *matcher = pat->matcher(data, status); 1362 1363 // 1364 // Plain vanilla matches. 1365 // 1366 UnicodeString dest; 1367 dest = matcher->replaceFirst("yz", status); 1368 REGEX_CHECK_STATUS; 1369 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1370 1371 dest = matcher->replaceAll("yz", status); 1372 REGEX_CHECK_STATUS; 1373 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1374 1375 // 1376 // Plain vanilla non-matches. 1377 // 1378 UnicodeString d2 = ".abx..abx...abx.."; 1379 matcher->reset(d2); 1380 dest = matcher->replaceFirst("yz", status); 1381 REGEX_CHECK_STATUS; 1382 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1383 1384 dest = matcher->replaceAll("yz", status); 1385 REGEX_CHECK_STATUS; 1386 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1387 1388 // 1389 // Empty source string 1390 // 1391 UnicodeString d3 = ""; 1392 matcher->reset(d3); 1393 dest = matcher->replaceFirst("yz", status); 1394 REGEX_CHECK_STATUS; 1395 REGEX_ASSERT(dest == ""); 1396 1397 dest = matcher->replaceAll("yz", status); 1398 REGEX_CHECK_STATUS; 1399 REGEX_ASSERT(dest == ""); 1400 1401 // 1402 // Empty substitution string 1403 // 1404 matcher->reset(data); // ".abc..abc...abc.." 1405 dest = matcher->replaceFirst("", status); 1406 REGEX_CHECK_STATUS; 1407 REGEX_ASSERT(dest == "...abc...abc.."); 1408 1409 dest = matcher->replaceAll("", status); 1410 REGEX_CHECK_STATUS; 1411 REGEX_ASSERT(dest == "........"); 1412 1413 // 1414 // match whole string 1415 // 1416 UnicodeString d4 = "abc"; 1417 matcher->reset(d4); 1418 dest = matcher->replaceFirst("xyz", status); 1419 REGEX_CHECK_STATUS; 1420 REGEX_ASSERT(dest == "xyz"); 1421 1422 dest = matcher->replaceAll("xyz", status); 1423 REGEX_CHECK_STATUS; 1424 REGEX_ASSERT(dest == "xyz"); 1425 1426 // 1427 // Capture Group, simple case 1428 // 1429 UnicodeString re2("a(..)"); 1430 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1431 REGEX_CHECK_STATUS; 1432 UnicodeString d5 = "abcdefg"; 1433 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1434 REGEX_CHECK_STATUS; 1435 dest = matcher2->replaceFirst("$1$1", status); 1436 REGEX_CHECK_STATUS; 1437 REGEX_ASSERT(dest == "bcbcdefg"); 1438 1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1440 REGEX_CHECK_STATUS; 1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1442 1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1444 REGEX_ASSERT(U_FAILURE(status)); 1445 status = U_ZERO_ERROR; 1446 1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1448 replacement = replacement.unescape(); 1449 dest = matcher2->replaceFirst(replacement, status); 1450 REGEX_CHECK_STATUS; 1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1452 1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1454 1455 1456 // 1457 // Replacement String with \u hex escapes 1458 // 1459 { 1460 UnicodeString src = "abc 1 abc 2 abc 3"; 1461 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1462 matcher->reset(src); 1463 UnicodeString result = matcher->replaceAll(substitute, status); 1464 REGEX_CHECK_STATUS; 1465 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1466 } 1467 { 1468 UnicodeString src = "abc !"; 1469 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1470 matcher->reset(src); 1471 UnicodeString result = matcher->replaceAll(substitute, status); 1472 REGEX_CHECK_STATUS; 1473 UnicodeString expected = UnicodeString("--"); 1474 expected.append((UChar32)0x10000); 1475 expected.append("-- !"); 1476 REGEX_ASSERT(result == expected); 1477 } 1478 // TODO: need more through testing of capture substitutions. 1479 1480 // Bug 4057 1481 // 1482 { 1483 status = U_ZERO_ERROR; 1484 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1485 RegexMatcher m("ss(.*?)ee", 0, status); 1486 REGEX_CHECK_STATUS; 1487 UnicodeString result; 1488 1489 // Multiple finds do NOT bump up the previous appendReplacement postion. 1490 m.reset(s); 1491 m.find(); 1492 m.find(); 1493 m.appendReplacement(result, "ooh", status); 1494 REGEX_CHECK_STATUS; 1495 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1496 1497 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1498 status = U_ZERO_ERROR; 1499 result.truncate(0); 1500 m.reset(10, status); 1501 m.find(); 1502 m.find(); 1503 m.appendReplacement(result, "ooh", status); 1504 REGEX_CHECK_STATUS; 1505 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1506 1507 // find() at interior of string, appendReplacemnt still starts at beginning. 1508 status = U_ZERO_ERROR; 1509 result.truncate(0); 1510 m.reset(); 1511 m.find(10, status); 1512 m.find(); 1513 m.appendReplacement(result, "ooh", status); 1514 REGEX_CHECK_STATUS; 1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1516 1517 m.appendTail(result); 1518 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1519 1520 } 1521 1522 delete matcher2; 1523 delete pat2; 1524 delete matcher; 1525 delete pat; 1526 } 1527 1528 1529 //--------------------------------------------------------------------------- 1530 // 1531 // API_Pattern Test that the API for class RegexPattern is 1532 // present and nominally working. 1533 // 1534 //--------------------------------------------------------------------------- 1535 void RegexTest::API_Pattern() { 1536 RegexPattern pata; // Test default constructor to not crash. 1537 RegexPattern patb; 1538 1539 REGEX_ASSERT(pata == patb); 1540 REGEX_ASSERT(pata == pata); 1541 1542 UnicodeString re1("abc[a-l][m-z]"); 1543 UnicodeString re2("def"); 1544 UErrorCode status = U_ZERO_ERROR; 1545 UParseError pe; 1546 1547 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1548 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1549 REGEX_CHECK_STATUS; 1550 REGEX_ASSERT(*pat1 == *pat1); 1551 REGEX_ASSERT(*pat1 != pata); 1552 1553 // Assign 1554 patb = *pat1; 1555 REGEX_ASSERT(patb == *pat1); 1556 1557 // Copy Construct 1558 RegexPattern patc(*pat1); 1559 REGEX_ASSERT(patc == *pat1); 1560 REGEX_ASSERT(patb == patc); 1561 REGEX_ASSERT(pat1 != pat2); 1562 patb = *pat2; 1563 REGEX_ASSERT(patb != patc); 1564 REGEX_ASSERT(patb == *pat2); 1565 1566 // Compile with no flags. 1567 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1568 REGEX_ASSERT(*pat1a == *pat1); 1569 1570 REGEX_ASSERT(pat1a->flags() == 0); 1571 1572 // Compile with different flags should be not equal 1573 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1574 REGEX_CHECK_STATUS; 1575 1576 REGEX_ASSERT(*pat1b != *pat1a); 1577 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1578 REGEX_ASSERT(pat1a->flags() == 0); 1579 delete pat1b; 1580 1581 // clone 1582 RegexPattern *pat1c = pat1->clone(); 1583 REGEX_ASSERT(*pat1c == *pat1); 1584 REGEX_ASSERT(*pat1c != *pat2); 1585 1586 delete pat1c; 1587 delete pat1a; 1588 delete pat1; 1589 delete pat2; 1590 1591 1592 // 1593 // Verify that a matcher created from a cloned pattern works. 1594 // (Jitterbug 3423) 1595 // 1596 { 1597 UErrorCode status = U_ZERO_ERROR; 1598 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1599 RegexPattern *pClone = pSource->clone(); 1600 delete pSource; 1601 RegexMatcher *mFromClone = pClone->matcher(status); 1602 REGEX_CHECK_STATUS; 1603 UnicodeString s = "Hello World"; 1604 mFromClone->reset(s); 1605 REGEX_ASSERT(mFromClone->find() == TRUE); 1606 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1607 REGEX_ASSERT(mFromClone->find() == TRUE); 1608 REGEX_ASSERT(mFromClone->group(status) == "World"); 1609 REGEX_ASSERT(mFromClone->find() == FALSE); 1610 delete mFromClone; 1611 delete pClone; 1612 } 1613 1614 // 1615 // matches convenience API 1616 // 1617 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1618 REGEX_CHECK_STATUS; 1619 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1620 REGEX_CHECK_STATUS; 1621 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1622 REGEX_CHECK_STATUS; 1623 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1624 REGEX_CHECK_STATUS; 1625 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1626 REGEX_CHECK_STATUS; 1627 status = U_INDEX_OUTOFBOUNDS_ERROR; 1628 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1629 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1630 1631 1632 // 1633 // Split() 1634 // 1635 status = U_ZERO_ERROR; 1636 pat1 = RegexPattern::compile(" +", pe, status); 1637 REGEX_CHECK_STATUS; 1638 UnicodeString fields[10]; 1639 1640 int32_t n; 1641 n = pat1->split("Now is the time", fields, 10, status); 1642 REGEX_CHECK_STATUS; 1643 REGEX_ASSERT(n==4); 1644 REGEX_ASSERT(fields[0]=="Now"); 1645 REGEX_ASSERT(fields[1]=="is"); 1646 REGEX_ASSERT(fields[2]=="the"); 1647 REGEX_ASSERT(fields[3]=="time"); 1648 REGEX_ASSERT(fields[4]==""); 1649 1650 n = pat1->split("Now is the time", fields, 2, status); 1651 REGEX_CHECK_STATUS; 1652 REGEX_ASSERT(n==2); 1653 REGEX_ASSERT(fields[0]=="Now"); 1654 REGEX_ASSERT(fields[1]=="is the time"); 1655 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1656 1657 fields[1] = "*"; 1658 status = U_ZERO_ERROR; 1659 n = pat1->split("Now is the time", fields, 1, status); 1660 REGEX_CHECK_STATUS; 1661 REGEX_ASSERT(n==1); 1662 REGEX_ASSERT(fields[0]=="Now is the time"); 1663 REGEX_ASSERT(fields[1]=="*"); 1664 status = U_ZERO_ERROR; 1665 1666 n = pat1->split(" Now is the time ", fields, 10, status); 1667 REGEX_CHECK_STATUS; 1668 REGEX_ASSERT(n==6); 1669 REGEX_ASSERT(fields[0]==""); 1670 REGEX_ASSERT(fields[1]=="Now"); 1671 REGEX_ASSERT(fields[2]=="is"); 1672 REGEX_ASSERT(fields[3]=="the"); 1673 REGEX_ASSERT(fields[4]=="time"); 1674 REGEX_ASSERT(fields[5]==""); 1675 1676 n = pat1->split(" ", fields, 10, status); 1677 REGEX_CHECK_STATUS; 1678 REGEX_ASSERT(n==2); 1679 REGEX_ASSERT(fields[0]==""); 1680 REGEX_ASSERT(fields[1]==""); 1681 1682 fields[0] = "foo"; 1683 n = pat1->split("", fields, 10, status); 1684 REGEX_CHECK_STATUS; 1685 REGEX_ASSERT(n==0); 1686 REGEX_ASSERT(fields[0]=="foo"); 1687 1688 delete pat1; 1689 1690 // split, with a pattern with (capture) 1691 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1692 REGEX_CHECK_STATUS; 1693 1694 status = U_ZERO_ERROR; 1695 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1696 REGEX_CHECK_STATUS; 1697 REGEX_ASSERT(n==7); 1698 REGEX_ASSERT(fields[0]==""); 1699 REGEX_ASSERT(fields[1]=="a"); 1700 REGEX_ASSERT(fields[2]=="Now is "); 1701 REGEX_ASSERT(fields[3]=="b"); 1702 REGEX_ASSERT(fields[4]=="the time"); 1703 REGEX_ASSERT(fields[5]=="c"); 1704 REGEX_ASSERT(fields[6]==""); 1705 REGEX_ASSERT(status==U_ZERO_ERROR); 1706 1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1708 REGEX_CHECK_STATUS; 1709 REGEX_ASSERT(n==7); 1710 REGEX_ASSERT(fields[0]==" "); 1711 REGEX_ASSERT(fields[1]=="a"); 1712 REGEX_ASSERT(fields[2]=="Now is "); 1713 REGEX_ASSERT(fields[3]=="b"); 1714 REGEX_ASSERT(fields[4]=="the time"); 1715 REGEX_ASSERT(fields[5]=="c"); 1716 REGEX_ASSERT(fields[6]==""); 1717 1718 status = U_ZERO_ERROR; 1719 fields[6] = "foo"; 1720 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1721 REGEX_CHECK_STATUS; 1722 REGEX_ASSERT(n==6); 1723 REGEX_ASSERT(fields[0]==" "); 1724 REGEX_ASSERT(fields[1]=="a"); 1725 REGEX_ASSERT(fields[2]=="Now is "); 1726 REGEX_ASSERT(fields[3]=="b"); 1727 REGEX_ASSERT(fields[4]=="the time"); 1728 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 1729 REGEX_ASSERT(fields[6]=="foo"); 1730 1731 status = U_ZERO_ERROR; 1732 fields[5] = "foo"; 1733 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1734 REGEX_CHECK_STATUS; 1735 REGEX_ASSERT(n==5); 1736 REGEX_ASSERT(fields[0]==" "); 1737 REGEX_ASSERT(fields[1]=="a"); 1738 REGEX_ASSERT(fields[2]=="Now is "); 1739 REGEX_ASSERT(fields[3]=="b"); 1740 REGEX_ASSERT(fields[4]=="the time<c>"); 1741 REGEX_ASSERT(fields[5]=="foo"); 1742 1743 status = U_ZERO_ERROR; 1744 fields[5] = "foo"; 1745 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1746 REGEX_CHECK_STATUS; 1747 REGEX_ASSERT(n==5); 1748 REGEX_ASSERT(fields[0]==" "); 1749 REGEX_ASSERT(fields[1]=="a"); 1750 REGEX_ASSERT(fields[2]=="Now is "); 1751 REGEX_ASSERT(fields[3]=="b"); 1752 REGEX_ASSERT(fields[4]=="the time"); 1753 REGEX_ASSERT(fields[5]=="foo"); 1754 1755 status = U_ZERO_ERROR; 1756 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1757 REGEX_CHECK_STATUS; 1758 REGEX_ASSERT(n==4); 1759 REGEX_ASSERT(fields[0]==" "); 1760 REGEX_ASSERT(fields[1]=="a"); 1761 REGEX_ASSERT(fields[2]=="Now is "); 1762 REGEX_ASSERT(fields[3]=="the time<c>"); 1763 status = U_ZERO_ERROR; 1764 delete pat1; 1765 1766 pat1 = RegexPattern::compile("([-,])", pe, status); 1767 REGEX_CHECK_STATUS; 1768 n = pat1->split("1-10,20", fields, 10, status); 1769 REGEX_CHECK_STATUS; 1770 REGEX_ASSERT(n==5); 1771 REGEX_ASSERT(fields[0]=="1"); 1772 REGEX_ASSERT(fields[1]=="-"); 1773 REGEX_ASSERT(fields[2]=="10"); 1774 REGEX_ASSERT(fields[3]==","); 1775 REGEX_ASSERT(fields[4]=="20"); 1776 delete pat1; 1777 1778 // Test split of string with empty trailing fields 1779 pat1 = RegexPattern::compile(",", pe, status); 1780 REGEX_CHECK_STATUS; 1781 n = pat1->split("a,b,c,", fields, 10, status); 1782 REGEX_CHECK_STATUS; 1783 REGEX_ASSERT(n==4); 1784 REGEX_ASSERT(fields[0]=="a"); 1785 REGEX_ASSERT(fields[1]=="b"); 1786 REGEX_ASSERT(fields[2]=="c"); 1787 REGEX_ASSERT(fields[3]==""); 1788 1789 n = pat1->split("a,,,", fields, 10, status); 1790 REGEX_CHECK_STATUS; 1791 REGEX_ASSERT(n==4); 1792 REGEX_ASSERT(fields[0]=="a"); 1793 REGEX_ASSERT(fields[1]==""); 1794 REGEX_ASSERT(fields[2]==""); 1795 REGEX_ASSERT(fields[3]==""); 1796 delete pat1; 1797 1798 // Split Separator with zero length match. 1799 pat1 = RegexPattern::compile(":?", pe, status); 1800 REGEX_CHECK_STATUS; 1801 n = pat1->split("abc", fields, 10, status); 1802 REGEX_CHECK_STATUS; 1803 REGEX_ASSERT(n==5); 1804 REGEX_ASSERT(fields[0]==""); 1805 REGEX_ASSERT(fields[1]=="a"); 1806 REGEX_ASSERT(fields[2]=="b"); 1807 REGEX_ASSERT(fields[3]=="c"); 1808 REGEX_ASSERT(fields[4]==""); 1809 1810 delete pat1; 1811 1812 // 1813 // RegexPattern::pattern() 1814 // 1815 pat1 = new RegexPattern(); 1816 REGEX_ASSERT(pat1->pattern() == ""); 1817 delete pat1; 1818 1819 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1820 REGEX_CHECK_STATUS; 1821 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1822 delete pat1; 1823 1824 1825 // 1826 // classID functions 1827 // 1828 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1829 REGEX_CHECK_STATUS; 1830 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1831 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1832 UnicodeString Hello("Hello, world."); 1833 RegexMatcher *m = pat1->matcher(Hello, status); 1834 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1835 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1836 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1837 delete m; 1838 delete pat1; 1839 1840 } 1841 1842 //--------------------------------------------------------------------------- 1843 // 1844 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1845 // is present and working, but excluding functions 1846 // implementing replace operations. 1847 // 1848 //--------------------------------------------------------------------------- 1849 void RegexTest::API_Match_UTF8() { 1850 UParseError pe; 1851 UErrorCode status=U_ZERO_ERROR; 1852 int32_t flags = 0; 1853 1854 // 1855 // Debug - slide failing test cases early 1856 // 1857 #if 0 1858 { 1859 } 1860 return; 1861 #endif 1862 1863 // 1864 // Simple pattern compilation 1865 // 1866 { 1867 UText re = UTEXT_INITIALIZER; 1868 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1869 REGEX_VERBOSE_TEXT(&re); 1870 RegexPattern *pat2; 1871 pat2 = RegexPattern::compile(&re, flags, pe, status); 1872 REGEX_CHECK_STATUS; 1873 1874 UText input1 = UTEXT_INITIALIZER; 1875 UText input2 = UTEXT_INITIALIZER; 1876 UText empty = UTEXT_INITIALIZER; 1877 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1878 REGEX_VERBOSE_TEXT(&input1); 1879 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1880 REGEX_VERBOSE_TEXT(&input2); 1881 utext_openUChars(&empty, NULL, 0, &status); 1882 1883 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1884 int32_t input2Len = strlen("not abc"); 1885 1886 1887 // 1888 // Matcher creation and reset. 1889 // 1890 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1891 REGEX_CHECK_STATUS; 1892 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1893 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1894 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1895 m1->reset(&input2); 1896 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1897 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1898 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1899 m1->reset(&input1); 1900 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1901 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1902 m1->reset(&empty); 1903 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1904 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1905 1906 // 1907 // reset(pos, status) 1908 // 1909 m1->reset(&input1); 1910 m1->reset(4, status); 1911 REGEX_CHECK_STATUS; 1912 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1913 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1914 1915 m1->reset(-1, status); 1916 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1917 status = U_ZERO_ERROR; 1918 1919 m1->reset(0, status); 1920 REGEX_CHECK_STATUS; 1921 status = U_ZERO_ERROR; 1922 1923 m1->reset(input1Len-1, status); 1924 REGEX_CHECK_STATUS; 1925 status = U_ZERO_ERROR; 1926 1927 m1->reset(input1Len, status); 1928 REGEX_CHECK_STATUS; 1929 status = U_ZERO_ERROR; 1930 1931 m1->reset(input1Len+1, status); 1932 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1933 status = U_ZERO_ERROR; 1934 1935 // 1936 // match(pos, status) 1937 // 1938 m1->reset(&input2); 1939 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1940 m1->reset(); 1941 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1942 m1->reset(); 1943 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1944 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1945 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1946 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1947 1948 // Match() at end of string should fail, but should not 1949 // be an error. 1950 status = U_ZERO_ERROR; 1951 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1952 REGEX_CHECK_STATUS; 1953 1954 // Match beyond end of string should fail with an error. 1955 status = U_ZERO_ERROR; 1956 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1958 1959 // Successful match at end of string. 1960 { 1961 status = U_ZERO_ERROR; 1962 RegexMatcher m("A?", 0, status); // will match zero length string. 1963 REGEX_CHECK_STATUS; 1964 m.reset(&input1); 1965 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1966 REGEX_CHECK_STATUS; 1967 m.reset(&empty); 1968 REGEX_ASSERT(m.matches(0, status) == TRUE); 1969 REGEX_CHECK_STATUS; 1970 } 1971 1972 1973 // 1974 // lookingAt(pos, status) 1975 // 1976 status = U_ZERO_ERROR; 1977 m1->reset(&input2); // "not abc" 1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1979 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1980 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1981 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1982 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1983 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1984 status = U_ZERO_ERROR; 1985 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1986 REGEX_CHECK_STATUS; 1987 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1988 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1989 1990 delete m1; 1991 delete pat2; 1992 1993 utext_close(&re); 1994 utext_close(&input1); 1995 utext_close(&input2); 1996 utext_close(&empty); 1997 } 1998 1999 2000 // 2001 // Capture Group. 2002 // RegexMatcher::start(); 2003 // RegexMatcher::end(); 2004 // RegexMatcher::groupCount(); 2005 // 2006 { 2007 int32_t flags=0; 2008 UParseError pe; 2009 UErrorCode status=U_ZERO_ERROR; 2010 UText re=UTEXT_INITIALIZER; 2011 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 2012 utext_openUTF8(&re, str_01234567_pat, -1, &status); 2013 2014 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2015 REGEX_CHECK_STATUS; 2016 2017 UText input = UTEXT_INITIALIZER; 2018 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2019 utext_openUTF8(&input, str_0123456789, -1, &status); 2020 2021 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2022 REGEX_CHECK_STATUS; 2023 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 2024 static const int32_t matchStarts[] = {0, 2, 4, 8}; 2025 static const int32_t matchEnds[] = {10, 8, 6, 10}; 2026 int32_t i; 2027 for (i=0; i<4; i++) { 2028 int32_t actualStart = matcher->start(i, status); 2029 REGEX_CHECK_STATUS; 2030 if (actualStart != matchStarts[i]) { 2031 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 2032 __FILE__, __LINE__, i, matchStarts[i], actualStart); 2033 } 2034 int32_t actualEnd = matcher->end(i, status); 2035 REGEX_CHECK_STATUS; 2036 if (actualEnd != matchEnds[i]) { 2037 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n", 2038 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 2039 } 2040 } 2041 2042 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 2043 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 2044 2045 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2046 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2047 matcher->reset(); 2048 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 2049 2050 matcher->lookingAt(status); 2051 2052 UnicodeString dest; 2053 UText destText = UTEXT_INITIALIZER; 2054 utext_openUnicodeString(&destText, &dest, &status); 2055 UText *result; 2056 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2057 // Test shallow-clone API 2058 int64_t group_len; 2059 result = matcher->group((UText *)NULL, group_len, status); 2060 REGEX_CHECK_STATUS; 2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2062 utext_close(result); 2063 result = matcher->group(0, &destText, group_len, status); 2064 REGEX_CHECK_STATUS; 2065 REGEX_ASSERT(result == &destText); 2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2067 // destText is now immutable, reopen it 2068 utext_close(&destText); 2069 utext_openUnicodeString(&destText, &dest, &status); 2070 2071 int64_t length; 2072 result = matcher->group(0, NULL, length, status); 2073 REGEX_CHECK_STATUS; 2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2075 utext_close(result); 2076 result = matcher->group(0, &destText, length, status); 2077 REGEX_CHECK_STATUS; 2078 REGEX_ASSERT(result == &destText); 2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0); 2080 REGEX_ASSERT(length == 10); 2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2082 2083 // Capture Group 1 == "234567" 2084 result = matcher->group(1, NULL, length, status); 2085 REGEX_CHECK_STATUS; 2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2087 REGEX_ASSERT(length == 6); 2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2089 utext_close(result); 2090 2091 result = matcher->group(1, &destText, length, status); 2092 REGEX_CHECK_STATUS; 2093 REGEX_ASSERT(result == &destText); 2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2095 REGEX_ASSERT(length == 6); 2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2097 utext_close(result); 2098 2099 // Capture Group 2 == "45" 2100 result = matcher->group(2, NULL, length, status); 2101 REGEX_CHECK_STATUS; 2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2103 REGEX_ASSERT(length == 2); 2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2105 utext_close(result); 2106 2107 result = matcher->group(2, &destText, length, status); 2108 REGEX_CHECK_STATUS; 2109 REGEX_ASSERT(result == &destText); 2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2111 REGEX_ASSERT(length == 2); 2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2113 utext_close(result); 2114 2115 // Capture Group 3 == "89" 2116 result = matcher->group(3, NULL, length, status); 2117 REGEX_CHECK_STATUS; 2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2119 REGEX_ASSERT(length == 2); 2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2121 utext_close(result); 2122 2123 result = matcher->group(3, &destText, length, status); 2124 REGEX_CHECK_STATUS; 2125 REGEX_ASSERT(result == &destText); 2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2127 REGEX_ASSERT(length == 2); 2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2129 utext_close(result); 2130 2131 // Capture Group number out of range. 2132 status = U_ZERO_ERROR; 2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2134 status = U_ZERO_ERROR; 2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2136 status = U_ZERO_ERROR; 2137 matcher->reset(); 2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2139 2140 delete matcher; 2141 delete pat; 2142 2143 utext_close(&destText); 2144 utext_close(&input); 2145 utext_close(&re); 2146 } 2147 2148 // 2149 // find 2150 // 2151 { 2152 int32_t flags=0; 2153 UParseError pe; 2154 UErrorCode status=U_ZERO_ERROR; 2155 UText re=UTEXT_INITIALIZER; 2156 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2157 utext_openUTF8(&re, str_abc, -1, &status); 2158 2159 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2160 REGEX_CHECK_STATUS; 2161 UText input = UTEXT_INITIALIZER; 2162 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2163 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2164 // 012345678901234567 2165 2166 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2167 REGEX_CHECK_STATUS; 2168 REGEX_ASSERT(matcher->find()); 2169 REGEX_ASSERT(matcher->start(status) == 1); 2170 REGEX_ASSERT(matcher->find()); 2171 REGEX_ASSERT(matcher->start(status) == 6); 2172 REGEX_ASSERT(matcher->find()); 2173 REGEX_ASSERT(matcher->start(status) == 12); 2174 REGEX_ASSERT(matcher->find() == FALSE); 2175 REGEX_ASSERT(matcher->find() == FALSE); 2176 2177 matcher->reset(); 2178 REGEX_ASSERT(matcher->find()); 2179 REGEX_ASSERT(matcher->start(status) == 1); 2180 2181 REGEX_ASSERT(matcher->find(0, status)); 2182 REGEX_ASSERT(matcher->start(status) == 1); 2183 REGEX_ASSERT(matcher->find(1, status)); 2184 REGEX_ASSERT(matcher->start(status) == 1); 2185 REGEX_ASSERT(matcher->find(2, status)); 2186 REGEX_ASSERT(matcher->start(status) == 6); 2187 REGEX_ASSERT(matcher->find(12, status)); 2188 REGEX_ASSERT(matcher->start(status) == 12); 2189 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2190 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2191 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2192 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2193 2194 status = U_ZERO_ERROR; 2195 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2196 status = U_ZERO_ERROR; 2197 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2198 2199 REGEX_ASSERT(matcher->groupCount() == 0); 2200 2201 delete matcher; 2202 delete pat; 2203 2204 utext_close(&input); 2205 utext_close(&re); 2206 } 2207 2208 2209 // 2210 // find, with \G in pattern (true if at the end of a previous match). 2211 // 2212 { 2213 int32_t flags=0; 2214 UParseError pe; 2215 UErrorCode status=U_ZERO_ERROR; 2216 UText re=UTEXT_INITIALIZER; 2217 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2218 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2219 2220 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2221 2222 REGEX_CHECK_STATUS; 2223 UText input = UTEXT_INITIALIZER; 2224 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2225 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2226 // 012345678901234567 2227 2228 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2229 REGEX_CHECK_STATUS; 2230 REGEX_ASSERT(matcher->find()); 2231 REGEX_ASSERT(matcher->start(status) == 0); 2232 REGEX_ASSERT(matcher->start(1, status) == -1); 2233 REGEX_ASSERT(matcher->start(2, status) == 1); 2234 2235 REGEX_ASSERT(matcher->find()); 2236 REGEX_ASSERT(matcher->start(status) == 4); 2237 REGEX_ASSERT(matcher->start(1, status) == 4); 2238 REGEX_ASSERT(matcher->start(2, status) == -1); 2239 REGEX_CHECK_STATUS; 2240 2241 delete matcher; 2242 delete pat; 2243 2244 utext_close(&input); 2245 utext_close(&re); 2246 } 2247 2248 // 2249 // find with zero length matches, match position should bump ahead 2250 // to prevent loops. 2251 // 2252 { 2253 int32_t i; 2254 UErrorCode status=U_ZERO_ERROR; 2255 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2256 // using an always-true look-ahead. 2257 REGEX_CHECK_STATUS; 2258 UText s = UTEXT_INITIALIZER; 2259 utext_openUTF8(&s, " ", -1, &status); 2260 m.reset(&s); 2261 for (i=0; ; i++) { 2262 if (m.find() == FALSE) { 2263 break; 2264 } 2265 REGEX_ASSERT(m.start(status) == i); 2266 REGEX_ASSERT(m.end(status) == i); 2267 } 2268 REGEX_ASSERT(i==5); 2269 2270 // Check that the bump goes over characters outside the BMP OK 2271 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2272 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2273 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2274 m.reset(&s); 2275 for (i=0; ; i+=4) { 2276 if (m.find() == FALSE) { 2277 break; 2278 } 2279 REGEX_ASSERT(m.start(status) == i); 2280 REGEX_ASSERT(m.end(status) == i); 2281 } 2282 REGEX_ASSERT(i==20); 2283 2284 utext_close(&s); 2285 } 2286 { 2287 // find() loop breaking test. 2288 // with pattern of /.?/, should see a series of one char matches, then a single 2289 // match of zero length at the end of the input string. 2290 int32_t i; 2291 UErrorCode status=U_ZERO_ERROR; 2292 RegexMatcher m(".?", 0, status); 2293 REGEX_CHECK_STATUS; 2294 UText s = UTEXT_INITIALIZER; 2295 utext_openUTF8(&s, " ", -1, &status); 2296 m.reset(&s); 2297 for (i=0; ; i++) { 2298 if (m.find() == FALSE) { 2299 break; 2300 } 2301 REGEX_ASSERT(m.start(status) == i); 2302 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2303 } 2304 REGEX_ASSERT(i==5); 2305 2306 utext_close(&s); 2307 } 2308 2309 2310 // 2311 // Matchers with no input string behave as if they had an empty input string. 2312 // 2313 2314 { 2315 UErrorCode status = U_ZERO_ERROR; 2316 RegexMatcher m(".?", 0, status); 2317 REGEX_CHECK_STATUS; 2318 REGEX_ASSERT(m.find()); 2319 REGEX_ASSERT(m.start(status) == 0); 2320 REGEX_ASSERT(m.input() == ""); 2321 } 2322 { 2323 UErrorCode status = U_ZERO_ERROR; 2324 RegexPattern *p = RegexPattern::compile(".", 0, status); 2325 RegexMatcher *m = p->matcher(status); 2326 REGEX_CHECK_STATUS; 2327 2328 REGEX_ASSERT(m->find() == FALSE); 2329 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2330 delete m; 2331 delete p; 2332 } 2333 2334 // 2335 // Regions 2336 // 2337 { 2338 UErrorCode status = U_ZERO_ERROR; 2339 UText testPattern = UTEXT_INITIALIZER; 2340 UText testText = UTEXT_INITIALIZER; 2341 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2342 REGEX_VERBOSE_TEXT(&testPattern); 2343 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2344 REGEX_VERBOSE_TEXT(&testText); 2345 2346 RegexMatcher m(&testPattern, &testText, 0, status); 2347 REGEX_CHECK_STATUS; 2348 REGEX_ASSERT(m.regionStart() == 0); 2349 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2350 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2352 2353 m.region(2,4, status); 2354 REGEX_CHECK_STATUS; 2355 REGEX_ASSERT(m.matches(status)); 2356 REGEX_ASSERT(m.start(status)==2); 2357 REGEX_ASSERT(m.end(status)==4); 2358 REGEX_CHECK_STATUS; 2359 2360 m.reset(); 2361 REGEX_ASSERT(m.regionStart() == 0); 2362 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2363 2364 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2365 REGEX_VERBOSE_TEXT(&testText); 2366 m.reset(&testText); 2367 REGEX_ASSERT(m.regionStart() == 0); 2368 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2369 2370 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2371 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2372 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2373 REGEX_ASSERT(&m == &m.reset()); 2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2375 2376 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2377 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2378 REGEX_ASSERT(&m == &m.reset()); 2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2380 2381 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2382 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2383 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2384 REGEX_ASSERT(&m == &m.reset()); 2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2386 2387 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2388 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2389 REGEX_ASSERT(&m == &m.reset()); 2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2391 2392 utext_close(&testText); 2393 utext_close(&testPattern); 2394 } 2395 2396 // 2397 // hitEnd() and requireEnd() 2398 // 2399 { 2400 UErrorCode status = U_ZERO_ERROR; 2401 UText testPattern = UTEXT_INITIALIZER; 2402 UText testText = UTEXT_INITIALIZER; 2403 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2404 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2405 utext_openUTF8(&testPattern, str_, -1, &status); 2406 utext_openUTF8(&testText, str_aabb, -1, &status); 2407 2408 RegexMatcher m1(&testPattern, &testText, 0, status); 2409 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2410 REGEX_ASSERT(m1.hitEnd() == TRUE); 2411 REGEX_ASSERT(m1.requireEnd() == FALSE); 2412 REGEX_CHECK_STATUS; 2413 2414 status = U_ZERO_ERROR; 2415 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2416 utext_openUTF8(&testPattern, str_a, -1, &status); 2417 RegexMatcher m2(&testPattern, &testText, 0, status); 2418 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2419 REGEX_ASSERT(m2.hitEnd() == FALSE); 2420 REGEX_ASSERT(m2.requireEnd() == FALSE); 2421 REGEX_CHECK_STATUS; 2422 2423 status = U_ZERO_ERROR; 2424 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2425 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status); 2426 RegexMatcher m3(&testPattern, &testText, 0, status); 2427 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2428 REGEX_ASSERT(m3.hitEnd() == TRUE); 2429 REGEX_ASSERT(m3.requireEnd() == TRUE); 2430 REGEX_CHECK_STATUS; 2431 2432 utext_close(&testText); 2433 utext_close(&testPattern); 2434 } 2435 } 2436 2437 2438 //--------------------------------------------------------------------------- 2439 // 2440 // API_Replace_UTF8 API test for class RegexMatcher, testing the 2441 // Replace family of functions. 2442 // 2443 //--------------------------------------------------------------------------- 2444 void RegexTest::API_Replace_UTF8() { 2445 // 2446 // Replace 2447 // 2448 int32_t flags=0; 2449 UParseError pe; 2450 UErrorCode status=U_ZERO_ERROR; 2451 2452 UText re=UTEXT_INITIALIZER; 2453 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 2454 REGEX_VERBOSE_TEXT(&re); 2455 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2456 REGEX_CHECK_STATUS; 2457 2458 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2459 // 012345678901234567 2460 UText dataText = UTEXT_INITIALIZER; 2461 utext_openUTF8(&dataText, data, -1, &status); 2462 REGEX_CHECK_STATUS; 2463 REGEX_VERBOSE_TEXT(&dataText); 2464 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText); 2465 2466 // 2467 // Plain vanilla matches. 2468 // 2469 UnicodeString dest; 2470 UText destText = UTEXT_INITIALIZER; 2471 utext_openUnicodeString(&destText, &dest, &status); 2472 UText *result; 2473 2474 UText replText = UTEXT_INITIALIZER; 2475 2476 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ 2477 utext_openUTF8(&replText, str_yz, -1, &status); 2478 REGEX_VERBOSE_TEXT(&replText); 2479 result = matcher->replaceFirst(&replText, NULL, status); 2480 REGEX_CHECK_STATUS; 2481 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ 2482 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2483 utext_close(result); 2484 result = matcher->replaceFirst(&replText, &destText, status); 2485 REGEX_CHECK_STATUS; 2486 REGEX_ASSERT(result == &destText); 2487 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2488 2489 result = matcher->replaceAll(&replText, NULL, status); 2490 REGEX_CHECK_STATUS; 2491 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ 2492 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2493 utext_close(result); 2494 2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2496 result = matcher->replaceAll(&replText, &destText, status); 2497 REGEX_CHECK_STATUS; 2498 REGEX_ASSERT(result == &destText); 2499 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2500 2501 // 2502 // Plain vanilla non-matches. 2503 // 2504 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */ 2505 utext_openUTF8(&dataText, str_abxabxabx, -1, &status); 2506 matcher->reset(&dataText); 2507 2508 result = matcher->replaceFirst(&replText, NULL, status); 2509 REGEX_CHECK_STATUS; 2510 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2511 utext_close(result); 2512 result = matcher->replaceFirst(&replText, &destText, status); 2513 REGEX_CHECK_STATUS; 2514 REGEX_ASSERT(result == &destText); 2515 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2516 2517 result = matcher->replaceAll(&replText, NULL, status); 2518 REGEX_CHECK_STATUS; 2519 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2520 utext_close(result); 2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2522 result = matcher->replaceAll(&replText, &destText, status); 2523 REGEX_CHECK_STATUS; 2524 REGEX_ASSERT(result == &destText); 2525 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2526 2527 // 2528 // Empty source string 2529 // 2530 utext_openUTF8(&dataText, NULL, 0, &status); 2531 matcher->reset(&dataText); 2532 2533 result = matcher->replaceFirst(&replText, NULL, status); 2534 REGEX_CHECK_STATUS; 2535 REGEX_ASSERT_UTEXT_UTF8("", result); 2536 utext_close(result); 2537 result = matcher->replaceFirst(&replText, &destText, status); 2538 REGEX_CHECK_STATUS; 2539 REGEX_ASSERT(result == &destText); 2540 REGEX_ASSERT_UTEXT_UTF8("", result); 2541 2542 result = matcher->replaceAll(&replText, NULL, status); 2543 REGEX_CHECK_STATUS; 2544 REGEX_ASSERT_UTEXT_UTF8("", result); 2545 utext_close(result); 2546 result = matcher->replaceAll(&replText, &destText, status); 2547 REGEX_CHECK_STATUS; 2548 REGEX_ASSERT(result == &destText); 2549 REGEX_ASSERT_UTEXT_UTF8("", result); 2550 2551 // 2552 // Empty substitution string 2553 // 2554 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 2555 matcher->reset(&dataText); 2556 2557 utext_openUTF8(&replText, NULL, 0, &status); 2558 result = matcher->replaceFirst(&replText, NULL, status); 2559 REGEX_CHECK_STATUS; 2560 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ 2561 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2562 utext_close(result); 2563 result = matcher->replaceFirst(&replText, &destText, status); 2564 REGEX_CHECK_STATUS; 2565 REGEX_ASSERT(result == &destText); 2566 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2567 2568 result = matcher->replaceAll(&replText, NULL, status); 2569 REGEX_CHECK_STATUS; 2570 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ 2571 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2572 utext_close(result); 2573 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2574 result = matcher->replaceAll(&replText, &destText, status); 2575 REGEX_CHECK_STATUS; 2576 REGEX_ASSERT(result == &destText); 2577 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2578 2579 // 2580 // match whole string 2581 // 2582 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2583 utext_openUTF8(&dataText, str_abc, -1, &status); 2584 matcher->reset(&dataText); 2585 2586 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ 2587 utext_openUTF8(&replText, str_xyz, -1, &status); 2588 result = matcher->replaceFirst(&replText, NULL, status); 2589 REGEX_CHECK_STATUS; 2590 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2591 utext_close(result); 2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2593 result = matcher->replaceFirst(&replText, &destText, status); 2594 REGEX_CHECK_STATUS; 2595 REGEX_ASSERT(result == &destText); 2596 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2597 2598 result = matcher->replaceAll(&replText, NULL, status); 2599 REGEX_CHECK_STATUS; 2600 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2601 utext_close(result); 2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2603 result = matcher->replaceAll(&replText, &destText, status); 2604 REGEX_CHECK_STATUS; 2605 REGEX_ASSERT(result == &destText); 2606 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2607 2608 // 2609 // Capture Group, simple case 2610 // 2611 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ 2612 utext_openUTF8(&re, str_add, -1, &status); 2613 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); 2614 REGEX_CHECK_STATUS; 2615 2616 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ 2617 utext_openUTF8(&dataText, str_abcdefg, -1, &status); 2618 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); 2619 REGEX_CHECK_STATUS; 2620 2621 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ 2622 utext_openUTF8(&replText, str_11, -1, &status); 2623 result = matcher2->replaceFirst(&replText, NULL, status); 2624 REGEX_CHECK_STATUS; 2625 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ 2626 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2627 utext_close(result); 2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2629 result = matcher2->replaceFirst(&replText, &destText, status); 2630 REGEX_CHECK_STATUS; 2631 REGEX_ASSERT(result == &destText); 2632 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2633 2634 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */ 2635 utext_openUTF8(&replText, str_v, -1, &status); 2636 REGEX_VERBOSE_TEXT(&replText); 2637 result = matcher2->replaceFirst(&replText, NULL, status); 2638 REGEX_CHECK_STATUS; 2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ 2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2641 utext_close(result); 2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2643 result = matcher2->replaceFirst(&replText, &destText, status); 2644 REGEX_CHECK_STATUS; 2645 REGEX_ASSERT(result == &destText); 2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2647 2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */ 2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2652 result = matcher2->replaceFirst(&replText, NULL, status); 2653 REGEX_CHECK_STATUS; 2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2656 utext_close(result); 2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2658 result = matcher2->replaceFirst(&replText, &destText, status); 2659 REGEX_CHECK_STATUS; 2660 REGEX_ASSERT(result == &destText); 2661 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2662 2663 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */ 2664 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2665 // 012345678901234567890123456 2666 supplDigitChars[22] = 0xF0; 2667 supplDigitChars[23] = 0x9D; 2668 supplDigitChars[24] = 0x9F; 2669 supplDigitChars[25] = 0x8F; 2670 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2671 2672 result = matcher2->replaceFirst(&replText, NULL, status); 2673 REGEX_CHECK_STATUS; 2674 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ 2675 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2676 utext_close(result); 2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2678 result = matcher2->replaceFirst(&replText, &destText, status); 2679 REGEX_CHECK_STATUS; 2680 REGEX_ASSERT(result == &destText); 2681 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2682 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */ 2683 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); 2684 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2685 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2686 utext_close(result); 2687 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2688 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2689 REGEX_ASSERT(result == &destText); 2690 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2691 2692 // 2693 // Replacement String with \u hex escapes 2694 // 2695 { 2696 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ 2697 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ 2698 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); 2699 utext_openUTF8(&replText, str_u0043, -1, &status); 2700 matcher->reset(&dataText); 2701 2702 result = matcher->replaceAll(&replText, NULL, status); 2703 REGEX_CHECK_STATUS; 2704 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ 2705 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2706 utext_close(result); 2707 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2708 result = matcher->replaceAll(&replText, &destText, status); 2709 REGEX_CHECK_STATUS; 2710 REGEX_ASSERT(result == &destText); 2711 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2712 } 2713 { 2714 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */ 2715 utext_openUTF8(&dataText, str_abc, -1, &status); 2716 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */ 2717 utext_openUTF8(&replText, str_U00010000, -1, &status); 2718 matcher->reset(&dataText); 2719 2720 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" 2721 // 0123456789 2722 expected[2] = 0xF0; 2723 expected[3] = 0x90; 2724 expected[4] = 0x80; 2725 expected[5] = 0x80; 2726 2727 result = matcher->replaceAll(&replText, NULL, status); 2728 REGEX_CHECK_STATUS; 2729 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2730 utext_close(result); 2731 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2732 result = matcher->replaceAll(&replText, &destText, status); 2733 REGEX_CHECK_STATUS; 2734 REGEX_ASSERT(result == &destText); 2735 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2736 } 2737 // TODO: need more through testing of capture substitutions. 2738 2739 // Bug 4057 2740 // 2741 { 2742 status = U_ZERO_ERROR; 2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */ 2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */ 2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ 2746 utext_openUTF8(&re, str_ssee, -1, &status); 2747 utext_openUTF8(&dataText, str_blah, -1, &status); 2748 utext_openUTF8(&replText, str_ooh, -1, &status); 2749 2750 RegexMatcher m(&re, 0, status); 2751 REGEX_CHECK_STATUS; 2752 2753 UnicodeString result; 2754 UText resultText = UTEXT_INITIALIZER; 2755 utext_openUnicodeString(&resultText, &result, &status); 2756 2757 // Multiple finds do NOT bump up the previous appendReplacement postion. 2758 m.reset(&dataText); 2759 m.find(); 2760 m.find(); 2761 m.appendReplacement(&resultText, &replText, status); 2762 REGEX_CHECK_STATUS; 2763 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2764 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText); 2765 2766 // After a reset into the interior of a string, appendReplacement still starts at beginning. 2767 status = U_ZERO_ERROR; 2768 result.truncate(0); 2769 utext_openUnicodeString(&resultText, &result, &status); 2770 m.reset(10, status); 2771 m.find(); 2772 m.find(); 2773 m.appendReplacement(&resultText, &replText, status); 2774 REGEX_CHECK_STATUS; 2775 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2776 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText); 2777 2778 // find() at interior of string, appendReplacement still starts at beginning. 2779 status = U_ZERO_ERROR; 2780 result.truncate(0); 2781 utext_openUnicodeString(&resultText, &result, &status); 2782 m.reset(); 2783 m.find(10, status); 2784 m.find(); 2785 m.appendReplacement(&resultText, &replText, status); 2786 REGEX_CHECK_STATUS; 2787 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2788 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText); 2789 2790 m.appendTail(&resultText, status); 2791 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ 2792 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); 2793 2794 utext_close(&resultText); 2795 } 2796 2797 delete matcher2; 2798 delete pat2; 2799 delete matcher; 2800 delete pat; 2801 2802 utext_close(&dataText); 2803 utext_close(&replText); 2804 utext_close(&destText); 2805 utext_close(&re); 2806 } 2807 2808 2809 //--------------------------------------------------------------------------- 2810 // 2811 // API_Pattern_UTF8 Test that the API for class RegexPattern is 2812 // present and nominally working. 2813 // 2814 //--------------------------------------------------------------------------- 2815 void RegexTest::API_Pattern_UTF8() { 2816 RegexPattern pata; // Test default constructor to not crash. 2817 RegexPattern patb; 2818 2819 REGEX_ASSERT(pata == patb); 2820 REGEX_ASSERT(pata == pata); 2821 2822 UText re1 = UTEXT_INITIALIZER; 2823 UText re2 = UTEXT_INITIALIZER; 2824 UErrorCode status = U_ZERO_ERROR; 2825 UParseError pe; 2826 2827 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ 2828 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ 2829 utext_openUTF8(&re1, str_abcalmz, -1, &status); 2830 utext_openUTF8(&re2, str_def, -1, &status); 2831 2832 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2833 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2834 REGEX_CHECK_STATUS; 2835 REGEX_ASSERT(*pat1 == *pat1); 2836 REGEX_ASSERT(*pat1 != pata); 2837 2838 // Assign 2839 patb = *pat1; 2840 REGEX_ASSERT(patb == *pat1); 2841 2842 // Copy Construct 2843 RegexPattern patc(*pat1); 2844 REGEX_ASSERT(patc == *pat1); 2845 REGEX_ASSERT(patb == patc); 2846 REGEX_ASSERT(pat1 != pat2); 2847 patb = *pat2; 2848 REGEX_ASSERT(patb != patc); 2849 REGEX_ASSERT(patb == *pat2); 2850 2851 // Compile with no flags. 2852 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2853 REGEX_ASSERT(*pat1a == *pat1); 2854 2855 REGEX_ASSERT(pat1a->flags() == 0); 2856 2857 // Compile with different flags should be not equal 2858 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2859 REGEX_CHECK_STATUS; 2860 2861 REGEX_ASSERT(*pat1b != *pat1a); 2862 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2863 REGEX_ASSERT(pat1a->flags() == 0); 2864 delete pat1b; 2865 2866 // clone 2867 RegexPattern *pat1c = pat1->clone(); 2868 REGEX_ASSERT(*pat1c == *pat1); 2869 REGEX_ASSERT(*pat1c != *pat2); 2870 2871 delete pat1c; 2872 delete pat1a; 2873 delete pat1; 2874 delete pat2; 2875 2876 utext_close(&re1); 2877 utext_close(&re2); 2878 2879 2880 // 2881 // Verify that a matcher created from a cloned pattern works. 2882 // (Jitterbug 3423) 2883 // 2884 { 2885 UErrorCode status = U_ZERO_ERROR; 2886 UText pattern = UTEXT_INITIALIZER; 2887 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2888 utext_openUTF8(&pattern, str_pL, -1, &status); 2889 2890 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2891 RegexPattern *pClone = pSource->clone(); 2892 delete pSource; 2893 RegexMatcher *mFromClone = pClone->matcher(status); 2894 REGEX_CHECK_STATUS; 2895 2896 UText input = UTEXT_INITIALIZER; 2897 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2898 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2899 mFromClone->reset(&input); 2900 REGEX_ASSERT(mFromClone->find() == TRUE); 2901 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2902 REGEX_ASSERT(mFromClone->find() == TRUE); 2903 REGEX_ASSERT(mFromClone->group(status) == "World"); 2904 REGEX_ASSERT(mFromClone->find() == FALSE); 2905 delete mFromClone; 2906 delete pClone; 2907 2908 utext_close(&input); 2909 utext_close(&pattern); 2910 } 2911 2912 // 2913 // matches convenience API 2914 // 2915 { 2916 UErrorCode status = U_ZERO_ERROR; 2917 UText pattern = UTEXT_INITIALIZER; 2918 UText input = UTEXT_INITIALIZER; 2919 2920 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2921 utext_openUTF8(&input, str_randominput, -1, &status); 2922 2923 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2924 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2925 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2926 REGEX_CHECK_STATUS; 2927 2928 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2929 utext_openUTF8(&pattern, str_abc, -1, &status); 2930 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2931 REGEX_CHECK_STATUS; 2932 2933 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2934 utext_openUTF8(&pattern, str_nput, -1, &status); 2935 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2936 REGEX_CHECK_STATUS; 2937 2938 utext_openUTF8(&pattern, str_randominput, -1, &status); 2939 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2940 REGEX_CHECK_STATUS; 2941 2942 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2943 utext_openUTF8(&pattern, str_u, -1, &status); 2944 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2945 REGEX_CHECK_STATUS; 2946 2947 utext_openUTF8(&input, str_abc, -1, &status); 2948 utext_openUTF8(&pattern, str_abc, -1, &status); 2949 status = U_INDEX_OUTOFBOUNDS_ERROR; 2950 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2951 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2952 2953 utext_close(&input); 2954 utext_close(&pattern); 2955 } 2956 2957 2958 // 2959 // Split() 2960 // 2961 status = U_ZERO_ERROR; 2962 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2963 utext_openUTF8(&re1, str_spaceplus, -1, &status); 2964 pat1 = RegexPattern::compile(&re1, pe, status); 2965 REGEX_CHECK_STATUS; 2966 UnicodeString fields[10]; 2967 2968 int32_t n; 2969 n = pat1->split("Now is the time", fields, 10, status); 2970 REGEX_CHECK_STATUS; 2971 REGEX_ASSERT(n==4); 2972 REGEX_ASSERT(fields[0]=="Now"); 2973 REGEX_ASSERT(fields[1]=="is"); 2974 REGEX_ASSERT(fields[2]=="the"); 2975 REGEX_ASSERT(fields[3]=="time"); 2976 REGEX_ASSERT(fields[4]==""); 2977 2978 n = pat1->split("Now is the time", fields, 2, status); 2979 REGEX_CHECK_STATUS; 2980 REGEX_ASSERT(n==2); 2981 REGEX_ASSERT(fields[0]=="Now"); 2982 REGEX_ASSERT(fields[1]=="is the time"); 2983 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2984 2985 fields[1] = "*"; 2986 status = U_ZERO_ERROR; 2987 n = pat1->split("Now is the time", fields, 1, status); 2988 REGEX_CHECK_STATUS; 2989 REGEX_ASSERT(n==1); 2990 REGEX_ASSERT(fields[0]=="Now is the time"); 2991 REGEX_ASSERT(fields[1]=="*"); 2992 status = U_ZERO_ERROR; 2993 2994 n = pat1->split(" Now is the time ", fields, 10, status); 2995 REGEX_CHECK_STATUS; 2996 REGEX_ASSERT(n==6); 2997 REGEX_ASSERT(fields[0]==""); 2998 REGEX_ASSERT(fields[1]=="Now"); 2999 REGEX_ASSERT(fields[2]=="is"); 3000 REGEX_ASSERT(fields[3]=="the"); 3001 REGEX_ASSERT(fields[4]=="time"); 3002 REGEX_ASSERT(fields[5]==""); 3003 REGEX_ASSERT(fields[6]==""); 3004 3005 fields[2] = "*"; 3006 n = pat1->split(" ", fields, 10, status); 3007 REGEX_CHECK_STATUS; 3008 REGEX_ASSERT(n==2); 3009 REGEX_ASSERT(fields[0]==""); 3010 REGEX_ASSERT(fields[1]==""); 3011 REGEX_ASSERT(fields[2]=="*"); 3012 3013 fields[0] = "foo"; 3014 n = pat1->split("", fields, 10, status); 3015 REGEX_CHECK_STATUS; 3016 REGEX_ASSERT(n==0); 3017 REGEX_ASSERT(fields[0]=="foo"); 3018 3019 delete pat1; 3020 3021 // split, with a pattern with (capture) 3022 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 3023 pat1 = RegexPattern::compile(&re1, pe, status); 3024 REGEX_CHECK_STATUS; 3025 3026 status = U_ZERO_ERROR; 3027 fields[6] = fields[7] = "*"; 3028 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 3029 REGEX_CHECK_STATUS; 3030 REGEX_ASSERT(n==7); 3031 REGEX_ASSERT(fields[0]==""); 3032 REGEX_ASSERT(fields[1]=="a"); 3033 REGEX_ASSERT(fields[2]=="Now is "); 3034 REGEX_ASSERT(fields[3]=="b"); 3035 REGEX_ASSERT(fields[4]=="the time"); 3036 REGEX_ASSERT(fields[5]=="c"); 3037 REGEX_ASSERT(fields[6]==""); 3038 REGEX_ASSERT(fields[7]=="*"); 3039 REGEX_ASSERT(status==U_ZERO_ERROR); 3040 3041 fields[6] = fields[7] = "*"; 3042 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 3043 REGEX_CHECK_STATUS; 3044 REGEX_ASSERT(n==7); 3045 REGEX_ASSERT(fields[0]==" "); 3046 REGEX_ASSERT(fields[1]=="a"); 3047 REGEX_ASSERT(fields[2]=="Now is "); 3048 REGEX_ASSERT(fields[3]=="b"); 3049 REGEX_ASSERT(fields[4]=="the time"); 3050 REGEX_ASSERT(fields[5]=="c"); 3051 REGEX_ASSERT(fields[6]==""); 3052 REGEX_ASSERT(fields[7]=="*"); 3053 3054 status = U_ZERO_ERROR; 3055 fields[6] = "foo"; 3056 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 3057 REGEX_CHECK_STATUS; 3058 REGEX_ASSERT(n==6); 3059 REGEX_ASSERT(fields[0]==" "); 3060 REGEX_ASSERT(fields[1]=="a"); 3061 REGEX_ASSERT(fields[2]=="Now is "); 3062 REGEX_ASSERT(fields[3]=="b"); 3063 REGEX_ASSERT(fields[4]=="the time"); 3064 REGEX_ASSERT(fields[5]==" "); 3065 REGEX_ASSERT(fields[6]=="foo"); 3066 3067 status = U_ZERO_ERROR; 3068 fields[5] = "foo"; 3069 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3070 REGEX_CHECK_STATUS; 3071 REGEX_ASSERT(n==5); 3072 REGEX_ASSERT(fields[0]==" "); 3073 REGEX_ASSERT(fields[1]=="a"); 3074 REGEX_ASSERT(fields[2]=="Now is "); 3075 REGEX_ASSERT(fields[3]=="b"); 3076 REGEX_ASSERT(fields[4]=="the time<c>"); 3077 REGEX_ASSERT(fields[5]=="foo"); 3078 3079 status = U_ZERO_ERROR; 3080 fields[5] = "foo"; 3081 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3082 REGEX_CHECK_STATUS; 3083 REGEX_ASSERT(n==5); 3084 REGEX_ASSERT(fields[0]==" "); 3085 REGEX_ASSERT(fields[1]=="a"); 3086 REGEX_ASSERT(fields[2]=="Now is "); 3087 REGEX_ASSERT(fields[3]=="b"); 3088 REGEX_ASSERT(fields[4]=="the time"); 3089 REGEX_ASSERT(fields[5]=="foo"); 3090 3091 status = U_ZERO_ERROR; 3092 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3093 REGEX_CHECK_STATUS; 3094 REGEX_ASSERT(n==4); 3095 REGEX_ASSERT(fields[0]==" "); 3096 REGEX_ASSERT(fields[1]=="a"); 3097 REGEX_ASSERT(fields[2]=="Now is "); 3098 REGEX_ASSERT(fields[3]=="the time<c>"); 3099 status = U_ZERO_ERROR; 3100 delete pat1; 3101 3102 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3103 pat1 = RegexPattern::compile(&re1, pe, status); 3104 REGEX_CHECK_STATUS; 3105 n = pat1->split("1-10,20", fields, 10, status); 3106 REGEX_CHECK_STATUS; 3107 REGEX_ASSERT(n==5); 3108 REGEX_ASSERT(fields[0]=="1"); 3109 REGEX_ASSERT(fields[1]=="-"); 3110 REGEX_ASSERT(fields[2]=="10"); 3111 REGEX_ASSERT(fields[3]==","); 3112 REGEX_ASSERT(fields[4]=="20"); 3113 delete pat1; 3114 3115 3116 // 3117 // split of a UText based string, with library allocating output UTexts. 3118 // 3119 { 3120 status = U_ZERO_ERROR; 3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status); 3122 UnicodeString stringToSplit("first:second:third"); 3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); 3124 REGEX_CHECK_STATUS; 3125 3126 UText *splits[10] = {NULL}; 3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); 3128 REGEX_CHECK_STATUS; 3129 REGEX_ASSERT(numFields == 5); 3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); 3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); 3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); 3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); 3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); 3135 REGEX_ASSERT(splits[5] == NULL); 3136 3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) { 3138 if (splits[i]) { 3139 utext_close(splits[i]); 3140 splits[i] = NULL; 3141 } 3142 } 3143 utext_close(textToSplit); 3144 } 3145 3146 3147 // 3148 // RegexPattern::pattern() and patternText() 3149 // 3150 pat1 = new RegexPattern(); 3151 REGEX_ASSERT(pat1->pattern() == ""); 3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3153 delete pat1; 3154 const char *helloWorldInvariant = "(Hello, world)*"; 3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3156 pat1 = RegexPattern::compile(&re1, pe, status); 3157 REGEX_CHECK_STATUS; 3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); 3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3160 delete pat1; 3161 3162 utext_close(&re1); 3163 } 3164 3165 3166 //--------------------------------------------------------------------------- 3167 // 3168 // Extended A more thorough check for features of regex patterns 3169 // The test cases are in a separate data file, 3170 // source/tests/testdata/regextst.txt 3171 // A description of the test data format is included in that file. 3172 // 3173 //--------------------------------------------------------------------------- 3174 3175 const char * 3176 RegexTest::getPath(char buffer[2048], const char *filename) { 3177 UErrorCode status=U_ZERO_ERROR; 3178 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3179 if (U_FAILURE(status)) { 3180 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3181 return NULL; 3182 } 3183 3184 strcpy(buffer, testDataDirectory); 3185 strcat(buffer, filename); 3186 return buffer; 3187 } 3188 3189 void RegexTest::Extended() { 3190 char tdd[2048]; 3191 const char *srcPath; 3192 UErrorCode status = U_ZERO_ERROR; 3193 int32_t lineNum = 0; 3194 3195 // 3196 // Open and read the test data file. 3197 // 3198 srcPath=getPath(tdd, "regextst.txt"); 3199 if(srcPath==NULL) { 3200 return; /* something went wrong, error already output */ 3201 } 3202 3203 int32_t len; 3204 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3205 if (U_FAILURE(status)) { 3206 return; /* something went wrong, error already output */ 3207 } 3208 3209 // 3210 // Put the test data into a UnicodeString 3211 // 3212 UnicodeString testString(FALSE, testData, len); 3213 3214 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3215 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3216 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3217 3218 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3219 UnicodeString testPattern; // The pattern for test from the test file. 3220 UnicodeString testFlags; // the flags for a test. 3221 UnicodeString matchString; // The marked up string to be used as input 3222 3223 if (U_FAILURE(status)){ 3224 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); 3225 delete [] testData; 3226 return; 3227 } 3228 3229 // 3230 // Loop over the test data file, once per line. 3231 // 3232 while (lineMat.find()) { 3233 lineNum++; 3234 if (U_FAILURE(status)) { 3235 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3236 } 3237 3238 status = U_ZERO_ERROR; 3239 UnicodeString testLine = lineMat.group(1, status); 3240 if (testLine.length() == 0) { 3241 continue; 3242 } 3243 3244 // 3245 // Parse the test line. Skip blank and comment only lines. 3246 // Separate out the three main fields - pattern, flags, target. 3247 // 3248 3249 commentMat.reset(testLine); 3250 if (commentMat.lookingAt(status)) { 3251 // This line is a comment, or blank. 3252 continue; 3253 } 3254 3255 // 3256 // Pull out the pattern field, remove it from the test file line. 3257 // 3258 quotedStuffMat.reset(testLine); 3259 if (quotedStuffMat.lookingAt(status)) { 3260 testPattern = quotedStuffMat.group(2, status); 3261 testLine.remove(0, quotedStuffMat.end(0, status)); 3262 } else { 3263 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3264 continue; 3265 } 3266 3267 3268 // 3269 // Pull out the flags from the test file line. 3270 // 3271 flagsMat.reset(testLine); 3272 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3273 testFlags = flagsMat.group(1, status); 3274 if (flagsMat.group(2, status).length() > 0) { 3275 errln("Bad Match flag at line %d. Scanning %c\n", 3276 lineNum, flagsMat.group(2, status).charAt(0)); 3277 continue; 3278 } 3279 testLine.remove(0, flagsMat.end(0, status)); 3280 3281 // 3282 // Pull out the match string, as a whole. 3283 // We'll process the <tags> later. 3284 // 3285 quotedStuffMat.reset(testLine); 3286 if (quotedStuffMat.lookingAt(status)) { 3287 matchString = quotedStuffMat.group(2, status); 3288 testLine.remove(0, quotedStuffMat.end(0, status)); 3289 } else { 3290 errln("Bad match string at test file line %d", lineNum); 3291 continue; 3292 } 3293 3294 // 3295 // The only thing left from the input line should be an optional trailing comment. 3296 // 3297 commentMat.reset(testLine); 3298 if (commentMat.lookingAt(status) == FALSE) { 3299 errln("Line %d: unexpected characters at end of test line.", lineNum); 3300 continue; 3301 } 3302 3303 // 3304 // Run the test 3305 // 3306 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3307 } 3308 3309 delete [] testData; 3310 3311 } 3312 3313 3314 3315 //--------------------------------------------------------------------------- 3316 // 3317 // regex_find(pattern, flags, inputString, lineNumber) 3318 // 3319 // Function to run a single test from the Extended (data driven) tests. 3320 // See file test/testdata/regextst.txt for a description of the 3321 // pattern and inputString fields, and the allowed flags. 3322 // lineNumber is the source line in regextst.txt of the test. 3323 // 3324 //--------------------------------------------------------------------------- 3325 3326 3327 // Set a value into a UVector at position specified by a decimal number in 3328 // a UnicodeString. This is a utility function needed by the actual test function, 3329 // which follows. 3330 static void set(UVector &vec, int32_t val, UnicodeString index) { 3331 UErrorCode status=U_ZERO_ERROR; 3332 int32_t idx = 0; 3333 for (int32_t i=0; i<index.length(); i++) { 3334 int32_t d=u_charDigitValue(index.charAt(i)); 3335 if (d<0) {return;} 3336 idx = idx*10 + d; 3337 } 3338 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3339 vec.setElementAt(val, idx); 3340 } 3341 3342 static void setInt(UVector &vec, int32_t val, int32_t idx) { 3343 UErrorCode status=U_ZERO_ERROR; 3344 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3345 vec.setElementAt(val, idx); 3346 } 3347 3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3349 { 3350 UBool couldFind = TRUE; 3351 UTEXT_SETNATIVEINDEX(utext, 0); 3352 int32_t i = 0; 3353 while (i < unistrOffset) { 3354 UChar32 c = UTEXT_NEXT32(utext); 3355 if (c != U_SENTINEL) { 3356 i += U16_LENGTH(c); 3357 } else { 3358 couldFind = FALSE; 3359 break; 3360 } 3361 } 3362 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3363 return couldFind; 3364 } 3365 3366 3367 void RegexTest::regex_find(const UnicodeString &pattern, 3368 const UnicodeString &flags, 3369 const UnicodeString &inputString, 3370 const char *srcPath, 3371 int32_t line) { 3372 UnicodeString unEscapedInput; 3373 UnicodeString deTaggedInput; 3374 3375 int32_t patternUTF8Length, inputUTF8Length; 3376 char *patternChars = NULL, *inputChars = NULL; 3377 UText patternText = UTEXT_INITIALIZER; 3378 UText inputText = UTEXT_INITIALIZER; 3379 UConverter *UTF8Converter = NULL; 3380 3381 UErrorCode status = U_ZERO_ERROR; 3382 UParseError pe; 3383 RegexPattern *parsePat = NULL; 3384 RegexMatcher *parseMatcher = NULL; 3385 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3386 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3387 UVector groupStarts(status); 3388 UVector groupEnds(status); 3389 UVector groupStartsUTF8(status); 3390 UVector groupEndsUTF8(status); 3391 UBool isMatch = FALSE, isUTF8Match = FALSE; 3392 UBool failed = FALSE; 3393 int32_t numFinds; 3394 int32_t i; 3395 UBool useMatchesFunc = FALSE; 3396 UBool useLookingAtFunc = FALSE; 3397 int32_t regionStart = -1; 3398 int32_t regionEnd = -1; 3399 int32_t regionStartUTF8 = -1; 3400 int32_t regionEndUTF8 = -1; 3401 3402 3403 // 3404 // Compile the caller's pattern 3405 // 3406 uint32_t bflags = 0; 3407 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3408 bflags |= UREGEX_CASE_INSENSITIVE; 3409 } 3410 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3411 bflags |= UREGEX_COMMENTS; 3412 } 3413 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3414 bflags |= UREGEX_DOTALL; 3415 } 3416 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3417 bflags |= UREGEX_MULTILINE; 3418 } 3419 3420 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3421 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3422 } 3423 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3424 bflags |= UREGEX_UNIX_LINES; 3425 } 3426 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag 3427 bflags |= UREGEX_LITERAL; 3428 } 3429 3430 3431 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3432 if (status != U_ZERO_ERROR) { 3433 #if UCONFIG_NO_BREAK_ITERATION==1 3434 // 'v' test flag means that the test pattern should not compile if ICU was configured 3435 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3436 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3437 goto cleanupAndReturn; 3438 } 3439 #endif 3440 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3441 // Expected pattern compilation error. 3442 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3443 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3444 } 3445 goto cleanupAndReturn; 3446 } else { 3447 // Unexpected pattern compilation error. 3448 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3449 goto cleanupAndReturn; 3450 } 3451 } 3452 3453 UTF8Converter = ucnv_open("UTF8", &status); 3454 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3455 3456 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3457 status = U_ZERO_ERROR; // buffer overflow 3458 patternChars = new char[patternUTF8Length+1]; 3459 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3460 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3461 3462 if (status == U_ZERO_ERROR) { 3463 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3464 3465 if (status != U_ZERO_ERROR) { 3466 #if UCONFIG_NO_BREAK_ITERATION==1 3467 // 'v' test flag means that the test pattern should not compile if ICU was configured 3468 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3469 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3470 goto cleanupAndReturn; 3471 } 3472 #endif 3473 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3474 // Expected pattern compilation error. 3475 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3476 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3477 } 3478 goto cleanupAndReturn; 3479 } else { 3480 // Unexpected pattern compilation error. 3481 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status)); 3482 goto cleanupAndReturn; 3483 } 3484 } 3485 } 3486 3487 if (UTF8Pattern == NULL) { 3488 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3489 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3490 status = U_ZERO_ERROR; 3491 } 3492 3493 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3494 callerPattern->dumpPattern(); 3495 } 3496 3497 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3498 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3499 goto cleanupAndReturn; 3500 } 3501 3502 3503 // 3504 // Number of times find() should be called on the test string, default to 1 3505 // 3506 numFinds = 1; 3507 for (i=2; i<=9; i++) { 3508 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3509 if (numFinds != 1) { 3510 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3511 goto cleanupAndReturn; 3512 } 3513 numFinds = i; 3514 } 3515 } 3516 3517 // 'M' flag. Use matches() instead of find() 3518 if (flags.indexOf((UChar)0x4d) >= 0) { 3519 useMatchesFunc = TRUE; 3520 } 3521 if (flags.indexOf((UChar)0x4c) >= 0) { 3522 useLookingAtFunc = TRUE; 3523 } 3524 3525 // 3526 // Find the tags in the input data, remove them, and record the group boundary 3527 // positions. 3528 // 3529 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3530 REGEX_CHECK_STATUS_L(line); 3531 3532 unEscapedInput = inputString.unescape(); 3533 parseMatcher = parsePat->matcher(unEscapedInput, status); 3534 REGEX_CHECK_STATUS_L(line); 3535 while(parseMatcher->find()) { 3536 parseMatcher->appendReplacement(deTaggedInput, "", status); 3537 REGEX_CHECK_STATUS; 3538 UnicodeString groupNum = parseMatcher->group(2, status); 3539 if (groupNum == "r") { 3540 // <r> or </r>, a region specification within the string 3541 if (parseMatcher->group(1, status) == "/") { 3542 regionEnd = deTaggedInput.length(); 3543 } else { 3544 regionStart = deTaggedInput.length(); 3545 } 3546 } else { 3547 // <digits> or </digits>, a group match boundary tag. 3548 if (parseMatcher->group(1, status) == "/") { 3549 set(groupEnds, deTaggedInput.length(), groupNum); 3550 } else { 3551 set(groupStarts, deTaggedInput.length(), groupNum); 3552 } 3553 } 3554 } 3555 parseMatcher->appendTail(deTaggedInput); 3556 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3557 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3558 errln("mismatched <r> tags"); 3559 failed = TRUE; 3560 goto cleanupAndReturn; 3561 } 3562 3563 // 3564 // Configure the matcher according to the flags specified with this test. 3565 // 3566 matcher = callerPattern->matcher(deTaggedInput, status); 3567 REGEX_CHECK_STATUS_L(line); 3568 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3569 matcher->setTrace(TRUE); 3570 } 3571 3572 if (UTF8Pattern != NULL) { 3573 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3574 status = U_ZERO_ERROR; // buffer overflow 3575 inputChars = new char[inputUTF8Length+1]; 3576 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3577 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3578 3579 if (status == U_ZERO_ERROR) { 3580 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3581 REGEX_CHECK_STATUS_L(line); 3582 } 3583 3584 if (UTF8Matcher == NULL) { 3585 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3586 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3587 status = U_ZERO_ERROR; 3588 } 3589 } 3590 3591 // 3592 // Generate native indices for UTF8 versions of region and capture group info 3593 // 3594 if (UTF8Matcher != NULL) { 3595 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3596 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3597 3598 // Fill out the native index UVector info. 3599 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3600 for (i=0; i<groupStarts.size(); i++) { 3601 int32_t start = groupStarts.elementAti(i); 3602 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3603 if (start >= 0) { 3604 int32_t startUTF8; 3605 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3606 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start); 3607 failed = TRUE; 3608 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3609 } 3610 setInt(groupStartsUTF8, startUTF8, i); 3611 } 3612 3613 int32_t end = groupEnds.elementAti(i); 3614 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3615 if (end >= 0) { 3616 int32_t endUTF8; 3617 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3618 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3619 failed = TRUE; 3620 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3621 } 3622 setInt(groupEndsUTF8, endUTF8, i); 3623 } 3624 } 3625 } 3626 3627 if (regionStart>=0) { 3628 matcher->region(regionStart, regionEnd, status); 3629 REGEX_CHECK_STATUS_L(line); 3630 if (UTF8Matcher != NULL) { 3631 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3632 REGEX_CHECK_STATUS_L(line); 3633 } 3634 } 3635 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3636 matcher->useAnchoringBounds(FALSE); 3637 if (UTF8Matcher != NULL) { 3638 UTF8Matcher->useAnchoringBounds(FALSE); 3639 } 3640 } 3641 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3642 matcher->useTransparentBounds(TRUE); 3643 if (UTF8Matcher != NULL) { 3644 UTF8Matcher->useTransparentBounds(TRUE); 3645 } 3646 } 3647 3648 3649 3650 // 3651 // Do a find on the de-tagged input using the caller's pattern 3652 // TODO: error on count>1 and not find(). 3653 // error on both matches() and lookingAt(). 3654 // 3655 for (i=0; i<numFinds; i++) { 3656 if (useMatchesFunc) { 3657 isMatch = matcher->matches(status); 3658 if (UTF8Matcher != NULL) { 3659 isUTF8Match = UTF8Matcher->matches(status); 3660 } 3661 } else if (useLookingAtFunc) { 3662 isMatch = matcher->lookingAt(status); 3663 if (UTF8Matcher != NULL) { 3664 isUTF8Match = UTF8Matcher->lookingAt(status); 3665 } 3666 } else { 3667 isMatch = matcher->find(); 3668 if (UTF8Matcher != NULL) { 3669 isUTF8Match = UTF8Matcher->find(); 3670 } 3671 } 3672 } 3673 matcher->setTrace(FALSE); 3674 if (U_FAILURE(status)) { 3675 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); 3676 } 3677 3678 // 3679 // Match up the groups from the find() with the groups from the tags 3680 // 3681 3682 // number of tags should match number of groups from find operation. 3683 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3684 // G option in test means that capture group data is not available in the 3685 // expected results, so the check needs to be suppressed. 3686 if (isMatch == FALSE && groupStarts.size() != 0) { 3687 dataerrln("Error at line %d: Match expected, but none found.", line); 3688 failed = TRUE; 3689 goto cleanupAndReturn; 3690 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3691 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3692 failed = TRUE; 3693 goto cleanupAndReturn; 3694 } 3695 3696 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3697 // Only check for match / no match. Don't check capture groups. 3698 if (isMatch && groupStarts.size() == 0) { 3699 errln("Error at line %d: No match expected, but one found.", line); 3700 failed = TRUE; 3701 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3702 errln("Error at line %d: No match expected, but one found. (UTF8)", line); 3703 failed = TRUE; 3704 } 3705 goto cleanupAndReturn; 3706 } 3707 3708 REGEX_CHECK_STATUS_L(line); 3709 for (i=0; i<=matcher->groupCount(); i++) { 3710 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 3711 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3712 if (matcher->start(i, status) != expectedStart) { 3713 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3714 line, i, expectedStart, matcher->start(i, status)); 3715 failed = TRUE; 3716 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3717 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3718 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3719 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3720 failed = TRUE; 3721 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3722 } 3723 3724 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3725 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3726 if (matcher->end(i, status) != expectedEnd) { 3727 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3728 line, i, expectedEnd, matcher->end(i, status)); 3729 failed = TRUE; 3730 // Error on end position; keep going; real error is probably yet to come as group 3731 // end positions work from end of the input data towards the front. 3732 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3733 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3734 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3735 failed = TRUE; 3736 // Error on end position; keep going; real error is probably yet to come as group 3737 // end positions work from end of the input data towards the front. 3738 } 3739 } 3740 if ( matcher->groupCount()+1 < groupStarts.size()) { 3741 errln("Error at line %d: Expected %d capture groups, found %d.", 3742 line, groupStarts.size()-1, matcher->groupCount()); 3743 failed = TRUE; 3744 } 3745 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3746 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3747 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3748 failed = TRUE; 3749 } 3750 3751 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3752 matcher->requireEnd() == TRUE) { 3753 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3754 failed = TRUE; 3755 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3756 UTF8Matcher->requireEnd() == TRUE) { 3757 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3758 failed = TRUE; 3759 } 3760 3761 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3762 matcher->requireEnd() == FALSE) { 3763 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3764 failed = TRUE; 3765 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3766 UTF8Matcher->requireEnd() == FALSE) { 3767 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3768 failed = TRUE; 3769 } 3770 3771 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3772 matcher->hitEnd() == TRUE) { 3773 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3774 failed = TRUE; 3775 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3776 UTF8Matcher->hitEnd() == TRUE) { 3777 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); 3778 failed = TRUE; 3779 } 3780 3781 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3782 matcher->hitEnd() == FALSE) { 3783 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3784 failed = TRUE; 3785 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3786 UTF8Matcher->hitEnd() == FALSE) { 3787 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3788 failed = TRUE; 3789 } 3790 3791 3792 cleanupAndReturn: 3793 if (failed) { 3794 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3795 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3796 // callerPattern->dump(); 3797 } 3798 delete parseMatcher; 3799 delete parsePat; 3800 delete UTF8Matcher; 3801 delete UTF8Pattern; 3802 delete matcher; 3803 delete callerPattern; 3804 3805 utext_close(&inputText); 3806 delete[] inputChars; 3807 utext_close(&patternText); 3808 delete[] patternChars; 3809 ucnv_close(UTF8Converter); 3810 } 3811 3812 3813 3814 3815 //--------------------------------------------------------------------------- 3816 // 3817 // Errors Check for error handling in patterns. 3818 // 3819 //--------------------------------------------------------------------------- 3820 void RegexTest::Errors() { 3821 // \escape sequences that aren't implemented yet. 3822 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3823 3824 // Missing close parentheses 3825 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3826 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3827 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3828 3829 // Extra close paren 3830 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3831 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3832 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3833 3834 // Look-ahead, Look-behind 3835 // TODO: add tests for unbounded length look-behinds. 3836 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3837 3838 // Attempt to use non-default flags 3839 { 3840 UParseError pe; 3841 UErrorCode status = U_ZERO_ERROR; 3842 int32_t flags = UREGEX_CANON_EQ | 3843 UREGEX_COMMENTS | UREGEX_DOTALL | 3844 UREGEX_MULTILINE; 3845 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3846 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3847 delete pat1; 3848 } 3849 3850 3851 // Quantifiers are allowed only after something that can be quantified. 3852 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3853 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3854 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3855 3856 // Mal-formed {min,max} quantifiers 3857 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3858 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3859 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3860 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3861 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3862 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3863 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3864 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3865 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3866 3867 // Ticket 5389 3868 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3869 3870 // Invalid Back Reference \0 3871 // For ICU 3.8 and earlier 3872 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3873 // 3874 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3875 3876 } 3877 3878 3879 //------------------------------------------------------------------------------- 3880 // 3881 // Read a text data file, convert it to UChars, and return the data 3882 // in one big UChar * buffer, which the caller must delete. 3883 // 3884 //-------------------------------------------------------------------------------- 3885 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3886 const char *defEncoding, UErrorCode &status) { 3887 UChar *retPtr = NULL; 3888 char *fileBuf = NULL; 3889 UConverter* conv = NULL; 3890 FILE *f = NULL; 3891 3892 ulen = 0; 3893 if (U_FAILURE(status)) { 3894 return retPtr; 3895 } 3896 3897 // 3898 // Open the file. 3899 // 3900 f = fopen(fileName, "rb"); 3901 if (f == 0) { 3902 dataerrln("Error opening test data file %s\n", fileName); 3903 status = U_FILE_ACCESS_ERROR; 3904 return NULL; 3905 } 3906 // 3907 // Read it in 3908 // 3909 int32_t fileSize; 3910 int32_t amt_read; 3911 3912 fseek( f, 0, SEEK_END); 3913 fileSize = ftell(f); 3914 fileBuf = new char[fileSize]; 3915 fseek(f, 0, SEEK_SET); 3916 amt_read = fread(fileBuf, 1, fileSize, f); 3917 if (amt_read != fileSize || fileSize <= 0) { 3918 errln("Error reading test data file."); 3919 goto cleanUpAndReturn; 3920 } 3921 3922 // 3923 // Look for a Unicode Signature (BOM) on the data just read 3924 // 3925 int32_t signatureLength; 3926 const char * fileBufC; 3927 const char* encoding; 3928 3929 fileBufC = fileBuf; 3930 encoding = ucnv_detectUnicodeSignature( 3931 fileBuf, fileSize, &signatureLength, &status); 3932 if(encoding!=NULL ){ 3933 fileBufC += signatureLength; 3934 fileSize -= signatureLength; 3935 } else { 3936 encoding = defEncoding; 3937 if (strcmp(encoding, "utf-8") == 0) { 3938 errln("file %s is missing its BOM", fileName); 3939 } 3940 } 3941 3942 // 3943 // Open a converter to take the rule file to UTF-16 3944 // 3945 conv = ucnv_open(encoding, &status); 3946 if (U_FAILURE(status)) { 3947 goto cleanUpAndReturn; 3948 } 3949 3950 // 3951 // Convert the rules to UChar. 3952 // Preflight first to determine required buffer size. 3953 // 3954 ulen = ucnv_toUChars(conv, 3955 NULL, // dest, 3956 0, // destCapacity, 3957 fileBufC, 3958 fileSize, 3959 &status); 3960 if (status == U_BUFFER_OVERFLOW_ERROR) { 3961 // Buffer Overflow is expected from the preflight operation. 3962 status = U_ZERO_ERROR; 3963 3964 retPtr = new UChar[ulen+1]; 3965 ucnv_toUChars(conv, 3966 retPtr, // dest, 3967 ulen+1, 3968 fileBufC, 3969 fileSize, 3970 &status); 3971 } 3972 3973 cleanUpAndReturn: 3974 fclose(f); 3975 delete[] fileBuf; 3976 ucnv_close(conv); 3977 if (U_FAILURE(status)) { 3978 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3979 delete []retPtr; 3980 retPtr = 0; 3981 ulen = 0; 3982 }; 3983 return retPtr; 3984 } 3985 3986 3987 //------------------------------------------------------------------------------- 3988 // 3989 // PerlTests - Run Perl's regular expression tests 3990 // The input file for this test is re_tests, the standard regular 3991 // expression test data distributed with the Perl source code. 3992 // 3993 // Here is Perl's description of the test data file: 3994 // 3995 // # The tests are in a separate file 't/op/re_tests'. 3996 // # Each line in that file is a separate test. 3997 // # There are five columns, separated by tabs. 3998 // # 3999 // # Column 1 contains the pattern, optionally enclosed in C<''>. 4000 // # Modifiers can be put after the closing C<'>. 4001 // # 4002 // # Column 2 contains the string to be matched. 4003 // # 4004 // # Column 3 contains the expected result: 4005 // # y expect a match 4006 // # n expect no match 4007 // # c expect an error 4008 // # B test exposes a known bug in Perl, should be skipped 4009 // # b test exposes a known bug in Perl, should be skipped if noamp 4010 // # 4011 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 4012 // # 4013 // # Column 4 contains a string, usually C<$&>. 4014 // # 4015 // # Column 5 contains the expected result of double-quote 4016 // # interpolating that string after the match, or start of error message. 4017 // # 4018 // # Column 6, if present, contains a reason why the test is skipped. 4019 // # This is printed with "skipped", for harness to pick up. 4020 // # 4021 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 4022 // # 4023 // # If you want to add a regular expression test that can't be expressed 4024 // # in this format, don't add it here: put it in op/pat.t instead. 4025 // 4026 // For ICU, if field 3 contains an 'i', the test will be skipped. 4027 // The test exposes is some known incompatibility between ICU and Perl regexps. 4028 // (The i is in addition to whatever was there before.) 4029 // 4030 //------------------------------------------------------------------------------- 4031 void RegexTest::PerlTests() { 4032 char tdd[2048]; 4033 const char *srcPath; 4034 UErrorCode status = U_ZERO_ERROR; 4035 UParseError pe; 4036 4037 // 4038 // Open and read the test data file. 4039 // 4040 srcPath=getPath(tdd, "re_tests.txt"); 4041 if(srcPath==NULL) { 4042 return; /* something went wrong, error already output */ 4043 } 4044 4045 int32_t len; 4046 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4047 if (U_FAILURE(status)) { 4048 return; /* something went wrong, error already output */ 4049 } 4050 4051 // 4052 // Put the test data into a UnicodeString 4053 // 4054 UnicodeString testDataString(FALSE, testData, len); 4055 4056 // 4057 // Regex to break the input file into lines, and strip the new lines. 4058 // One line per match, capture group one is the desired data. 4059 // 4060 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4061 if (U_FAILURE(status)) { 4062 dataerrln("RegexPattern::compile() error"); 4063 return; 4064 } 4065 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4066 4067 // 4068 // Regex to split a test file line into fields. 4069 // There are six fields, separated by tabs. 4070 // 4071 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4072 4073 // 4074 // Regex to identify test patterns with flag settings, and to separate them. 4075 // Test patterns with flags look like 'pattern'i 4076 // Test patterns without flags are not quoted: pattern 4077 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4078 // 4079 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4080 RegexMatcher* flagMat = flagPat->matcher(status); 4081 4082 // 4083 // The Perl tests reference several perl-isms, which are evaluated/substituted 4084 // in the test data. Not being perl, this must be done explicitly. Here 4085 // are string constants and REs for these constructs. 4086 // 4087 UnicodeString nulnulSrc("${nulnul}"); 4088 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4089 nulnul = nulnul.unescape(); 4090 4091 UnicodeString ffffSrc("${ffff}"); 4092 UnicodeString ffff("\\uffff", -1, US_INV); 4093 ffff = ffff.unescape(); 4094 4095 // regexp for $-[0], $+[2], etc. 4096 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4097 RegexMatcher *groupsMat = groupsPat->matcher(status); 4098 4099 // regexp for $0, $1, $2, etc. 4100 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4101 RegexMatcher *cgMat = cgPat->matcher(status); 4102 4103 4104 // 4105 // Main Loop for the Perl Tests, runs once per line from the 4106 // test data file. 4107 // 4108 int32_t lineNum = 0; 4109 int32_t skippedUnimplementedCount = 0; 4110 while (lineMat->find()) { 4111 lineNum++; 4112 4113 // 4114 // Get a line, break it into its fields, do the Perl 4115 // variable substitutions. 4116 // 4117 UnicodeString line = lineMat->group(1, status); 4118 UnicodeString fields[7]; 4119 fieldPat->split(line, fields, 7, status); 4120 4121 flagMat->reset(fields[0]); 4122 flagMat->matches(status); 4123 UnicodeString pattern = flagMat->group(2, status); 4124 pattern.findAndReplace("${bang}", "!"); 4125 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4126 pattern.findAndReplace(ffffSrc, ffff); 4127 4128 // 4129 // Identify patterns that include match flag settings, 4130 // split off the flags, remove the extra quotes. 4131 // 4132 UnicodeString flagStr = flagMat->group(3, status); 4133 if (U_FAILURE(status)) { 4134 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4135 return; 4136 } 4137 int32_t flags = 0; 4138 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4139 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4140 const UChar UChar_m = 0x6d; 4141 const UChar UChar_x = 0x78; 4142 const UChar UChar_y = 0x79; 4143 if (flagStr.indexOf(UChar_i) != -1) { 4144 flags |= UREGEX_CASE_INSENSITIVE; 4145 } 4146 if (flagStr.indexOf(UChar_m) != -1) { 4147 flags |= UREGEX_MULTILINE; 4148 } 4149 if (flagStr.indexOf(UChar_x) != -1) { 4150 flags |= UREGEX_COMMENTS; 4151 } 4152 4153 // 4154 // Compile the test pattern. 4155 // 4156 status = U_ZERO_ERROR; 4157 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4158 if (status == U_REGEX_UNIMPLEMENTED) { 4159 // 4160 // Test of a feature that is planned for ICU, but not yet implemented. 4161 // skip the test. 4162 skippedUnimplementedCount++; 4163 delete testPat; 4164 status = U_ZERO_ERROR; 4165 continue; 4166 } 4167 4168 if (U_FAILURE(status)) { 4169 // Some tests are supposed to generate errors. 4170 // Only report an error for tests that are supposed to succeed. 4171 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4172 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4173 { 4174 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4175 } 4176 status = U_ZERO_ERROR; 4177 delete testPat; 4178 continue; 4179 } 4180 4181 if (fields[2].indexOf(UChar_i) >= 0) { 4182 // ICU should skip this test. 4183 delete testPat; 4184 continue; 4185 } 4186 4187 if (fields[2].indexOf(UChar_c) >= 0) { 4188 // This pattern should have caused a compilation error, but didn't/ 4189 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4190 delete testPat; 4191 continue; 4192 } 4193 4194 // 4195 // replace the Perl variables that appear in some of the 4196 // match data strings. 4197 // 4198 UnicodeString matchString = fields[1]; 4199 matchString.findAndReplace(nulnulSrc, nulnul); 4200 matchString.findAndReplace(ffffSrc, ffff); 4201 4202 // Replace any \n in the match string with an actual new-line char. 4203 // Don't do full unescape, as this unescapes more than Perl does, which 4204 // causes other spurious failures in the tests. 4205 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4206 4207 4208 4209 // 4210 // Run the test, check for expected match/don't match result. 4211 // 4212 RegexMatcher *testMat = testPat->matcher(matchString, status); 4213 UBool found = testMat->find(); 4214 UBool expected = FALSE; 4215 if (fields[2].indexOf(UChar_y) >=0) { 4216 expected = TRUE; 4217 } 4218 if (expected != found) { 4219 errln("line %d: Expected %smatch, got %smatch", 4220 lineNum, expected?"":"no ", found?"":"no " ); 4221 continue; 4222 } 4223 4224 // Don't try to check expected results if there is no match. 4225 // (Some have stuff in the expected fields) 4226 if (!found) { 4227 delete testMat; 4228 delete testPat; 4229 continue; 4230 } 4231 4232 // 4233 // Interpret the Perl expression from the fourth field of the data file, 4234 // building up an ICU string from the results of the ICU match. 4235 // The Perl expression will contain references to the results of 4236 // a regex match, including the matched string, capture group strings, 4237 // group starting and ending indicies, etc. 4238 // 4239 UnicodeString resultString; 4240 UnicodeString perlExpr = fields[3]; 4241 #if SUPPORT_MUTATING_INPUT_STRING 4242 groupsMat->reset(perlExpr); 4243 cgMat->reset(perlExpr); 4244 #endif 4245 4246 while (perlExpr.length() > 0) { 4247 #if !SUPPORT_MUTATING_INPUT_STRING 4248 // Perferred usage. Reset after any modification to input string. 4249 groupsMat->reset(perlExpr); 4250 cgMat->reset(perlExpr); 4251 #endif 4252 4253 if (perlExpr.startsWith("$&")) { 4254 resultString.append(testMat->group(status)); 4255 perlExpr.remove(0, 2); 4256 } 4257 4258 else if (groupsMat->lookingAt(status)) { 4259 // $-[0] $+[2] etc. 4260 UnicodeString digitString = groupsMat->group(2, status); 4261 int32_t t = 0; 4262 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4263 UnicodeString plusOrMinus = groupsMat->group(1, status); 4264 int32_t matchPosition; 4265 if (plusOrMinus.compare("+") == 0) { 4266 matchPosition = testMat->end(groupNum, status); 4267 } else { 4268 matchPosition = testMat->start(groupNum, status); 4269 } 4270 if (matchPosition != -1) { 4271 ICU_Utility::appendNumber(resultString, matchPosition); 4272 } 4273 perlExpr.remove(0, groupsMat->end(status)); 4274 } 4275 4276 else if (cgMat->lookingAt(status)) { 4277 // $1, $2, $3, etc. 4278 UnicodeString digitString = cgMat->group(1, status); 4279 int32_t t = 0; 4280 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4281 if (U_SUCCESS(status)) { 4282 resultString.append(testMat->group(groupNum, status)); 4283 status = U_ZERO_ERROR; 4284 } 4285 perlExpr.remove(0, cgMat->end(status)); 4286 } 4287 4288 else if (perlExpr.startsWith("@-")) { 4289 int32_t i; 4290 for (i=0; i<=testMat->groupCount(); i++) { 4291 if (i>0) { 4292 resultString.append(" "); 4293 } 4294 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4295 } 4296 perlExpr.remove(0, 2); 4297 } 4298 4299 else if (perlExpr.startsWith("@+")) { 4300 int32_t i; 4301 for (i=0; i<=testMat->groupCount(); i++) { 4302 if (i>0) { 4303 resultString.append(" "); 4304 } 4305 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4306 } 4307 perlExpr.remove(0, 2); 4308 } 4309 4310 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4311 // or as an escaped sequence (e.g. \n) 4312 if (perlExpr.length() > 1) { 4313 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4314 } 4315 UChar c = perlExpr.charAt(0); 4316 switch (c) { 4317 case 'n': c = '\n'; break; 4318 // add any other escape sequences that show up in the test expected results. 4319 } 4320 resultString.append(c); 4321 perlExpr.remove(0, 1); 4322 } 4323 4324 else { 4325 // Any characters from the perl expression that we don't explicitly 4326 // recognize before here are assumed to be literals and copied 4327 // as-is to the expected results. 4328 resultString.append(perlExpr.charAt(0)); 4329 perlExpr.remove(0, 1); 4330 } 4331 4332 if (U_FAILURE(status)) { 4333 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4334 break; 4335 } 4336 } 4337 4338 // 4339 // Expected Results Compare 4340 // 4341 UnicodeString expectedS(fields[4]); 4342 expectedS.findAndReplace(nulnulSrc, nulnul); 4343 expectedS.findAndReplace(ffffSrc, ffff); 4344 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4345 4346 4347 if (expectedS.compare(resultString) != 0) { 4348 err("Line %d: Incorrect perl expression results.", lineNum); 4349 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4350 } 4351 4352 delete testMat; 4353 delete testPat; 4354 } 4355 4356 // 4357 // All done. Clean up allocated stuff. 4358 // 4359 delete cgMat; 4360 delete cgPat; 4361 4362 delete groupsMat; 4363 delete groupsPat; 4364 4365 delete flagMat; 4366 delete flagPat; 4367 4368 delete lineMat; 4369 delete linePat; 4370 4371 delete fieldPat; 4372 delete [] testData; 4373 4374 4375 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4376 4377 } 4378 4379 4380 //------------------------------------------------------------------------------- 4381 // 4382 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4383 // (instead of using UnicodeStrings) to test the alternate engine. 4384 // The input file for this test is re_tests, the standard regular 4385 // expression test data distributed with the Perl source code. 4386 // See PerlTests() for more information. 4387 // 4388 //------------------------------------------------------------------------------- 4389 void RegexTest::PerlTestsUTF8() { 4390 char tdd[2048]; 4391 const char *srcPath; 4392 UErrorCode status = U_ZERO_ERROR; 4393 UParseError pe; 4394 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4395 UText patternText = UTEXT_INITIALIZER; 4396 char *patternChars = NULL; 4397 int32_t patternLength; 4398 int32_t patternCapacity = 0; 4399 UText inputText = UTEXT_INITIALIZER; 4400 char *inputChars = NULL; 4401 int32_t inputLength; 4402 int32_t inputCapacity = 0; 4403 4404 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4405 4406 // 4407 // Open and read the test data file. 4408 // 4409 srcPath=getPath(tdd, "re_tests.txt"); 4410 if(srcPath==NULL) { 4411 return; /* something went wrong, error already output */ 4412 } 4413 4414 int32_t len; 4415 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4416 if (U_FAILURE(status)) { 4417 return; /* something went wrong, error already output */ 4418 } 4419 4420 // 4421 // Put the test data into a UnicodeString 4422 // 4423 UnicodeString testDataString(FALSE, testData, len); 4424 4425 // 4426 // Regex to break the input file into lines, and strip the new lines. 4427 // One line per match, capture group one is the desired data. 4428 // 4429 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4430 if (U_FAILURE(status)) { 4431 dataerrln("RegexPattern::compile() error"); 4432 return; 4433 } 4434 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4435 4436 // 4437 // Regex to split a test file line into fields. 4438 // There are six fields, separated by tabs. 4439 // 4440 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4441 4442 // 4443 // Regex to identify test patterns with flag settings, and to separate them. 4444 // Test patterns with flags look like 'pattern'i 4445 // Test patterns without flags are not quoted: pattern 4446 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4447 // 4448 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4449 RegexMatcher* flagMat = flagPat->matcher(status); 4450 4451 // 4452 // The Perl tests reference several perl-isms, which are evaluated/substituted 4453 // in the test data. Not being perl, this must be done explicitly. Here 4454 // are string constants and REs for these constructs. 4455 // 4456 UnicodeString nulnulSrc("${nulnul}"); 4457 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4458 nulnul = nulnul.unescape(); 4459 4460 UnicodeString ffffSrc("${ffff}"); 4461 UnicodeString ffff("\\uffff", -1, US_INV); 4462 ffff = ffff.unescape(); 4463 4464 // regexp for $-[0], $+[2], etc. 4465 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4466 RegexMatcher *groupsMat = groupsPat->matcher(status); 4467 4468 // regexp for $0, $1, $2, etc. 4469 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4470 RegexMatcher *cgMat = cgPat->matcher(status); 4471 4472 4473 // 4474 // Main Loop for the Perl Tests, runs once per line from the 4475 // test data file. 4476 // 4477 int32_t lineNum = 0; 4478 int32_t skippedUnimplementedCount = 0; 4479 while (lineMat->find()) { 4480 lineNum++; 4481 4482 // 4483 // Get a line, break it into its fields, do the Perl 4484 // variable substitutions. 4485 // 4486 UnicodeString line = lineMat->group(1, status); 4487 UnicodeString fields[7]; 4488 fieldPat->split(line, fields, 7, status); 4489 4490 flagMat->reset(fields[0]); 4491 flagMat->matches(status); 4492 UnicodeString pattern = flagMat->group(2, status); 4493 pattern.findAndReplace("${bang}", "!"); 4494 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4495 pattern.findAndReplace(ffffSrc, ffff); 4496 4497 // 4498 // Identify patterns that include match flag settings, 4499 // split off the flags, remove the extra quotes. 4500 // 4501 UnicodeString flagStr = flagMat->group(3, status); 4502 if (U_FAILURE(status)) { 4503 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4504 return; 4505 } 4506 int32_t flags = 0; 4507 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4508 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4509 const UChar UChar_m = 0x6d; 4510 const UChar UChar_x = 0x78; 4511 const UChar UChar_y = 0x79; 4512 if (flagStr.indexOf(UChar_i) != -1) { 4513 flags |= UREGEX_CASE_INSENSITIVE; 4514 } 4515 if (flagStr.indexOf(UChar_m) != -1) { 4516 flags |= UREGEX_MULTILINE; 4517 } 4518 if (flagStr.indexOf(UChar_x) != -1) { 4519 flags |= UREGEX_COMMENTS; 4520 } 4521 4522 // 4523 // Put the pattern in a UTF-8 UText 4524 // 4525 status = U_ZERO_ERROR; 4526 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4527 if (status == U_BUFFER_OVERFLOW_ERROR) { 4528 status = U_ZERO_ERROR; 4529 delete[] patternChars; 4530 patternCapacity = patternLength + 1; 4531 patternChars = new char[patternCapacity]; 4532 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4533 } 4534 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4535 4536 // 4537 // Compile the test pattern. 4538 // 4539 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4540 if (status == U_REGEX_UNIMPLEMENTED) { 4541 // 4542 // Test of a feature that is planned for ICU, but not yet implemented. 4543 // skip the test. 4544 skippedUnimplementedCount++; 4545 delete testPat; 4546 status = U_ZERO_ERROR; 4547 continue; 4548 } 4549 4550 if (U_FAILURE(status)) { 4551 // Some tests are supposed to generate errors. 4552 // Only report an error for tests that are supposed to succeed. 4553 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4554 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4555 { 4556 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4557 } 4558 status = U_ZERO_ERROR; 4559 delete testPat; 4560 continue; 4561 } 4562 4563 if (fields[2].indexOf(UChar_i) >= 0) { 4564 // ICU should skip this test. 4565 delete testPat; 4566 continue; 4567 } 4568 4569 if (fields[2].indexOf(UChar_c) >= 0) { 4570 // This pattern should have caused a compilation error, but didn't/ 4571 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4572 delete testPat; 4573 continue; 4574 } 4575 4576 4577 // 4578 // replace the Perl variables that appear in some of the 4579 // match data strings. 4580 // 4581 UnicodeString matchString = fields[1]; 4582 matchString.findAndReplace(nulnulSrc, nulnul); 4583 matchString.findAndReplace(ffffSrc, ffff); 4584 4585 // Replace any \n in the match string with an actual new-line char. 4586 // Don't do full unescape, as this unescapes more than Perl does, which 4587 // causes other spurious failures in the tests. 4588 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4589 4590 // 4591 // Put the input in a UTF-8 UText 4592 // 4593 status = U_ZERO_ERROR; 4594 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4595 if (status == U_BUFFER_OVERFLOW_ERROR) { 4596 status = U_ZERO_ERROR; 4597 delete[] inputChars; 4598 inputCapacity = inputLength + 1; 4599 inputChars = new char[inputCapacity]; 4600 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4601 } 4602 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4603 4604 // 4605 // Run the test, check for expected match/don't match result. 4606 // 4607 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4608 UBool found = testMat->find(); 4609 UBool expected = FALSE; 4610 if (fields[2].indexOf(UChar_y) >=0) { 4611 expected = TRUE; 4612 } 4613 if (expected != found) { 4614 errln("line %d: Expected %smatch, got %smatch", 4615 lineNum, expected?"":"no ", found?"":"no " ); 4616 continue; 4617 } 4618 4619 // Don't try to check expected results if there is no match. 4620 // (Some have stuff in the expected fields) 4621 if (!found) { 4622 delete testMat; 4623 delete testPat; 4624 continue; 4625 } 4626 4627 // 4628 // Interpret the Perl expression from the fourth field of the data file, 4629 // building up an ICU string from the results of the ICU match. 4630 // The Perl expression will contain references to the results of 4631 // a regex match, including the matched string, capture group strings, 4632 // group starting and ending indicies, etc. 4633 // 4634 UnicodeString resultString; 4635 UnicodeString perlExpr = fields[3]; 4636 4637 while (perlExpr.length() > 0) { 4638 groupsMat->reset(perlExpr); 4639 cgMat->reset(perlExpr); 4640 4641 if (perlExpr.startsWith("$&")) { 4642 resultString.append(testMat->group(status)); 4643 perlExpr.remove(0, 2); 4644 } 4645 4646 else if (groupsMat->lookingAt(status)) { 4647 // $-[0] $+[2] etc. 4648 UnicodeString digitString = groupsMat->group(2, status); 4649 int32_t t = 0; 4650 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4651 UnicodeString plusOrMinus = groupsMat->group(1, status); 4652 int32_t matchPosition; 4653 if (plusOrMinus.compare("+") == 0) { 4654 matchPosition = testMat->end(groupNum, status); 4655 } else { 4656 matchPosition = testMat->start(groupNum, status); 4657 } 4658 if (matchPosition != -1) { 4659 ICU_Utility::appendNumber(resultString, matchPosition); 4660 } 4661 perlExpr.remove(0, groupsMat->end(status)); 4662 } 4663 4664 else if (cgMat->lookingAt(status)) { 4665 // $1, $2, $3, etc. 4666 UnicodeString digitString = cgMat->group(1, status); 4667 int32_t t = 0; 4668 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4669 if (U_SUCCESS(status)) { 4670 resultString.append(testMat->group(groupNum, status)); 4671 status = U_ZERO_ERROR; 4672 } 4673 perlExpr.remove(0, cgMat->end(status)); 4674 } 4675 4676 else if (perlExpr.startsWith("@-")) { 4677 int32_t i; 4678 for (i=0; i<=testMat->groupCount(); i++) { 4679 if (i>0) { 4680 resultString.append(" "); 4681 } 4682 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4683 } 4684 perlExpr.remove(0, 2); 4685 } 4686 4687 else if (perlExpr.startsWith("@+")) { 4688 int32_t i; 4689 for (i=0; i<=testMat->groupCount(); i++) { 4690 if (i>0) { 4691 resultString.append(" "); 4692 } 4693 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4694 } 4695 perlExpr.remove(0, 2); 4696 } 4697 4698 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4699 // or as an escaped sequence (e.g. \n) 4700 if (perlExpr.length() > 1) { 4701 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4702 } 4703 UChar c = perlExpr.charAt(0); 4704 switch (c) { 4705 case 'n': c = '\n'; break; 4706 // add any other escape sequences that show up in the test expected results. 4707 } 4708 resultString.append(c); 4709 perlExpr.remove(0, 1); 4710 } 4711 4712 else { 4713 // Any characters from the perl expression that we don't explicitly 4714 // recognize before here are assumed to be literals and copied 4715 // as-is to the expected results. 4716 resultString.append(perlExpr.charAt(0)); 4717 perlExpr.remove(0, 1); 4718 } 4719 4720 if (U_FAILURE(status)) { 4721 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4722 break; 4723 } 4724 } 4725 4726 // 4727 // Expected Results Compare 4728 // 4729 UnicodeString expectedS(fields[4]); 4730 expectedS.findAndReplace(nulnulSrc, nulnul); 4731 expectedS.findAndReplace(ffffSrc, ffff); 4732 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4733 4734 4735 if (expectedS.compare(resultString) != 0) { 4736 err("Line %d: Incorrect perl expression results.", lineNum); 4737 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4738 } 4739 4740 delete testMat; 4741 delete testPat; 4742 } 4743 4744 // 4745 // All done. Clean up allocated stuff. 4746 // 4747 delete cgMat; 4748 delete cgPat; 4749 4750 delete groupsMat; 4751 delete groupsPat; 4752 4753 delete flagMat; 4754 delete flagPat; 4755 4756 delete lineMat; 4757 delete linePat; 4758 4759 delete fieldPat; 4760 delete [] testData; 4761 4762 utext_close(&patternText); 4763 utext_close(&inputText); 4764 4765 delete [] patternChars; 4766 delete [] inputChars; 4767 4768 4769 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4770 4771 } 4772 4773 4774 //-------------------------------------------------------------- 4775 // 4776 // Bug6149 Verify limits to heap expansion for backtrack stack. 4777 // Use this pattern, 4778 // "(a?){1,8000000}" 4779 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled. 4780 // This test is likely to be fragile, as further optimizations stop 4781 // more cases of pointless looping in the match engine. 4782 // 4783 //--------------------------------------------------------------- 4784 void RegexTest::Bug6149() { 4785 UnicodeString pattern("(a?){1,8000000}"); 4786 UnicodeString s("xyz"); 4787 uint32_t flags = 0; 4788 UErrorCode status = U_ZERO_ERROR; 4789 4790 RegexMatcher matcher(pattern, s, flags, status); 4791 UBool result = false; 4792 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4793 REGEX_ASSERT(result == FALSE); 4794 } 4795 4796 4797 // 4798 // Callbacks() Test the callback function. 4799 // When set, callbacks occur periodically during matching operations, 4800 // giving the application code the ability to abort the operation 4801 // before it's normal completion. 4802 // 4803 4804 struct callBackContext { 4805 RegexTest *test; 4806 int32_t maxCalls; 4807 int32_t numCalls; 4808 int32_t lastSteps; 4809 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4810 }; 4811 4812 U_CDECL_BEGIN 4813 static UBool U_CALLCONV 4814 testCallBackFn(const void *context, int32_t steps) { 4815 callBackContext *info = (callBackContext *)context; 4816 if (info->lastSteps+1 != steps) { 4817 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 4818 } 4819 info->lastSteps = steps; 4820 info->numCalls++; 4821 return (info->numCalls < info->maxCalls); 4822 } 4823 U_CDECL_END 4824 4825 void RegexTest::Callbacks() { 4826 { 4827 // Getter returns NULLs if no callback has been set 4828 4829 // The variables that the getter will fill in. 4830 // Init to non-null values so that the action of the getter can be seen. 4831 const void *returnedContext = &returnedContext; 4832 URegexMatchCallback *returnedFn = &testCallBackFn; 4833 4834 UErrorCode status = U_ZERO_ERROR; 4835 RegexMatcher matcher("x", 0, status); 4836 REGEX_CHECK_STATUS; 4837 matcher.getMatchCallback(returnedFn, returnedContext, status); 4838 REGEX_CHECK_STATUS; 4839 REGEX_ASSERT(returnedFn == NULL); 4840 REGEX_ASSERT(returnedContext == NULL); 4841 } 4842 4843 { 4844 // Set and Get work 4845 callBackContext cbInfo = {this, 0, 0, 0}; 4846 const void *returnedContext; 4847 URegexMatchCallback *returnedFn; 4848 UErrorCode status = U_ZERO_ERROR; 4849 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 4850 REGEX_CHECK_STATUS; 4851 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4852 REGEX_CHECK_STATUS; 4853 matcher.getMatchCallback(returnedFn, returnedContext, status); 4854 REGEX_CHECK_STATUS; 4855 REGEX_ASSERT(returnedFn == testCallBackFn); 4856 REGEX_ASSERT(returnedContext == &cbInfo); 4857 4858 // A short-running match shouldn't invoke the callback 4859 status = U_ZERO_ERROR; 4860 cbInfo.reset(1); 4861 UnicodeString s = "xxx"; 4862 matcher.reset(s); 4863 REGEX_ASSERT(matcher.matches(status)); 4864 REGEX_CHECK_STATUS; 4865 REGEX_ASSERT(cbInfo.numCalls == 0); 4866 4867 // A medium-length match that runs long enough to invoke the 4868 // callback, but not so long that the callback aborts it. 4869 status = U_ZERO_ERROR; 4870 cbInfo.reset(4); 4871 s = "aaaaaaaaaaaaaaaaaaab"; 4872 matcher.reset(s); 4873 REGEX_ASSERT(matcher.matches(status)==FALSE); 4874 REGEX_CHECK_STATUS; 4875 REGEX_ASSERT(cbInfo.numCalls > 0); 4876 4877 // A longer running match that the callback function will abort. 4878 status = U_ZERO_ERROR; 4879 cbInfo.reset(4); 4880 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4881 matcher.reset(s); 4882 REGEX_ASSERT(matcher.matches(status)==FALSE); 4883 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4884 REGEX_ASSERT(cbInfo.numCalls == 4); 4885 4886 // A longer running find that the callback function will abort. 4887 status = U_ZERO_ERROR; 4888 cbInfo.reset(4); 4889 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4890 matcher.reset(s); 4891 REGEX_ASSERT(matcher.find(status)==FALSE); 4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4893 REGEX_ASSERT(cbInfo.numCalls == 4); 4894 } 4895 4896 4897 } 4898 4899 4900 // 4901 // FindProgressCallbacks() Test the find "progress" callback function. 4902 // When set, the find progress callback will be invoked during a find operations 4903 // after each return from a match attempt, giving the application the opportunity 4904 // to terminate a long-running find operation before it's normal completion. 4905 // 4906 4907 struct progressCallBackContext { 4908 RegexTest *test; 4909 int64_t lastIndex; 4910 int32_t maxCalls; 4911 int32_t numCalls; 4912 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; 4913 }; 4914 4915 // call-back function for find(). 4916 // Return TRUE to continue the find(). 4917 // Return FALSE to stop the find(). 4918 U_CDECL_BEGIN 4919 static UBool U_CALLCONV 4920 testProgressCallBackFn(const void *context, int64_t matchIndex) { 4921 progressCallBackContext *info = (progressCallBackContext *)context; 4922 info->numCalls++; 4923 info->lastIndex = matchIndex; 4924 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls); 4925 return (info->numCalls < info->maxCalls); 4926 } 4927 U_CDECL_END 4928 4929 void RegexTest::FindProgressCallbacks() { 4930 { 4931 // Getter returns NULLs if no callback has been set 4932 4933 // The variables that the getter will fill in. 4934 // Init to non-null values so that the action of the getter can be seen. 4935 const void *returnedContext = &returnedContext; 4936 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; 4937 4938 UErrorCode status = U_ZERO_ERROR; 4939 RegexMatcher matcher("x", 0, status); 4940 REGEX_CHECK_STATUS; 4941 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4942 REGEX_CHECK_STATUS; 4943 REGEX_ASSERT(returnedFn == NULL); 4944 REGEX_ASSERT(returnedContext == NULL); 4945 } 4946 4947 { 4948 // Set and Get work 4949 progressCallBackContext cbInfo = {this, 0, 0, 0}; 4950 const void *returnedContext; 4951 URegexFindProgressCallback *returnedFn; 4952 UErrorCode status = U_ZERO_ERROR; 4953 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); 4954 REGEX_CHECK_STATUS; 4955 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); 4956 REGEX_CHECK_STATUS; 4957 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4958 REGEX_CHECK_STATUS; 4959 REGEX_ASSERT(returnedFn == testProgressCallBackFn); 4960 REGEX_ASSERT(returnedContext == &cbInfo); 4961 4962 // A find that matches on the initial position does NOT invoke the callback. 4963 status = U_ZERO_ERROR; 4964 cbInfo.reset(100); 4965 UnicodeString s = "aaxxx"; 4966 matcher.reset(s); 4967 #if 0 4968 matcher.setTrace(TRUE); 4969 #endif 4970 REGEX_ASSERT(matcher.find(0, status)); 4971 REGEX_CHECK_STATUS; 4972 REGEX_ASSERT(cbInfo.numCalls == 0); 4973 4974 // A medium running find() that causes matcher.find() to invoke our callback for each index, 4975 // but not so many times that we interrupt the operation. 4976 status = U_ZERO_ERROR; 4977 s = "aaaaaaaaaaaaaaaaaaab"; 4978 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string 4979 matcher.reset(s); 4980 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4981 REGEX_CHECK_STATUS; 4982 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); 4983 4984 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. 4985 status = U_ZERO_ERROR; 4986 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; 4987 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string 4988 matcher.reset(s1); 4989 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4990 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4991 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); 4992 4993 // Now a match that will succeed, but after an interruption 4994 status = U_ZERO_ERROR; 4995 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; 4996 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string 4997 matcher.reset(s2); 4998 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 5000 // Now retry the match from where left off 5001 cbInfo.maxCalls = 100; // No callback limit 5002 status = U_ZERO_ERROR; 5003 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); 5004 REGEX_CHECK_STATUS; 5005 } 5006 5007 5008 } 5009 5010 5011 //--------------------------------------------------------------------------- 5012 // 5013 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 5014 // UTexts. The pure-C implementation of UText 5015 // has no mutable backing stores, but we can 5016 // use UnicodeString here to test the functionality. 5017 // 5018 //--------------------------------------------------------------------------- 5019 void RegexTest::PreAllocatedUTextCAPI () { 5020 UErrorCode status = U_ZERO_ERROR; 5021 URegularExpression *re; 5022 UText patternText = UTEXT_INITIALIZER; 5023 UnicodeString buffer; 5024 UText bufferText = UTEXT_INITIALIZER; 5025 5026 utext_openUnicodeString(&bufferText, &buffer, &status); 5027 5028 /* 5029 * getText() and getUText() 5030 */ 5031 { 5032 UText text1 = UTEXT_INITIALIZER; 5033 UText text2 = UTEXT_INITIALIZER; 5034 UChar text2Chars[20]; 5035 UText *resultText; 5036 5037 status = U_ZERO_ERROR; 5038 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 5039 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 5040 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 5041 utext_openUChars(&text2, text2Chars, -1, &status); 5042 5043 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 5044 re = uregex_openUText(&patternText, 0, NULL, &status); 5045 5046 /* First set a UText */ 5047 uregex_setUText(re, &text1, &status); 5048 resultText = uregex_getUText(re, &bufferText, &status); 5049 REGEX_CHECK_STATUS; 5050 REGEX_ASSERT(resultText == &bufferText); 5051 utext_setNativeIndex(resultText, 0); 5052 utext_setNativeIndex(&text1, 0); 5053 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5054 5055 resultText = uregex_getUText(re, &bufferText, &status); 5056 REGEX_CHECK_STATUS; 5057 REGEX_ASSERT(resultText == &bufferText); 5058 utext_setNativeIndex(resultText, 0); 5059 utext_setNativeIndex(&text1, 0); 5060 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5061 5062 /* Then set a UChar * */ 5063 uregex_setText(re, text2Chars, 7, &status); 5064 resultText = uregex_getUText(re, &bufferText, &status); 5065 REGEX_CHECK_STATUS; 5066 REGEX_ASSERT(resultText == &bufferText); 5067 utext_setNativeIndex(resultText, 0); 5068 utext_setNativeIndex(&text2, 0); 5069 REGEX_ASSERT(testUTextEqual(resultText, &text2)); 5070 5071 uregex_close(re); 5072 utext_close(&text1); 5073 utext_close(&text2); 5074 } 5075 5076 /* 5077 * group() 5078 */ 5079 { 5080 UChar text1[80]; 5081 UText *actual; 5082 UBool result; 5083 int64_t length = 0; 5084 5085 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); 5086 // 012345678901234567890123456789012345678901234567 5087 // 0 1 2 3 4 5088 5089 status = U_ZERO_ERROR; 5090 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 5091 REGEX_CHECK_STATUS; 5092 5093 uregex_setText(re, text1, -1, &status); 5094 result = uregex_find(re, 0, &status); 5095 REGEX_ASSERT(result==TRUE); 5096 5097 /* Capture Group 0, the full match. Should succeed. "abc interior def" */ 5098 status = U_ZERO_ERROR; 5099 actual = uregex_groupUText(re, 0, &bufferText, &length, &status); 5100 REGEX_CHECK_STATUS; 5101 REGEX_ASSERT(actual == &bufferText); 5102 REGEX_ASSERT(utext_getNativeIndex(actual) == 6); 5103 REGEX_ASSERT(length == 16); 5104 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5105 5106 /* Capture group #1. Should succeed, matching " interior ". */ 5107 status = U_ZERO_ERROR; 5108 actual = uregex_groupUText(re, 1, &bufferText, &length, &status); 5109 REGEX_CHECK_STATUS; 5110 REGEX_ASSERT(actual == &bufferText); 5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " 5112 REGEX_ASSERT(length == 10); 5113 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5114 5115 /* Capture group out of range. Error. */ 5116 status = U_ZERO_ERROR; 5117 actual = uregex_groupUText(re, 2, &bufferText, &length, &status); 5118 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5119 REGEX_ASSERT(actual == &bufferText); 5120 uregex_close(re); 5121 5122 } 5123 5124 /* 5125 * replaceFirst() 5126 */ 5127 { 5128 UChar text1[80]; 5129 UChar text2[80]; 5130 UText replText = UTEXT_INITIALIZER; 5131 UText *result; 5132 status = U_ZERO_ERROR; 5133 utext_openUnicodeString(&bufferText, &buffer, &status); 5134 5135 status = U_ZERO_ERROR; 5136 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); 5137 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); 5138 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5139 5140 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5141 REGEX_CHECK_STATUS; 5142 5143 /* Normal case, with match */ 5144 uregex_setText(re, text1, -1, &status); 5145 REGEX_CHECK_STATUS; 5146 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5147 REGEX_CHECK_STATUS; 5148 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5149 REGEX_CHECK_STATUS; 5150 REGEX_ASSERT(result == &bufferText); 5151 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5152 5153 /* No match. Text should copy to output with no changes. */ 5154 uregex_setText(re, text2, -1, &status); 5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5156 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5157 REGEX_CHECK_STATUS; 5158 REGEX_ASSERT(result == &bufferText); 5159 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5160 5161 /* Unicode escapes */ 5162 uregex_setText(re, text1, -1, &status); 5163 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); 5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5166 REGEX_CHECK_STATUS; 5167 REGEX_ASSERT(result == &bufferText); 5168 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5169 5170 uregex_close(re); 5171 utext_close(&replText); 5172 } 5173 5174 5175 /* 5176 * replaceAll() 5177 */ 5178 { 5179 UChar text1[80]; 5180 UChar text2[80]; 5181 UText replText = UTEXT_INITIALIZER; 5182 UText *result; 5183 5184 status = U_ZERO_ERROR; 5185 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5186 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5187 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5188 5189 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5190 REGEX_CHECK_STATUS; 5191 5192 /* Normal case, with match */ 5193 uregex_setText(re, text1, -1, &status); 5194 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5195 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5196 REGEX_CHECK_STATUS; 5197 REGEX_ASSERT(result == &bufferText); 5198 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5199 5200 /* No match. Text should copy to output with no changes. */ 5201 uregex_setText(re, text2, -1, &status); 5202 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5203 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5204 REGEX_CHECK_STATUS; 5205 REGEX_ASSERT(result == &bufferText); 5206 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5207 5208 uregex_close(re); 5209 utext_close(&replText); 5210 } 5211 5212 5213 /* 5214 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5215 * so we don't need to test it here. 5216 */ 5217 5218 utext_close(&bufferText); 5219 utext_close(&patternText); 5220 } 5221 5222 5223 //-------------------------------------------------------------- 5224 // 5225 // NamedCapture Check basic named capture group functionality 5226 // 5227 //-------------------------------------------------------------- 5228 void RegexTest::NamedCapture() { 5229 UErrorCode status = U_ZERO_ERROR; 5230 RegexPattern *pat = RegexPattern::compile(UnicodeString( 5231 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status); 5232 REGEX_CHECK_STATUS; 5233 int32_t group = pat->groupNumberFromName("five", -1, status); 5234 REGEX_CHECK_STATUS; 5235 REGEX_ASSERT(5 == group); 5236 group = pat->groupNumberFromName("three", -1, status); 5237 REGEX_CHECK_STATUS; 5238 REGEX_ASSERT(3 == group); 5239 5240 status = U_ZERO_ERROR; 5241 group = pat->groupNumberFromName(UnicodeString("six"), status); 5242 REGEX_CHECK_STATUS; 5243 REGEX_ASSERT(6 == group); 5244 5245 status = U_ZERO_ERROR; 5246 group = pat->groupNumberFromName(UnicodeString("nosuch"), status); 5247 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5248 5249 status = U_ZERO_ERROR; 5250 5251 // After copying a pattern, named capture should still work in the copy. 5252 RegexPattern *copiedPat = new RegexPattern(*pat); 5253 REGEX_ASSERT(*copiedPat == *pat); 5254 delete pat; pat = NULL; // Delete original, copy should have no references back to it. 5255 5256 group = copiedPat->groupNumberFromName("five", -1, status); 5257 REGEX_CHECK_STATUS; 5258 REGEX_ASSERT(5 == group); 5259 group = copiedPat->groupNumberFromName("three", -1, status); 5260 REGEX_CHECK_STATUS; 5261 REGEX_ASSERT(3 == group); 5262 delete copiedPat; 5263 5264 // ReplaceAll with named capture group. 5265 status = U_ZERO_ERROR; 5266 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>"); 5267 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status); 5268 REGEX_CHECK_STATUS; 5269 // m.pattern().dumpPattern(); 5270 UnicodeString replacedText = m->replaceAll("'${mid}'", status); 5271 REGEX_CHECK_STATUS; 5272 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); 5273 delete m; 5274 5275 // ReplaceAll, allowed capture group numbers. 5276 text = UnicodeString("abcmxyz"); 5277 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status); 5278 REGEX_CHECK_STATUS; 5279 5280 status = U_ZERO_ERROR; 5281 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. 5282 REGEX_CHECK_STATUS; 5283 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText); 5284 5285 status = U_ZERO_ERROR; 5286 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. 5287 REGEX_CHECK_STATUS; 5288 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5289 5290 status = U_ZERO_ERROR; 5291 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. 5292 REGEX_CHECK_STATUS; 5293 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5294 5295 status = U_ZERO_ERROR; 5296 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. 5297 REGEX_CHECK_STATUS; 5298 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText); 5299 5300 status = U_ZERO_ERROR; 5301 replacedText = m->replaceAll(UnicodeString("<$3>"), status); 5302 REGEX_CHECK_STATUS; 5303 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText); 5304 5305 status = U_ZERO_ERROR; 5306 replacedText = m->replaceAll(UnicodeString("<$4>"), status); 5307 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5308 5309 status = U_ZERO_ERROR; 5310 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, 5311 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. 5312 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText); 5313 5314 status = U_ZERO_ERROR; 5315 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits 5316 REGEX_CHECK_STATUS; // that push group num out of range. 5317 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1. 5318 5319 status = U_ZERO_ERROR; 5320 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); 5321 REGEX_CHECK_STATUS; 5322 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText); 5323 5324 status = U_ZERO_ERROR; 5325 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); 5326 REGEX_CHECK_STATUS; 5327 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); 5328 5329 status = U_ZERO_ERROR; 5330 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); 5331 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5332 5333 status = U_ZERO_ERROR; 5334 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); 5335 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5336 5337 status = U_ZERO_ERROR; 5338 replacedText = m->replaceAll(UnicodeString("<${one"), status); 5339 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5340 5341 status = U_ZERO_ERROR; 5342 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); 5343 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5344 5345 delete m; 5346 5347 // Repeat the above replaceAll() tests using the plain C API, which 5348 // has a separate implementation internally. 5349 // TODO: factor out the test data. 5350 5351 status = U_ZERO_ERROR; 5352 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status); 5353 REGEX_CHECK_STATUS; 5354 text = UnicodeString("abcmxyz"); 5355 uregex_setText(re, text.getBuffer(), text.length(), &status); 5356 REGEX_CHECK_STATUS; 5357 5358 UChar resultBuf[100]; 5359 int32_t resultLength; 5360 UnicodeString repl; 5361 5362 status = U_ZERO_ERROR; 5363 repl = UnicodeString("<$0>"); 5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5365 REGEX_CHECK_STATUS; 5366 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength)); 5367 5368 status = U_ZERO_ERROR; 5369 repl = UnicodeString("<$1>"); 5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5371 REGEX_CHECK_STATUS; 5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5373 5374 status = U_ZERO_ERROR; 5375 repl = UnicodeString("<${one}>"); 5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5377 REGEX_CHECK_STATUS; 5378 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5379 5380 status = U_ZERO_ERROR; 5381 repl = UnicodeString("<$2>"); 5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5383 REGEX_CHECK_STATUS; 5384 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength)); 5385 5386 status = U_ZERO_ERROR; 5387 repl = UnicodeString("<$3>"); 5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5389 REGEX_CHECK_STATUS; 5390 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength)); 5391 5392 status = U_ZERO_ERROR; 5393 repl = UnicodeString("<$4>"); 5394 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5395 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5396 5397 status = U_ZERO_ERROR; 5398 repl = UnicodeString("<$04>"); 5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5400 REGEX_CHECK_STATUS; 5401 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength)); 5402 5403 status = U_ZERO_ERROR; 5404 repl = UnicodeString("<$000016>"); 5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5406 REGEX_CHECK_STATUS; 5407 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength)); 5408 5409 status = U_ZERO_ERROR; 5410 repl = UnicodeString("<$3$2$1${one}>"); 5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5412 REGEX_CHECK_STATUS; 5413 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength)); 5414 5415 status = U_ZERO_ERROR; 5416 repl = UnicodeString("$3$2$1${one}"); 5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5418 REGEX_CHECK_STATUS; 5419 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); 5420 5421 status = U_ZERO_ERROR; 5422 repl = UnicodeString("<${noSuchName}>"); 5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5425 5426 status = U_ZERO_ERROR; 5427 repl = UnicodeString("<${invalid-name}>"); 5428 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5429 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5430 5431 status = U_ZERO_ERROR; 5432 repl = UnicodeString("<${one"); 5433 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5434 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5435 5436 status = U_ZERO_ERROR; 5437 repl = UnicodeString("$not a capture group"); 5438 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5439 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5440 5441 uregex_close(re); 5442 } 5443 5444 //-------------------------------------------------------------- 5445 // 5446 // NamedCaptureLimits Patterns with huge numbers of named capture groups. 5447 // The point is not so much what the exact limit is, 5448 // but that a largish number doesn't hit bad non-linear performance, 5449 // and that exceeding the limit fails cleanly. 5450 // 5451 //-------------------------------------------------------------- 5452 void RegexTest::NamedCaptureLimits() { 5453 if (quick) { 5454 logln("Skipping test. Runs in exhuastive mode only."); 5455 return; 5456 } 5457 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. 5458 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. 5459 char nnbuf[100]; 5460 UnicodeString pattern; 5461 int32_t nn; 5462 5463 for (nn=1; nn<goodLimit; nn++) { 5464 sprintf(nnbuf, "(?<nn%d>)", nn); 5465 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5466 } 5467 UErrorCode status = U_ZERO_ERROR; 5468 RegexPattern *pat = RegexPattern::compile(pattern, 0, status); 5469 REGEX_CHECK_STATUS; 5470 for (nn=1; nn<goodLimit; nn++) { 5471 sprintf(nnbuf, "nn%d", nn); 5472 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status); 5473 REGEX_ASSERT(nn == groupNum); 5474 if (nn != groupNum) { 5475 break; 5476 } 5477 } 5478 delete pat; 5479 5480 pattern.remove(); 5481 for (nn=1; nn<failLimit; nn++) { 5482 sprintf(nnbuf, "(?<nn%d>)", nn); 5483 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5484 } 5485 status = U_ZERO_ERROR; 5486 pat = RegexPattern::compile(pattern, 0, status); 5487 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); 5488 delete pat; 5489 } 5490 5491 5492 //-------------------------------------------------------------- 5493 // 5494 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5495 // 5496 //--------------------------------------------------------------- 5497 void RegexTest::Bug7651() { 5498 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5499 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5500 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 5501 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5502 UnicodeString s("#ff @abcd This is test"); 5503 RegexPattern *REPattern = NULL; 5504 RegexMatcher *REMatcher = NULL; 5505 UErrorCode status = U_ZERO_ERROR; 5506 UParseError pe; 5507 5508 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5509 REGEX_CHECK_STATUS; 5510 REMatcher = REPattern->matcher(s, status); 5511 REGEX_CHECK_STATUS; 5512 REGEX_ASSERT(REMatcher->find()); 5513 REGEX_ASSERT(REMatcher->start(status) == 0); 5514 delete REPattern; 5515 delete REMatcher; 5516 status = U_ZERO_ERROR; 5517 5518 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5519 REGEX_CHECK_STATUS; 5520 REMatcher = REPattern->matcher(s, status); 5521 REGEX_CHECK_STATUS; 5522 REGEX_ASSERT(REMatcher->find()); 5523 REGEX_ASSERT(REMatcher->start(status) == 0); 5524 delete REPattern; 5525 delete REMatcher; 5526 status = U_ZERO_ERROR; 5527 } 5528 5529 void RegexTest::Bug7740() { 5530 UErrorCode status = U_ZERO_ERROR; 5531 UnicodeString pattern = "(a)"; 5532 UnicodeString text = "abcdef"; 5533 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5534 REGEX_CHECK_STATUS; 5535 REGEX_ASSERT(m->lookingAt(status)); 5536 REGEX_CHECK_STATUS; 5537 status = U_ILLEGAL_ARGUMENT_ERROR; 5538 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5539 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5540 REGEX_ASSERT(s == ""); 5541 delete m; 5542 } 5543 5544 // Bug 8479: was crashing whith a Bogus UnicodeString as input. 5545 5546 void RegexTest::Bug8479() { 5547 UErrorCode status = U_ZERO_ERROR; 5548 5549 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5550 REGEX_CHECK_STATUS; 5551 if (U_SUCCESS(status)) 5552 { 5553 UnicodeString str; 5554 str.setToBogus(); 5555 pMatcher->reset(str); 5556 status = U_ZERO_ERROR; 5557 pMatcher->matches(status); 5558 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5559 delete pMatcher; 5560 } 5561 } 5562 5563 5564 // Bug 7029 5565 void RegexTest::Bug7029() { 5566 UErrorCode status = U_ZERO_ERROR; 5567 5568 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5569 UnicodeString text = "abc.def"; 5570 UnicodeString splits[10]; 5571 REGEX_CHECK_STATUS; 5572 int32_t numFields = pMatcher->split(text, splits, 10, status); 5573 REGEX_CHECK_STATUS; 5574 REGEX_ASSERT(numFields == 8); 5575 delete pMatcher; 5576 } 5577 5578 // Bug 9283 5579 // This test is checking for the existance of any supplemental characters that case-fold 5580 // to a bmp character. 5581 // 5582 // At the time of this writing there are none. If any should appear in a subsequent release 5583 // of Unicode, the code in regular expressions compilation that determines the longest 5584 // posssible match for a literal string will need to be enhanced. 5585 // 5586 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() 5587 // for details on what to do in case of a failure of this test. 5588 // 5589 void RegexTest::Bug9283() { 5590 #if !UCONFIG_NO_NORMALIZATION 5591 UErrorCode status = U_ZERO_ERROR; 5592 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); 5593 REGEX_CHECK_STATUS; 5594 int32_t index; 5595 UChar32 c; 5596 for (index=0; ; index++) { 5597 c = supplementalsWithCaseFolding.charAt(index); 5598 if (c == -1) { 5599 break; 5600 } 5601 UnicodeString cf = UnicodeString(c).foldCase(); 5602 REGEX_ASSERT(cf.length() >= 2); 5603 } 5604 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 5605 } 5606 5607 5608 void RegexTest::CheckInvBufSize() { 5609 if(inv_next>=INV_BUFSIZ) { 5610 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5611 __FILE__, INV_BUFSIZ, inv_next); 5612 } else { 5613 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5614 } 5615 } 5616 5617 5618 void RegexTest::Bug10459() { 5619 UErrorCode status = U_ZERO_ERROR; 5620 UnicodeString patternString("(txt)"); 5621 UnicodeString txtString("txt"); 5622 5623 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); 5624 REGEX_CHECK_STATUS; 5625 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); 5626 REGEX_CHECK_STATUS; 5627 5628 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); 5629 REGEX_CHECK_STATUS; 5630 5631 uregex_setUText(icu_re, utext_txt, &status); 5632 REGEX_CHECK_STATUS; 5633 5634 // The bug was that calling uregex_group() before doing a matching operation 5635 // was causing a segfault. Only for Regular Expressions created from UText. 5636 // It should set an U_REGEX_INVALID_STATE. 5637 5638 UChar buf[100]; 5639 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); 5640 REGEX_ASSERT(status == U_REGEX_INVALID_STATE); 5641 REGEX_ASSERT(len == 0); 5642 5643 uregex_close(icu_re); 5644 utext_close(utext_pat); 5645 utext_close(utext_txt); 5646 } 5647 5648 void RegexTest::TestCaseInsensitiveStarters() { 5649 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't 5650 // become stale because of new Unicode characters. 5651 // If it is stale, rerun the generation tool 5652 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 5653 // and replace the embedded data in i18n/regexcmp.cpp 5654 5655 for (UChar32 cp=0; cp<=0x10ffff; cp++) { 5656 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { 5657 continue; 5658 } 5659 UnicodeSet s(cp, cp); 5660 s.closeOver(USET_CASE_INSENSITIVE); 5661 UnicodeSetIterator setIter(s); 5662 while (setIter.next()) { 5663 if (!setIter.isString()) { 5664 continue; 5665 } 5666 const UnicodeString &str = setIter.getString(); 5667 UChar32 firstChar = str.char32At(0); 5668 UnicodeSet starters; 5669 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); 5670 if (!starters.contains(cp)) { 5671 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); 5672 return; 5673 } 5674 } 5675 } 5676 } 5677 5678 5679 void RegexTest::TestBug11049() { 5680 // Original bug report: pattern with match start consisting of one of several individual characters, 5681 // and the text being matched ending with a supplementary character. find() would read past the 5682 // end of the input text when searching for potential match starting points. 5683 5684 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will 5685 // detect the bad read. 5686 5687 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); 5688 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); 5689 5690 // Test again with a pattern starting with a single character, 5691 // which takes a different code path than starting with an OR expression, 5692 // but with similar logic. 5693 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); 5694 TestCase11049("C", "string matches at end C", TRUE, __LINE__); 5695 } 5696 5697 // Run a single test case from TestBug11049(). Internal function. 5698 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { 5699 UErrorCode status = U_ZERO_ERROR; 5700 UnicodeString patternString = UnicodeString(pattern).unescape(); 5701 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5702 5703 UnicodeString dataString = UnicodeString(data).unescape(); 5704 UChar *exactBuffer = new UChar[dataString.length()]; 5705 dataString.extract(exactBuffer, dataString.length(), status); 5706 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); 5707 5708 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status)); 5709 REGEX_CHECK_STATUS; 5710 matcher->reset(ut); 5711 UBool result = matcher->find(); 5712 if (result != expectMatch) { 5713 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5714 __FILE__, lineNumber, expectMatch, result, pattern, data); 5715 } 5716 5717 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see 5718 // off-by-one on find() with match at the last code point. 5719 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 5720 // because string.unescape() will only shrink it. 5721 char * utf8Buffer = new char[uprv_strlen(data)+1]; 5722 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); 5723 REGEX_CHECK_STATUS; 5724 ut = utext_openUTF8(ut, utf8Buffer, -1, &status); 5725 REGEX_CHECK_STATUS; 5726 matcher->reset(ut); 5727 result = matcher->find(); 5728 if (result != expectMatch) { 5729 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5730 __FILE__, lineNumber, expectMatch, result, pattern, data); 5731 } 5732 delete [] utf8Buffer; 5733 5734 utext_close(ut); 5735 delete [] exactBuffer; 5736 } 5737 5738 5739 void RegexTest::TestBug11371() { 5740 if (quick) { 5741 logln("Skipping test. Runs in exhuastive mode only."); 5742 return; 5743 } 5744 UErrorCode status = U_ZERO_ERROR; 5745 UnicodeString patternString; 5746 5747 for (int i=0; i<8000000; i++) { 5748 patternString.append(UnicodeString("()")); 5749 } 5750 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5751 if (status != U_REGEX_PATTERN_TOO_BIG) { 5752 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5753 __FILE__, __LINE__, u_errorName(status)); 5754 } 5755 5756 status = U_ZERO_ERROR; 5757 patternString = "("; 5758 for (int i=0; i<20000000; i++) { 5759 patternString.append(UnicodeString("A++")); 5760 } 5761 patternString.append(UnicodeString("){0}B++")); 5762 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status)); 5763 if (status != U_REGEX_PATTERN_TOO_BIG) { 5764 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5765 __FILE__, __LINE__, u_errorName(status)); 5766 } 5767 5768 // Pattern with too much string data, such that string indexes overflow operand data field size 5769 // in compiled instruction. 5770 status = U_ZERO_ERROR; 5771 patternString = ""; 5772 while (patternString.length() < 0x00ffffff) { 5773 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); 5774 } 5775 patternString.append(UnicodeString("X? trailing string")); 5776 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status)); 5777 if (status != U_REGEX_PATTERN_TOO_BIG) { 5778 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5779 __FILE__, __LINE__, u_errorName(status)); 5780 } 5781 } 5782 5783 void RegexTest::TestBug11480() { 5784 // C API, get capture group of a group that does not participate in the match. 5785 // (Returns a zero length string, with nul termination, 5786 // indistinguishable from a group with a zero length match.) 5787 5788 UErrorCode status = U_ZERO_ERROR; 5789 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); 5790 REGEX_CHECK_STATUS; 5791 UnicodeString text = UNICODE_STRING_SIMPLE("A"); 5792 uregex_setText(re, text.getBuffer(), text.length(), &status); 5793 REGEX_CHECK_STATUS; 5794 REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); 5795 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; 5796 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); 5797 REGEX_ASSERT(length == 0); 5798 REGEX_ASSERT(buf[0] == 13); 5799 REGEX_ASSERT(buf[1] == 0); 5800 REGEX_ASSERT(buf[2] == 13); 5801 uregex_close(re); 5802 5803 // UText C++ API, length of match is 0 for non-participating matches. 5804 UText ut = UTEXT_INITIALIZER; 5805 utext_openUnicodeString(&ut, &text, &status); 5806 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); 5807 REGEX_CHECK_STATUS; 5808 matcher.reset(&ut); 5809 REGEX_ASSERT(matcher.lookingAt(0, status)); 5810 5811 // UText C++ API, Capture group 1 matches "A", position 0, length 1. 5812 int64_t groupLen = -666; 5813 UText group = UTEXT_INITIALIZER; 5814 matcher.group(1, &group, groupLen, status); 5815 REGEX_CHECK_STATUS; 5816 REGEX_ASSERT(groupLen == 1); 5817 REGEX_ASSERT(utext_getNativeIndex(&group) == 0); 5818 5819 // Capture group 2, the (B), does not participate in the match. 5820 matcher.group(2, &group, groupLen, status); 5821 REGEX_CHECK_STATUS; 5822 REGEX_ASSERT(groupLen == 0); 5823 REGEX_ASSERT(matcher.start(2, status) == -1); 5824 REGEX_CHECK_STATUS; 5825 } 5826 5827 5828 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5829