1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /******************************************************************** 4 * COPYRIGHT: 5 * Copyright (c) 2002-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ********************************************************************/ 8 9 // 10 // regextst.cpp 11 // 12 // ICU Regular Expressions test, part of intltest. 13 // 14 15 /* 16 NOTE!! 17 18 PLEASE be careful about ASCII assumptions in this test. 19 This test is one of the worst repeat offenders. 20 If you have questions, contact someone on the ICU PMC 21 who has access to an EBCDIC system. 22 23 */ 24 25 #include "intltest.h" 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 32 #include "unicode/localpointer.h" 33 #include "unicode/regex.h" 34 #include "unicode/uchar.h" 35 #include "unicode/ucnv.h" 36 #include "unicode/uniset.h" 37 #include "unicode/uregex.h" 38 #include "unicode/usetiter.h" 39 #include "unicode/ustring.h" 40 #include "unicode/utext.h" 41 42 #include "regextst.h" 43 #include "regexcmp.h" 44 #include "uvector.h" 45 #include "util.h" 46 #include "cmemory.h" 47 #include "cstring.h" 48 #include "uinvchar.h" 49 50 #define SUPPORT_MUTATING_INPUT_STRING 0 51 52 //--------------------------------------------------------------------------- 53 // 54 // Test class boilerplate 55 // 56 //--------------------------------------------------------------------------- 57 RegexTest::RegexTest() 58 { 59 } 60 61 62 RegexTest::~RegexTest() 63 { 64 } 65 66 67 68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 69 { 70 if (exec) logln("TestSuite RegexTest: "); 71 switch (index) { 72 73 case 0: name = "Basic"; 74 if (exec) Basic(); 75 break; 76 case 1: name = "API_Match"; 77 if (exec) API_Match(); 78 break; 79 case 2: name = "API_Replace"; 80 if (exec) API_Replace(); 81 break; 82 case 3: name = "API_Pattern"; 83 if (exec) API_Pattern(); 84 break; 85 case 4: 86 #if !UCONFIG_NO_FILE_IO 87 name = "Extended"; 88 if (exec) Extended(); 89 #else 90 name = "skip"; 91 #endif 92 break; 93 case 5: name = "Errors"; 94 if (exec) Errors(); 95 break; 96 case 6: name = "PerlTests"; 97 if (exec) PerlTests(); 98 break; 99 case 7: name = "Callbacks"; 100 if (exec) Callbacks(); 101 break; 102 case 8: name = "FindProgressCallbacks"; 103 if (exec) FindProgressCallbacks(); 104 break; 105 case 9: name = "Bug 6149"; 106 if (exec) Bug6149(); 107 break; 108 case 10: name = "UTextBasic"; 109 if (exec) UTextBasic(); 110 break; 111 case 11: name = "API_Match_UTF8"; 112 if (exec) API_Match_UTF8(); 113 break; 114 case 12: name = "API_Replace_UTF8"; 115 if (exec) API_Replace_UTF8(); 116 break; 117 case 13: name = "API_Pattern_UTF8"; 118 if (exec) API_Pattern_UTF8(); 119 break; 120 case 14: name = "PerlTestsUTF8"; 121 if (exec) PerlTestsUTF8(); 122 break; 123 case 15: name = "PreAllocatedUTextCAPI"; 124 if (exec) PreAllocatedUTextCAPI(); 125 break; 126 case 16: name = "Bug 7651"; 127 if (exec) Bug7651(); 128 break; 129 case 17: name = "Bug 7740"; 130 if (exec) Bug7740(); 131 break; 132 case 18: name = "Bug 8479"; 133 if (exec) Bug8479(); 134 break; 135 case 19: name = "Bug 7029"; 136 if (exec) Bug7029(); 137 break; 138 case 20: name = "CheckInvBufSize"; 139 if (exec) CheckInvBufSize(); 140 break; 141 case 21: name = "Bug 9283"; 142 if (exec) Bug9283(); 143 break; 144 case 22: name = "Bug10459"; 145 if (exec) Bug10459(); 146 break; 147 case 23: name = "TestCaseInsensitiveStarters"; 148 if (exec) TestCaseInsensitiveStarters(); 149 break; 150 case 24: name = "TestBug11049"; 151 if (exec) TestBug11049(); 152 break; 153 case 25: name = "TestBug11371"; 154 if (exec) TestBug11371(); 155 break; 156 case 26: name = "TestBug11480"; 157 if (exec) TestBug11480(); 158 break; 159 case 27: name = "NamedCapture"; 160 if (exec) NamedCapture(); 161 break; 162 case 28: name = "NamedCaptureLimits"; 163 if (exec) NamedCaptureLimits(); 164 break; 165 default: name = ""; 166 break; //needed to end loop 167 } 168 } 169 170 171 172 /** 173 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 174 * into ASCII. 175 * @see utext_openUTF8 176 */ 177 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 178 179 //--------------------------------------------------------------------------- 180 // 181 // Error Checking / Reporting macros used in all of the tests. 182 // 183 //--------------------------------------------------------------------------- 184 185 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 186 int64_t oldIndex = utext_getNativeIndex(text); 187 utext_setNativeIndex(text, 0); 188 char *bufPtr = buf; 189 UChar32 c = utext_next32From(text, 0); 190 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 191 if (0x000020<=c && c<0x00007e) { 192 *bufPtr = c; 193 } else { 194 #if 0 195 sprintf(bufPtr,"U+%04X", c); 196 bufPtr+= strlen(bufPtr)-1; 197 #else 198 *bufPtr = '%'; 199 #endif 200 } 201 bufPtr++; 202 c = UTEXT_NEXT32(text); 203 } 204 *bufPtr = 0; 205 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 206 char *ebuf = (char*)malloc(bufLen); 207 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 208 uprv_strncpy(buf, ebuf, bufLen); 209 free((void*)ebuf); 210 #endif 211 utext_setNativeIndex(text, oldIndex); 212 } 213 214 215 static char ASSERT_BUF[1024]; 216 217 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { 218 if(message.length()==0) { 219 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 220 } else { 221 UnicodeString buf; 222 IntlTest::prettify(message,buf); 223 if(buf.length()==0) { 224 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 225 } else { 226 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 227 if(ASSERT_BUF[0]==0) { 228 ASSERT_BUF[0]=0; 229 for(int32_t i=0;i<buf.length();i++) { 230 UChar ch = buf[i]; 231 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 232 } 233 } 234 } 235 } 236 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 237 return ASSERT_BUF; 238 } 239 240 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 241 242 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ 243 __FILE__, __LINE__, u_errorName(status)); return;}} 244 245 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 246 247 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 248 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 249 __LINE__, u_errorName(errcode), u_errorName(status));};} 250 251 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 252 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} 253 254 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 255 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 256 257 // expected: const char * , restricted to invariant characters. 258 // actual: const UnicodeString & 259 #define REGEX_ASSERT_UNISTR(expected, actual) { \ 260 if (UnicodeString(expected, -1, US_INV) != (actual)) { \ 261 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ 262 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} 263 264 265 static UBool testUTextEqual(UText *uta, UText *utb) { 266 UChar32 ca = 0; 267 UChar32 cb = 0; 268 utext_setNativeIndex(uta, 0); 269 utext_setNativeIndex(utb, 0); 270 do { 271 ca = utext_next32(uta); 272 cb = utext_next32(utb); 273 if (ca != cb) { 274 break; 275 } 276 } while (ca != U_SENTINEL); 277 return ca == cb; 278 } 279 280 281 /** 282 * @param expected expected text in UTF-8 (not platform) codepage 283 */ 284 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 285 UErrorCode status = U_ZERO_ERROR; 286 UText expectedText = UTEXT_INITIALIZER; 287 utext_openUTF8(&expectedText, expected, -1, &status); 288 if(U_FAILURE(status)) { 289 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 290 return; 291 } 292 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 293 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 294 return; 295 } 296 utext_setNativeIndex(actual, 0); 297 if (!testUTextEqual(&expectedText, actual)) { 298 char buf[201 /*21*/]; 299 char expectedBuf[201]; 300 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); 301 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); 302 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 303 } 304 utext_close(&expectedText); 305 } 306 /** 307 * @param expected invariant (platform local text) input 308 */ 309 310 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 311 UErrorCode status = U_ZERO_ERROR; 312 UText expectedText = UTEXT_INITIALIZER; 313 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 314 if(U_FAILURE(status)) { 315 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 316 return; 317 } 318 utext_setNativeIndex(actual, 0); 319 if (!testUTextEqual(&expectedText, actual)) { 320 char buf[201 /*21*/]; 321 char expectedBuf[201]; 322 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); 323 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); 324 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 325 } 326 utext_close(&expectedText); 327 } 328 329 /** 330 * Assumes utf-8 input 331 */ 332 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 333 /** 334 * Assumes Invariant input 335 */ 336 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 337 338 /** 339 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 340 * passed into utext_openUTF8. An error will be given if 341 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 342 */ 343 344 #define INV_BUFSIZ 2048 /* increase this if too small */ 345 346 static int64_t inv_next=0; 347 348 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY 349 static char inv_buf[INV_BUFSIZ]; 350 #endif 351 352 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 353 if(length==-1) length=strlen(inv); 354 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 355 inv_next+=length; 356 return utext_openUTF8(ut, inv, length, status); 357 #else 358 if(inv_next+length+1>INV_BUFSIZ) { 359 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 360 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 361 *status = U_MEMORY_ALLOCATION_ERROR; 362 return NULL; 363 } 364 365 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 366 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 367 inv_next+=length; 368 369 #if 0 370 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 371 #endif 372 373 return utext_openUTF8(ut, (const char*)buf, length, status); 374 #endif 375 } 376 377 378 //--------------------------------------------------------------------------- 379 // 380 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 381 // for the LookingAt() and Match() functions. 382 // 383 // usage: 384 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 385 // 386 // The expected results are UBool - TRUE or FALSE. 387 // The input text is unescaped. The pattern is not. 388 // 389 // 390 //--------------------------------------------------------------------------- 391 392 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 393 394 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 395 const UnicodeString pattern(pat, -1, US_INV); 396 const UnicodeString inputText(text, -1, US_INV); 397 UErrorCode status = U_ZERO_ERROR; 398 UParseError pe; 399 RegexPattern *REPattern = NULL; 400 RegexMatcher *REMatcher = NULL; 401 UBool retVal = TRUE; 402 403 UnicodeString patString(pat, -1, US_INV); 404 REPattern = RegexPattern::compile(patString, 0, pe, status); 405 if (U_FAILURE(status)) { 406 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 407 line, u_errorName(status)); 408 return FALSE; 409 } 410 if (line==376) { REPattern->dumpPattern();} 411 412 UnicodeString inputString(inputText); 413 UnicodeString unEscapedInput = inputString.unescape(); 414 REMatcher = REPattern->matcher(unEscapedInput, status); 415 if (U_FAILURE(status)) { 416 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 417 line, u_errorName(status)); 418 return FALSE; 419 } 420 421 UBool actualmatch; 422 actualmatch = REMatcher->lookingAt(status); 423 if (U_FAILURE(status)) { 424 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 425 line, u_errorName(status)); 426 retVal = FALSE; 427 } 428 if (actualmatch != looking) { 429 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 430 retVal = FALSE; 431 } 432 433 status = U_ZERO_ERROR; 434 actualmatch = REMatcher->matches(status); 435 if (U_FAILURE(status)) { 436 errln("RegexTest failure in matches() at line %d. Status = %s\n", 437 line, u_errorName(status)); 438 retVal = FALSE; 439 } 440 if (actualmatch != match) { 441 errln("RegexTest: wrong return from matches() at line %d.\n", line); 442 retVal = FALSE; 443 } 444 445 if (retVal == FALSE) { 446 REPattern->dumpPattern(); 447 } 448 449 delete REPattern; 450 delete REMatcher; 451 return retVal; 452 } 453 454 455 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 456 UText pattern = UTEXT_INITIALIZER; 457 int32_t inputUTF8Length; 458 char *textChars = NULL; 459 UText inputText = UTEXT_INITIALIZER; 460 UErrorCode status = U_ZERO_ERROR; 461 UParseError pe; 462 RegexPattern *REPattern = NULL; 463 RegexMatcher *REMatcher = NULL; 464 UBool retVal = TRUE; 465 466 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 467 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 468 if (U_FAILURE(status)) { 469 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 470 line, u_errorName(status)); 471 return FALSE; 472 } 473 474 UnicodeString inputString(text, -1, US_INV); 475 UnicodeString unEscapedInput = inputString.unescape(); 476 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 477 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 478 479 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 480 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 481 // UTF-8 does not allow unpaired surrogates, so this could actually happen 482 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status)); 483 return TRUE; // not a failure of the Regex engine 484 } 485 status = U_ZERO_ERROR; // buffer overflow 486 textChars = new char[inputUTF8Length+1]; 487 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 488 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 489 490 REMatcher = &REPattern->matcher(status)->reset(&inputText); 491 if (U_FAILURE(status)) { 492 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 493 line, u_errorName(status)); 494 return FALSE; 495 } 496 497 UBool actualmatch; 498 actualmatch = REMatcher->lookingAt(status); 499 if (U_FAILURE(status)) { 500 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 501 line, u_errorName(status)); 502 retVal = FALSE; 503 } 504 if (actualmatch != looking) { 505 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 506 retVal = FALSE; 507 } 508 509 status = U_ZERO_ERROR; 510 actualmatch = REMatcher->matches(status); 511 if (U_FAILURE(status)) { 512 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n", 513 line, u_errorName(status)); 514 retVal = FALSE; 515 } 516 if (actualmatch != match) { 517 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 518 retVal = FALSE; 519 } 520 521 if (retVal == FALSE) { 522 REPattern->dumpPattern(); 523 } 524 525 delete REPattern; 526 delete REMatcher; 527 utext_close(&inputText); 528 utext_close(&pattern); 529 delete[] textChars; 530 return retVal; 531 } 532 533 534 535 //--------------------------------------------------------------------------- 536 // 537 // REGEX_ERR Macro + invocation function to simplify writing tests 538 // regex tests for incorrect patterns 539 // 540 // usage: 541 // REGEX_ERR("pattern", expected error line, column, expected status); 542 // 543 //--------------------------------------------------------------------------- 544 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 545 546 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 547 UErrorCode expectedStatus, int32_t line) { 548 UnicodeString pattern(pat); 549 550 UErrorCode status = U_ZERO_ERROR; 551 UParseError pe; 552 RegexPattern *callerPattern = NULL; 553 554 // 555 // Compile the caller's pattern 556 // 557 UnicodeString patString(pat); 558 callerPattern = RegexPattern::compile(patString, 0, pe, status); 559 if (status != expectedStatus) { 560 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 561 } else { 562 if (status != U_ZERO_ERROR) { 563 if (pe.line != errLine || pe.offset != errCol) { 564 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 565 line, errLine, errCol, pe.line, pe.offset); 566 } 567 } 568 } 569 570 delete callerPattern; 571 572 // 573 // Compile again, using a UTF-8-based UText 574 // 575 UText patternText = UTEXT_INITIALIZER; 576 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 577 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 578 if (status != expectedStatus) { 579 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 580 } else { 581 if (status != U_ZERO_ERROR) { 582 if (pe.line != errLine || pe.offset != errCol) { 583 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 584 line, errLine, errCol, pe.line, pe.offset); 585 } 586 } 587 } 588 589 delete callerPattern; 590 utext_close(&patternText); 591 } 592 593 594 595 //--------------------------------------------------------------------------- 596 // 597 // Basic Check for basic functionality of regex pattern matching. 598 // Avoid the use of REGEX_FIND test macro, which has 599 // substantial dependencies on basic Regex functionality. 600 // 601 //--------------------------------------------------------------------------- 602 void RegexTest::Basic() { 603 604 605 // 606 // Debug - slide failing test cases early 607 // 608 #if 0 609 { 610 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE); 611 UParseError pe; 612 UErrorCode status = U_ZERO_ERROR; 613 RegexPattern *pattern; 614 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status); 615 pattern->dumpPattern(); 616 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status); 617 UBool result = m->find(); 618 printf("result = %d\n", result); 619 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd"); 620 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); 621 } 622 exit(1); 623 #endif 624 625 626 // 627 // Pattern with parentheses 628 // 629 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE); 630 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE); 631 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE); 632 633 // 634 // Patterns with * 635 // 636 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE); 637 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE); 638 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE); 639 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE); 640 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE); 641 642 REGEX_TESTLM("a*", "", TRUE, TRUE); 643 REGEX_TESTLM("a*", "b", TRUE, FALSE); 644 645 646 // 647 // Patterns with "." 648 // 649 REGEX_TESTLM(".", "abc", TRUE, FALSE); 650 REGEX_TESTLM("...", "abc", TRUE, TRUE); 651 REGEX_TESTLM("....", "abc", FALSE, FALSE); 652 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE); 653 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE); 654 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE); 655 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE); 656 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE); 657 658 // 659 // Patterns with * applied to chars at end of literal string 660 // 661 REGEX_TESTLM("abc*", "ab", TRUE, TRUE); 662 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE); 663 664 // 665 // Supplemental chars match as single chars, not a pair of surrogates. 666 // 667 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE); 668 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE); 669 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE); 670 671 672 // 673 // UnicodeSets in the pattern 674 // 675 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE); 676 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE); 677 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE); 678 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 679 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); 680 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE); 681 682 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE); 683 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE); 684 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE); 685 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. 686 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); 687 688 // 689 // OR operator in patterns 690 // 691 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE); 692 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE); 693 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE); 694 REGEX_TESTLM("a|b", "b", TRUE, TRUE); 695 696 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE); 697 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE); 698 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE); 699 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE); 700 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE); 701 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE); 702 703 // 704 // + 705 // 706 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE); 707 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE); 708 REGEX_TESTLM("b+", "", FALSE, FALSE); 709 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE); 710 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE); 711 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE); 712 713 // 714 // ? 715 // 716 REGEX_TESTLM("ab?", "ab", TRUE, TRUE); 717 REGEX_TESTLM("ab?", "a", TRUE, TRUE); 718 REGEX_TESTLM("ab?", "ac", TRUE, FALSE); 719 REGEX_TESTLM("ab?", "abb", TRUE, FALSE); 720 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE); 721 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE); 722 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE); 723 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); 724 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); 725 726 // 727 // Escape sequences that become single literal chars, handled internally 728 // by ICU's Unescape. 729 // 730 731 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. 732 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL 733 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L 734 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape 735 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed 736 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line 737 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR 738 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab 739 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); 740 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); 741 742 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input 743 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input 744 745 // Escape of special chars in patterns 746 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); 747 } 748 749 750 //--------------------------------------------------------------------------- 751 // 752 // UTextBasic Check for quirks that are specific to the UText 753 // implementation. 754 // 755 //--------------------------------------------------------------------------- 756 void RegexTest::UTextBasic() { 757 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 758 UErrorCode status = U_ZERO_ERROR; 759 UText pattern = UTEXT_INITIALIZER; 760 utext_openUTF8(&pattern, str_abc, -1, &status); 761 RegexMatcher matcher(&pattern, 0, status); 762 REGEX_CHECK_STATUS; 763 764 UText input = UTEXT_INITIALIZER; 765 utext_openUTF8(&input, str_abc, -1, &status); 766 REGEX_CHECK_STATUS; 767 matcher.reset(&input); 768 REGEX_CHECK_STATUS; 769 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 770 771 matcher.reset(matcher.inputText()); 772 REGEX_CHECK_STATUS; 773 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 774 775 utext_close(&pattern); 776 utext_close(&input); 777 } 778 779 780 //--------------------------------------------------------------------------- 781 // 782 // API_Match Test that the API for class RegexMatcher 783 // is present and nominally working, but excluding functions 784 // implementing replace operations. 785 // 786 //--------------------------------------------------------------------------- 787 void RegexTest::API_Match() { 788 UParseError pe; 789 UErrorCode status=U_ZERO_ERROR; 790 int32_t flags = 0; 791 792 // 793 // Debug - slide failing test cases early 794 // 795 #if 0 796 { 797 } 798 return; 799 #endif 800 801 // 802 // Simple pattern compilation 803 // 804 { 805 UnicodeString re("abc"); 806 RegexPattern *pat2; 807 pat2 = RegexPattern::compile(re, flags, pe, status); 808 REGEX_CHECK_STATUS; 809 810 UnicodeString inStr1 = "abcdef this is a test"; 811 UnicodeString instr2 = "not abc"; 812 UnicodeString empty = ""; 813 814 815 // 816 // Matcher creation and reset. 817 // 818 RegexMatcher *m1 = pat2->matcher(inStr1, status); 819 REGEX_CHECK_STATUS; 820 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 821 REGEX_ASSERT(m1->input() == inStr1); 822 m1->reset(instr2); 823 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 824 REGEX_ASSERT(m1->input() == instr2); 825 m1->reset(inStr1); 826 REGEX_ASSERT(m1->input() == inStr1); 827 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 828 m1->reset(empty); 829 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 830 REGEX_ASSERT(m1->input() == empty); 831 REGEX_ASSERT(&m1->pattern() == pat2); 832 833 // 834 // reset(pos, status) 835 // 836 m1->reset(inStr1); 837 m1->reset(4, status); 838 REGEX_CHECK_STATUS; 839 REGEX_ASSERT(m1->input() == inStr1); 840 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 841 842 m1->reset(-1, status); 843 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 844 status = U_ZERO_ERROR; 845 846 m1->reset(0, status); 847 REGEX_CHECK_STATUS; 848 status = U_ZERO_ERROR; 849 850 int32_t len = m1->input().length(); 851 m1->reset(len-1, status); 852 REGEX_CHECK_STATUS; 853 status = U_ZERO_ERROR; 854 855 m1->reset(len, status); 856 REGEX_CHECK_STATUS; 857 status = U_ZERO_ERROR; 858 859 m1->reset(len+1, status); 860 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 861 status = U_ZERO_ERROR; 862 863 // 864 // match(pos, status) 865 // 866 m1->reset(instr2); 867 REGEX_ASSERT(m1->matches(4, status) == TRUE); 868 m1->reset(); 869 REGEX_ASSERT(m1->matches(3, status) == FALSE); 870 m1->reset(); 871 REGEX_ASSERT(m1->matches(5, status) == FALSE); 872 REGEX_ASSERT(m1->matches(4, status) == TRUE); 873 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 874 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 875 876 // Match() at end of string should fail, but should not 877 // be an error. 878 status = U_ZERO_ERROR; 879 len = m1->input().length(); 880 REGEX_ASSERT(m1->matches(len, status) == FALSE); 881 REGEX_CHECK_STATUS; 882 883 // Match beyond end of string should fail with an error. 884 status = U_ZERO_ERROR; 885 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 887 888 // Successful match at end of string. 889 { 890 status = U_ZERO_ERROR; 891 RegexMatcher m("A?", 0, status); // will match zero length string. 892 REGEX_CHECK_STATUS; 893 m.reset(inStr1); 894 len = inStr1.length(); 895 REGEX_ASSERT(m.matches(len, status) == TRUE); 896 REGEX_CHECK_STATUS; 897 m.reset(empty); 898 REGEX_ASSERT(m.matches(0, status) == TRUE); 899 REGEX_CHECK_STATUS; 900 } 901 902 903 // 904 // lookingAt(pos, status) 905 // 906 status = U_ZERO_ERROR; 907 m1->reset(instr2); // "not abc" 908 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 909 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 910 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 911 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 912 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 913 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 914 status = U_ZERO_ERROR; 915 len = m1->input().length(); 916 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 917 REGEX_CHECK_STATUS; 918 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 920 921 delete m1; 922 delete pat2; 923 } 924 925 926 // 927 // Capture Group. 928 // RegexMatcher::start(); 929 // RegexMatcher::end(); 930 // RegexMatcher::groupCount(); 931 // 932 { 933 int32_t flags=0; 934 UParseError pe; 935 UErrorCode status=U_ZERO_ERROR; 936 937 UnicodeString re("01(23(45)67)(.*)"); 938 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 939 REGEX_CHECK_STATUS; 940 UnicodeString data = "0123456789"; 941 942 RegexMatcher *matcher = pat->matcher(data, status); 943 REGEX_CHECK_STATUS; 944 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 945 static const int32_t matchStarts[] = {0, 2, 4, 8}; 946 static const int32_t matchEnds[] = {10, 8, 6, 10}; 947 int32_t i; 948 for (i=0; i<4; i++) { 949 int32_t actualStart = matcher->start(i, status); 950 REGEX_CHECK_STATUS; 951 if (actualStart != matchStarts[i]) { 952 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 953 __LINE__, i, matchStarts[i], actualStart); 954 } 955 int32_t actualEnd = matcher->end(i, status); 956 REGEX_CHECK_STATUS; 957 if (actualEnd != matchEnds[i]) { 958 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n", 959 __LINE__, i, matchEnds[i], actualEnd); 960 } 961 } 962 963 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 964 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 965 966 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 967 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 968 matcher->reset(); 969 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 970 971 matcher->lookingAt(status); 972 REGEX_ASSERT(matcher->group(status) == "0123456789"); 973 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 974 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 975 REGEX_ASSERT(matcher->group(2, status) == "45" ); 976 REGEX_ASSERT(matcher->group(3, status) == "89" ); 977 REGEX_CHECK_STATUS; 978 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 979 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 980 matcher->reset(); 981 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 982 983 delete matcher; 984 delete pat; 985 986 } 987 988 // 989 // find 990 // 991 { 992 int32_t flags=0; 993 UParseError pe; 994 UErrorCode status=U_ZERO_ERROR; 995 996 UnicodeString re("abc"); 997 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 998 REGEX_CHECK_STATUS; 999 UnicodeString data = ".abc..abc...abc.."; 1000 // 012345678901234567 1001 1002 RegexMatcher *matcher = pat->matcher(data, status); 1003 REGEX_CHECK_STATUS; 1004 REGEX_ASSERT(matcher->find()); 1005 REGEX_ASSERT(matcher->start(status) == 1); 1006 REGEX_ASSERT(matcher->find()); 1007 REGEX_ASSERT(matcher->start(status) == 6); 1008 REGEX_ASSERT(matcher->find()); 1009 REGEX_ASSERT(matcher->start(status) == 12); 1010 REGEX_ASSERT(matcher->find() == FALSE); 1011 REGEX_ASSERT(matcher->find() == FALSE); 1012 1013 matcher->reset(); 1014 REGEX_ASSERT(matcher->find()); 1015 REGEX_ASSERT(matcher->start(status) == 1); 1016 1017 REGEX_ASSERT(matcher->find(0, status)); 1018 REGEX_ASSERT(matcher->start(status) == 1); 1019 REGEX_ASSERT(matcher->find(1, status)); 1020 REGEX_ASSERT(matcher->start(status) == 1); 1021 REGEX_ASSERT(matcher->find(2, status)); 1022 REGEX_ASSERT(matcher->start(status) == 6); 1023 REGEX_ASSERT(matcher->find(12, status)); 1024 REGEX_ASSERT(matcher->start(status) == 12); 1025 REGEX_ASSERT(matcher->find(13, status) == FALSE); 1026 REGEX_ASSERT(matcher->find(16, status) == FALSE); 1027 REGEX_ASSERT(matcher->find(17, status) == FALSE); 1028 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1029 1030 status = U_ZERO_ERROR; 1031 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1032 status = U_ZERO_ERROR; 1033 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 1034 1035 REGEX_ASSERT(matcher->groupCount() == 0); 1036 1037 delete matcher; 1038 delete pat; 1039 } 1040 1041 1042 // 1043 // find, with \G in pattern (true if at the end of a previous match). 1044 // 1045 { 1046 int32_t flags=0; 1047 UParseError pe; 1048 UErrorCode status=U_ZERO_ERROR; 1049 1050 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 1051 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1052 REGEX_CHECK_STATUS; 1053 UnicodeString data = ".abcabc.abc.."; 1054 // 012345678901234567 1055 1056 RegexMatcher *matcher = pat->matcher(data, status); 1057 REGEX_CHECK_STATUS; 1058 REGEX_ASSERT(matcher->find()); 1059 REGEX_ASSERT(matcher->start(status) == 0); 1060 REGEX_ASSERT(matcher->start(1, status) == -1); 1061 REGEX_ASSERT(matcher->start(2, status) == 1); 1062 1063 REGEX_ASSERT(matcher->find()); 1064 REGEX_ASSERT(matcher->start(status) == 4); 1065 REGEX_ASSERT(matcher->start(1, status) == 4); 1066 REGEX_ASSERT(matcher->start(2, status) == -1); 1067 REGEX_CHECK_STATUS; 1068 1069 delete matcher; 1070 delete pat; 1071 } 1072 1073 // 1074 // find with zero length matches, match position should bump ahead 1075 // to prevent loops. 1076 // 1077 { 1078 int32_t i; 1079 UErrorCode status=U_ZERO_ERROR; 1080 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1081 // using an always-true look-ahead. 1082 REGEX_CHECK_STATUS; 1083 UnicodeString s(" "); 1084 m.reset(s); 1085 for (i=0; ; i++) { 1086 if (m.find() == FALSE) { 1087 break; 1088 } 1089 REGEX_ASSERT(m.start(status) == i); 1090 REGEX_ASSERT(m.end(status) == i); 1091 } 1092 REGEX_ASSERT(i==5); 1093 1094 // Check that the bump goes over surrogate pairs OK 1095 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1096 s = s.unescape(); 1097 m.reset(s); 1098 for (i=0; ; i+=2) { 1099 if (m.find() == FALSE) { 1100 break; 1101 } 1102 REGEX_ASSERT(m.start(status) == i); 1103 REGEX_ASSERT(m.end(status) == i); 1104 } 1105 REGEX_ASSERT(i==10); 1106 } 1107 { 1108 // find() loop breaking test. 1109 // with pattern of /.?/, should see a series of one char matches, then a single 1110 // match of zero length at the end of the input string. 1111 int32_t i; 1112 UErrorCode status=U_ZERO_ERROR; 1113 RegexMatcher m(".?", 0, status); 1114 REGEX_CHECK_STATUS; 1115 UnicodeString s(" "); 1116 m.reset(s); 1117 for (i=0; ; i++) { 1118 if (m.find() == FALSE) { 1119 break; 1120 } 1121 REGEX_ASSERT(m.start(status) == i); 1122 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1123 } 1124 REGEX_ASSERT(i==5); 1125 } 1126 1127 1128 // 1129 // Matchers with no input string behave as if they had an empty input string. 1130 // 1131 1132 { 1133 UErrorCode status = U_ZERO_ERROR; 1134 RegexMatcher m(".?", 0, status); 1135 REGEX_CHECK_STATUS; 1136 REGEX_ASSERT(m.find()); 1137 REGEX_ASSERT(m.start(status) == 0); 1138 REGEX_ASSERT(m.input() == ""); 1139 } 1140 { 1141 UErrorCode status = U_ZERO_ERROR; 1142 RegexPattern *p = RegexPattern::compile(".", 0, status); 1143 RegexMatcher *m = p->matcher(status); 1144 REGEX_CHECK_STATUS; 1145 1146 REGEX_ASSERT(m->find() == FALSE); 1147 REGEX_ASSERT(m->input() == ""); 1148 delete m; 1149 delete p; 1150 } 1151 1152 // 1153 // Regions 1154 // 1155 { 1156 UErrorCode status = U_ZERO_ERROR; 1157 UnicodeString testString("This is test data"); 1158 RegexMatcher m(".*", testString, 0, status); 1159 REGEX_CHECK_STATUS; 1160 REGEX_ASSERT(m.regionStart() == 0); 1161 REGEX_ASSERT(m.regionEnd() == testString.length()); 1162 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1163 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1164 1165 m.region(2,4, status); 1166 REGEX_CHECK_STATUS; 1167 REGEX_ASSERT(m.matches(status)); 1168 REGEX_ASSERT(m.start(status)==2); 1169 REGEX_ASSERT(m.end(status)==4); 1170 REGEX_CHECK_STATUS; 1171 1172 m.reset(); 1173 REGEX_ASSERT(m.regionStart() == 0); 1174 REGEX_ASSERT(m.regionEnd() == testString.length()); 1175 1176 UnicodeString shorterString("short"); 1177 m.reset(shorterString); 1178 REGEX_ASSERT(m.regionStart() == 0); 1179 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1180 1181 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1182 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1184 REGEX_ASSERT(&m == &m.reset()); 1185 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1186 1187 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1189 REGEX_ASSERT(&m == &m.reset()); 1190 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1191 1192 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1193 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1195 REGEX_ASSERT(&m == &m.reset()); 1196 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1197 1198 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1200 REGEX_ASSERT(&m == &m.reset()); 1201 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1202 1203 } 1204 1205 // 1206 // hitEnd() and requireEnd() 1207 // 1208 { 1209 UErrorCode status = U_ZERO_ERROR; 1210 UnicodeString testString("aabb"); 1211 RegexMatcher m1(".*", testString, 0, status); 1212 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1213 REGEX_ASSERT(m1.hitEnd() == TRUE); 1214 REGEX_ASSERT(m1.requireEnd() == FALSE); 1215 REGEX_CHECK_STATUS; 1216 1217 status = U_ZERO_ERROR; 1218 RegexMatcher m2("a*", testString, 0, status); 1219 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1220 REGEX_ASSERT(m2.hitEnd() == FALSE); 1221 REGEX_ASSERT(m2.requireEnd() == FALSE); 1222 REGEX_CHECK_STATUS; 1223 1224 status = U_ZERO_ERROR; 1225 RegexMatcher m3(".*$", testString, 0, status); 1226 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1227 REGEX_ASSERT(m3.hitEnd() == TRUE); 1228 REGEX_ASSERT(m3.requireEnd() == TRUE); 1229 REGEX_CHECK_STATUS; 1230 } 1231 1232 1233 // 1234 // Compilation error on reset with UChar * 1235 // These were a hazard that people were stumbling over with runtime errors. 1236 // Changed them to compiler errors by adding private methods that more closely 1237 // matched the incorrect use of the functions. 1238 // 1239 #if 0 1240 { 1241 UErrorCode status = U_ZERO_ERROR; 1242 UChar ucharString[20]; 1243 RegexMatcher m(".", 0, status); 1244 m.reset(ucharString); // should not compile. 1245 1246 RegexPattern *p = RegexPattern::compile(".", 0, status); 1247 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1248 1249 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1250 } 1251 #endif 1252 1253 // 1254 // Time Outs. 1255 // Note: These tests will need to be changed when the regexp engine is 1256 // able to detect and cut short the exponential time behavior on 1257 // this type of match. 1258 // 1259 { 1260 UErrorCode status = U_ZERO_ERROR; 1261 // Enough 'a's in the string to cause the match to time out. 1262 // (Each on additonal 'a' doubles the time) 1263 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1264 RegexMatcher matcher("(a+)+b", testString, 0, status); 1265 REGEX_CHECK_STATUS; 1266 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1267 matcher.setTimeLimit(100, status); 1268 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1269 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1270 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1271 } 1272 { 1273 UErrorCode status = U_ZERO_ERROR; 1274 // Few enough 'a's to slip in under the time limit. 1275 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1276 RegexMatcher matcher("(a+)+b", testString, 0, status); 1277 REGEX_CHECK_STATUS; 1278 matcher.setTimeLimit(100, status); 1279 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1280 REGEX_CHECK_STATUS; 1281 } 1282 1283 // 1284 // Stack Limits 1285 // 1286 { 1287 UErrorCode status = U_ZERO_ERROR; 1288 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1289 1290 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1291 // of the '+', and makes the stack frames larger. 1292 RegexMatcher matcher("(A)+A$", testString, 0, status); 1293 1294 // With the default stack, this match should fail to run 1295 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1296 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1297 1298 // With unlimited stack, it should run 1299 status = U_ZERO_ERROR; 1300 matcher.setStackLimit(0, status); 1301 REGEX_CHECK_STATUS; 1302 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1303 REGEX_CHECK_STATUS; 1304 REGEX_ASSERT(matcher.getStackLimit() == 0); 1305 1306 // With a limited stack, it the match should fail 1307 status = U_ZERO_ERROR; 1308 matcher.setStackLimit(10000, status); 1309 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1310 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1311 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1312 } 1313 1314 // A pattern that doesn't save state should work with 1315 // a minimal sized stack 1316 { 1317 UErrorCode status = U_ZERO_ERROR; 1318 UnicodeString testString = "abc"; 1319 RegexMatcher matcher("abc", testString, 0, status); 1320 REGEX_CHECK_STATUS; 1321 matcher.setStackLimit(30, status); 1322 REGEX_CHECK_STATUS; 1323 REGEX_ASSERT(matcher.matches(status) == TRUE); 1324 REGEX_CHECK_STATUS; 1325 REGEX_ASSERT(matcher.getStackLimit() == 30); 1326 1327 // Negative stack sizes should fail 1328 status = U_ZERO_ERROR; 1329 matcher.setStackLimit(1000, status); 1330 REGEX_CHECK_STATUS; 1331 matcher.setStackLimit(-1, status); 1332 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1333 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1334 } 1335 1336 1337 } 1338 1339 1340 1341 1342 1343 1344 //--------------------------------------------------------------------------- 1345 // 1346 // API_Replace API test for class RegexMatcher, testing the 1347 // Replace family of functions. 1348 // 1349 //--------------------------------------------------------------------------- 1350 void RegexTest::API_Replace() { 1351 // 1352 // Replace 1353 // 1354 int32_t flags=0; 1355 UParseError pe; 1356 UErrorCode status=U_ZERO_ERROR; 1357 1358 UnicodeString re("abc"); 1359 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1360 REGEX_CHECK_STATUS; 1361 UnicodeString data = ".abc..abc...abc.."; 1362 // 012345678901234567 1363 RegexMatcher *matcher = pat->matcher(data, status); 1364 1365 // 1366 // Plain vanilla matches. 1367 // 1368 UnicodeString dest; 1369 dest = matcher->replaceFirst("yz", status); 1370 REGEX_CHECK_STATUS; 1371 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1372 1373 dest = matcher->replaceAll("yz", status); 1374 REGEX_CHECK_STATUS; 1375 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1376 1377 // 1378 // Plain vanilla non-matches. 1379 // 1380 UnicodeString d2 = ".abx..abx...abx.."; 1381 matcher->reset(d2); 1382 dest = matcher->replaceFirst("yz", status); 1383 REGEX_CHECK_STATUS; 1384 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1385 1386 dest = matcher->replaceAll("yz", status); 1387 REGEX_CHECK_STATUS; 1388 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1389 1390 // 1391 // Empty source string 1392 // 1393 UnicodeString d3 = ""; 1394 matcher->reset(d3); 1395 dest = matcher->replaceFirst("yz", status); 1396 REGEX_CHECK_STATUS; 1397 REGEX_ASSERT(dest == ""); 1398 1399 dest = matcher->replaceAll("yz", status); 1400 REGEX_CHECK_STATUS; 1401 REGEX_ASSERT(dest == ""); 1402 1403 // 1404 // Empty substitution string 1405 // 1406 matcher->reset(data); // ".abc..abc...abc.." 1407 dest = matcher->replaceFirst("", status); 1408 REGEX_CHECK_STATUS; 1409 REGEX_ASSERT(dest == "...abc...abc.."); 1410 1411 dest = matcher->replaceAll("", status); 1412 REGEX_CHECK_STATUS; 1413 REGEX_ASSERT(dest == "........"); 1414 1415 // 1416 // match whole string 1417 // 1418 UnicodeString d4 = "abc"; 1419 matcher->reset(d4); 1420 dest = matcher->replaceFirst("xyz", status); 1421 REGEX_CHECK_STATUS; 1422 REGEX_ASSERT(dest == "xyz"); 1423 1424 dest = matcher->replaceAll("xyz", status); 1425 REGEX_CHECK_STATUS; 1426 REGEX_ASSERT(dest == "xyz"); 1427 1428 // 1429 // Capture Group, simple case 1430 // 1431 UnicodeString re2("a(..)"); 1432 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1433 REGEX_CHECK_STATUS; 1434 UnicodeString d5 = "abcdefg"; 1435 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1436 REGEX_CHECK_STATUS; 1437 dest = matcher2->replaceFirst("$1$1", status); 1438 REGEX_CHECK_STATUS; 1439 REGEX_ASSERT(dest == "bcbcdefg"); 1440 1441 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1442 REGEX_CHECK_STATUS; 1443 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1444 1445 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1446 REGEX_ASSERT(U_FAILURE(status)); 1447 status = U_ZERO_ERROR; 1448 1449 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1450 replacement = replacement.unescape(); 1451 dest = matcher2->replaceFirst(replacement, status); 1452 REGEX_CHECK_STATUS; 1453 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1454 1455 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1456 1457 1458 // 1459 // Replacement String with \u hex escapes 1460 // 1461 { 1462 UnicodeString src = "abc 1 abc 2 abc 3"; 1463 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1464 matcher->reset(src); 1465 UnicodeString result = matcher->replaceAll(substitute, status); 1466 REGEX_CHECK_STATUS; 1467 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1468 } 1469 { 1470 UnicodeString src = "abc !"; 1471 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1472 matcher->reset(src); 1473 UnicodeString result = matcher->replaceAll(substitute, status); 1474 REGEX_CHECK_STATUS; 1475 UnicodeString expected = UnicodeString("--"); 1476 expected.append((UChar32)0x10000); 1477 expected.append("-- !"); 1478 REGEX_ASSERT(result == expected); 1479 } 1480 // TODO: need more through testing of capture substitutions. 1481 1482 // Bug 4057 1483 // 1484 { 1485 status = U_ZERO_ERROR; 1486 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1487 RegexMatcher m("ss(.*?)ee", 0, status); 1488 REGEX_CHECK_STATUS; 1489 UnicodeString result; 1490 1491 // Multiple finds do NOT bump up the previous appendReplacement postion. 1492 m.reset(s); 1493 m.find(); 1494 m.find(); 1495 m.appendReplacement(result, "ooh", status); 1496 REGEX_CHECK_STATUS; 1497 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1498 1499 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1500 status = U_ZERO_ERROR; 1501 result.truncate(0); 1502 m.reset(10, status); 1503 m.find(); 1504 m.find(); 1505 m.appendReplacement(result, "ooh", status); 1506 REGEX_CHECK_STATUS; 1507 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1508 1509 // find() at interior of string, appendReplacemnt still starts at beginning. 1510 status = U_ZERO_ERROR; 1511 result.truncate(0); 1512 m.reset(); 1513 m.find(10, status); 1514 m.find(); 1515 m.appendReplacement(result, "ooh", status); 1516 REGEX_CHECK_STATUS; 1517 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1518 1519 m.appendTail(result); 1520 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1521 1522 } 1523 1524 delete matcher2; 1525 delete pat2; 1526 delete matcher; 1527 delete pat; 1528 } 1529 1530 1531 //--------------------------------------------------------------------------- 1532 // 1533 // API_Pattern Test that the API for class RegexPattern is 1534 // present and nominally working. 1535 // 1536 //--------------------------------------------------------------------------- 1537 void RegexTest::API_Pattern() { 1538 RegexPattern pata; // Test default constructor to not crash. 1539 RegexPattern patb; 1540 1541 REGEX_ASSERT(pata == patb); 1542 REGEX_ASSERT(pata == pata); 1543 1544 UnicodeString re1("abc[a-l][m-z]"); 1545 UnicodeString re2("def"); 1546 UErrorCode status = U_ZERO_ERROR; 1547 UParseError pe; 1548 1549 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1550 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1551 REGEX_CHECK_STATUS; 1552 REGEX_ASSERT(*pat1 == *pat1); 1553 REGEX_ASSERT(*pat1 != pata); 1554 1555 // Assign 1556 patb = *pat1; 1557 REGEX_ASSERT(patb == *pat1); 1558 1559 // Copy Construct 1560 RegexPattern patc(*pat1); 1561 REGEX_ASSERT(patc == *pat1); 1562 REGEX_ASSERT(patb == patc); 1563 REGEX_ASSERT(pat1 != pat2); 1564 patb = *pat2; 1565 REGEX_ASSERT(patb != patc); 1566 REGEX_ASSERT(patb == *pat2); 1567 1568 // Compile with no flags. 1569 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1570 REGEX_ASSERT(*pat1a == *pat1); 1571 1572 REGEX_ASSERT(pat1a->flags() == 0); 1573 1574 // Compile with different flags should be not equal 1575 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1576 REGEX_CHECK_STATUS; 1577 1578 REGEX_ASSERT(*pat1b != *pat1a); 1579 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1580 REGEX_ASSERT(pat1a->flags() == 0); 1581 delete pat1b; 1582 1583 // clone 1584 RegexPattern *pat1c = pat1->clone(); 1585 REGEX_ASSERT(*pat1c == *pat1); 1586 REGEX_ASSERT(*pat1c != *pat2); 1587 1588 delete pat1c; 1589 delete pat1a; 1590 delete pat1; 1591 delete pat2; 1592 1593 1594 // 1595 // Verify that a matcher created from a cloned pattern works. 1596 // (Jitterbug 3423) 1597 // 1598 { 1599 UErrorCode status = U_ZERO_ERROR; 1600 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1601 RegexPattern *pClone = pSource->clone(); 1602 delete pSource; 1603 RegexMatcher *mFromClone = pClone->matcher(status); 1604 REGEX_CHECK_STATUS; 1605 UnicodeString s = "Hello World"; 1606 mFromClone->reset(s); 1607 REGEX_ASSERT(mFromClone->find() == TRUE); 1608 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1609 REGEX_ASSERT(mFromClone->find() == TRUE); 1610 REGEX_ASSERT(mFromClone->group(status) == "World"); 1611 REGEX_ASSERT(mFromClone->find() == FALSE); 1612 delete mFromClone; 1613 delete pClone; 1614 } 1615 1616 // 1617 // matches convenience API 1618 // 1619 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1620 REGEX_CHECK_STATUS; 1621 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1622 REGEX_CHECK_STATUS; 1623 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1624 REGEX_CHECK_STATUS; 1625 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1626 REGEX_CHECK_STATUS; 1627 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1628 REGEX_CHECK_STATUS; 1629 status = U_INDEX_OUTOFBOUNDS_ERROR; 1630 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1631 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1632 1633 1634 // 1635 // Split() 1636 // 1637 status = U_ZERO_ERROR; 1638 pat1 = RegexPattern::compile(" +", pe, status); 1639 REGEX_CHECK_STATUS; 1640 UnicodeString fields[10]; 1641 1642 int32_t n; 1643 n = pat1->split("Now is the time", fields, 10, status); 1644 REGEX_CHECK_STATUS; 1645 REGEX_ASSERT(n==4); 1646 REGEX_ASSERT(fields[0]=="Now"); 1647 REGEX_ASSERT(fields[1]=="is"); 1648 REGEX_ASSERT(fields[2]=="the"); 1649 REGEX_ASSERT(fields[3]=="time"); 1650 REGEX_ASSERT(fields[4]==""); 1651 1652 n = pat1->split("Now is the time", fields, 2, status); 1653 REGEX_CHECK_STATUS; 1654 REGEX_ASSERT(n==2); 1655 REGEX_ASSERT(fields[0]=="Now"); 1656 REGEX_ASSERT(fields[1]=="is the time"); 1657 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1658 1659 fields[1] = "*"; 1660 status = U_ZERO_ERROR; 1661 n = pat1->split("Now is the time", fields, 1, status); 1662 REGEX_CHECK_STATUS; 1663 REGEX_ASSERT(n==1); 1664 REGEX_ASSERT(fields[0]=="Now is the time"); 1665 REGEX_ASSERT(fields[1]=="*"); 1666 status = U_ZERO_ERROR; 1667 1668 n = pat1->split(" Now is the time ", fields, 10, status); 1669 REGEX_CHECK_STATUS; 1670 REGEX_ASSERT(n==6); 1671 REGEX_ASSERT(fields[0]==""); 1672 REGEX_ASSERT(fields[1]=="Now"); 1673 REGEX_ASSERT(fields[2]=="is"); 1674 REGEX_ASSERT(fields[3]=="the"); 1675 REGEX_ASSERT(fields[4]=="time"); 1676 REGEX_ASSERT(fields[5]==""); 1677 1678 n = pat1->split(" ", fields, 10, status); 1679 REGEX_CHECK_STATUS; 1680 REGEX_ASSERT(n==2); 1681 REGEX_ASSERT(fields[0]==""); 1682 REGEX_ASSERT(fields[1]==""); 1683 1684 fields[0] = "foo"; 1685 n = pat1->split("", fields, 10, status); 1686 REGEX_CHECK_STATUS; 1687 REGEX_ASSERT(n==0); 1688 REGEX_ASSERT(fields[0]=="foo"); 1689 1690 delete pat1; 1691 1692 // split, with a pattern with (capture) 1693 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1694 REGEX_CHECK_STATUS; 1695 1696 status = U_ZERO_ERROR; 1697 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1698 REGEX_CHECK_STATUS; 1699 REGEX_ASSERT(n==7); 1700 REGEX_ASSERT(fields[0]==""); 1701 REGEX_ASSERT(fields[1]=="a"); 1702 REGEX_ASSERT(fields[2]=="Now is "); 1703 REGEX_ASSERT(fields[3]=="b"); 1704 REGEX_ASSERT(fields[4]=="the time"); 1705 REGEX_ASSERT(fields[5]=="c"); 1706 REGEX_ASSERT(fields[6]==""); 1707 REGEX_ASSERT(status==U_ZERO_ERROR); 1708 1709 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1710 REGEX_CHECK_STATUS; 1711 REGEX_ASSERT(n==7); 1712 REGEX_ASSERT(fields[0]==" "); 1713 REGEX_ASSERT(fields[1]=="a"); 1714 REGEX_ASSERT(fields[2]=="Now is "); 1715 REGEX_ASSERT(fields[3]=="b"); 1716 REGEX_ASSERT(fields[4]=="the time"); 1717 REGEX_ASSERT(fields[5]=="c"); 1718 REGEX_ASSERT(fields[6]==""); 1719 1720 status = U_ZERO_ERROR; 1721 fields[6] = "foo"; 1722 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1723 REGEX_CHECK_STATUS; 1724 REGEX_ASSERT(n==6); 1725 REGEX_ASSERT(fields[0]==" "); 1726 REGEX_ASSERT(fields[1]=="a"); 1727 REGEX_ASSERT(fields[2]=="Now is "); 1728 REGEX_ASSERT(fields[3]=="b"); 1729 REGEX_ASSERT(fields[4]=="the time"); 1730 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 1731 REGEX_ASSERT(fields[6]=="foo"); 1732 1733 status = U_ZERO_ERROR; 1734 fields[5] = "foo"; 1735 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1736 REGEX_CHECK_STATUS; 1737 REGEX_ASSERT(n==5); 1738 REGEX_ASSERT(fields[0]==" "); 1739 REGEX_ASSERT(fields[1]=="a"); 1740 REGEX_ASSERT(fields[2]=="Now is "); 1741 REGEX_ASSERT(fields[3]=="b"); 1742 REGEX_ASSERT(fields[4]=="the time<c>"); 1743 REGEX_ASSERT(fields[5]=="foo"); 1744 1745 status = U_ZERO_ERROR; 1746 fields[5] = "foo"; 1747 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1748 REGEX_CHECK_STATUS; 1749 REGEX_ASSERT(n==5); 1750 REGEX_ASSERT(fields[0]==" "); 1751 REGEX_ASSERT(fields[1]=="a"); 1752 REGEX_ASSERT(fields[2]=="Now is "); 1753 REGEX_ASSERT(fields[3]=="b"); 1754 REGEX_ASSERT(fields[4]=="the time"); 1755 REGEX_ASSERT(fields[5]=="foo"); 1756 1757 status = U_ZERO_ERROR; 1758 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1759 REGEX_CHECK_STATUS; 1760 REGEX_ASSERT(n==4); 1761 REGEX_ASSERT(fields[0]==" "); 1762 REGEX_ASSERT(fields[1]=="a"); 1763 REGEX_ASSERT(fields[2]=="Now is "); 1764 REGEX_ASSERT(fields[3]=="the time<c>"); 1765 status = U_ZERO_ERROR; 1766 delete pat1; 1767 1768 pat1 = RegexPattern::compile("([-,])", pe, status); 1769 REGEX_CHECK_STATUS; 1770 n = pat1->split("1-10,20", fields, 10, status); 1771 REGEX_CHECK_STATUS; 1772 REGEX_ASSERT(n==5); 1773 REGEX_ASSERT(fields[0]=="1"); 1774 REGEX_ASSERT(fields[1]=="-"); 1775 REGEX_ASSERT(fields[2]=="10"); 1776 REGEX_ASSERT(fields[3]==","); 1777 REGEX_ASSERT(fields[4]=="20"); 1778 delete pat1; 1779 1780 // Test split of string with empty trailing fields 1781 pat1 = RegexPattern::compile(",", pe, status); 1782 REGEX_CHECK_STATUS; 1783 n = pat1->split("a,b,c,", fields, 10, status); 1784 REGEX_CHECK_STATUS; 1785 REGEX_ASSERT(n==4); 1786 REGEX_ASSERT(fields[0]=="a"); 1787 REGEX_ASSERT(fields[1]=="b"); 1788 REGEX_ASSERT(fields[2]=="c"); 1789 REGEX_ASSERT(fields[3]==""); 1790 1791 n = pat1->split("a,,,", fields, 10, status); 1792 REGEX_CHECK_STATUS; 1793 REGEX_ASSERT(n==4); 1794 REGEX_ASSERT(fields[0]=="a"); 1795 REGEX_ASSERT(fields[1]==""); 1796 REGEX_ASSERT(fields[2]==""); 1797 REGEX_ASSERT(fields[3]==""); 1798 delete pat1; 1799 1800 // Split Separator with zero length match. 1801 pat1 = RegexPattern::compile(":?", pe, status); 1802 REGEX_CHECK_STATUS; 1803 n = pat1->split("abc", fields, 10, status); 1804 REGEX_CHECK_STATUS; 1805 REGEX_ASSERT(n==5); 1806 REGEX_ASSERT(fields[0]==""); 1807 REGEX_ASSERT(fields[1]=="a"); 1808 REGEX_ASSERT(fields[2]=="b"); 1809 REGEX_ASSERT(fields[3]=="c"); 1810 REGEX_ASSERT(fields[4]==""); 1811 1812 delete pat1; 1813 1814 // 1815 // RegexPattern::pattern() 1816 // 1817 pat1 = new RegexPattern(); 1818 REGEX_ASSERT(pat1->pattern() == ""); 1819 delete pat1; 1820 1821 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1822 REGEX_CHECK_STATUS; 1823 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1824 delete pat1; 1825 1826 1827 // 1828 // classID functions 1829 // 1830 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1831 REGEX_CHECK_STATUS; 1832 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1833 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1834 UnicodeString Hello("Hello, world."); 1835 RegexMatcher *m = pat1->matcher(Hello, status); 1836 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1837 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1838 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1839 delete m; 1840 delete pat1; 1841 1842 } 1843 1844 //--------------------------------------------------------------------------- 1845 // 1846 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1847 // is present and working, but excluding functions 1848 // implementing replace operations. 1849 // 1850 //--------------------------------------------------------------------------- 1851 void RegexTest::API_Match_UTF8() { 1852 UParseError pe; 1853 UErrorCode status=U_ZERO_ERROR; 1854 int32_t flags = 0; 1855 1856 // 1857 // Debug - slide failing test cases early 1858 // 1859 #if 0 1860 { 1861 } 1862 return; 1863 #endif 1864 1865 // 1866 // Simple pattern compilation 1867 // 1868 { 1869 UText re = UTEXT_INITIALIZER; 1870 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1871 REGEX_VERBOSE_TEXT(&re); 1872 RegexPattern *pat2; 1873 pat2 = RegexPattern::compile(&re, flags, pe, status); 1874 REGEX_CHECK_STATUS; 1875 1876 UText input1 = UTEXT_INITIALIZER; 1877 UText input2 = UTEXT_INITIALIZER; 1878 UText empty = UTEXT_INITIALIZER; 1879 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1880 REGEX_VERBOSE_TEXT(&input1); 1881 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1882 REGEX_VERBOSE_TEXT(&input2); 1883 utext_openUChars(&empty, NULL, 0, &status); 1884 1885 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1886 int32_t input2Len = strlen("not abc"); 1887 1888 1889 // 1890 // Matcher creation and reset. 1891 // 1892 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1893 REGEX_CHECK_STATUS; 1894 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1895 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1896 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1897 m1->reset(&input2); 1898 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1899 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1900 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1901 m1->reset(&input1); 1902 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1903 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1904 m1->reset(&empty); 1905 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1906 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1907 1908 // 1909 // reset(pos, status) 1910 // 1911 m1->reset(&input1); 1912 m1->reset(4, status); 1913 REGEX_CHECK_STATUS; 1914 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1915 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1916 1917 m1->reset(-1, status); 1918 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1919 status = U_ZERO_ERROR; 1920 1921 m1->reset(0, status); 1922 REGEX_CHECK_STATUS; 1923 status = U_ZERO_ERROR; 1924 1925 m1->reset(input1Len-1, status); 1926 REGEX_CHECK_STATUS; 1927 status = U_ZERO_ERROR; 1928 1929 m1->reset(input1Len, status); 1930 REGEX_CHECK_STATUS; 1931 status = U_ZERO_ERROR; 1932 1933 m1->reset(input1Len+1, status); 1934 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1935 status = U_ZERO_ERROR; 1936 1937 // 1938 // match(pos, status) 1939 // 1940 m1->reset(&input2); 1941 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1942 m1->reset(); 1943 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1944 m1->reset(); 1945 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1946 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1947 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1948 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1949 1950 // Match() at end of string should fail, but should not 1951 // be an error. 1952 status = U_ZERO_ERROR; 1953 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1954 REGEX_CHECK_STATUS; 1955 1956 // Match beyond end of string should fail with an error. 1957 status = U_ZERO_ERROR; 1958 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1959 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1960 1961 // Successful match at end of string. 1962 { 1963 status = U_ZERO_ERROR; 1964 RegexMatcher m("A?", 0, status); // will match zero length string. 1965 REGEX_CHECK_STATUS; 1966 m.reset(&input1); 1967 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1968 REGEX_CHECK_STATUS; 1969 m.reset(&empty); 1970 REGEX_ASSERT(m.matches(0, status) == TRUE); 1971 REGEX_CHECK_STATUS; 1972 } 1973 1974 1975 // 1976 // lookingAt(pos, status) 1977 // 1978 status = U_ZERO_ERROR; 1979 m1->reset(&input2); // "not abc" 1980 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1981 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1982 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1983 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1984 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1985 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1986 status = U_ZERO_ERROR; 1987 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1988 REGEX_CHECK_STATUS; 1989 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1990 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1991 1992 delete m1; 1993 delete pat2; 1994 1995 utext_close(&re); 1996 utext_close(&input1); 1997 utext_close(&input2); 1998 utext_close(&empty); 1999 } 2000 2001 2002 // 2003 // Capture Group. 2004 // RegexMatcher::start(); 2005 // RegexMatcher::end(); 2006 // RegexMatcher::groupCount(); 2007 // 2008 { 2009 int32_t flags=0; 2010 UParseError pe; 2011 UErrorCode status=U_ZERO_ERROR; 2012 UText re=UTEXT_INITIALIZER; 2013 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 2014 utext_openUTF8(&re, str_01234567_pat, -1, &status); 2015 2016 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2017 REGEX_CHECK_STATUS; 2018 2019 UText input = UTEXT_INITIALIZER; 2020 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2021 utext_openUTF8(&input, str_0123456789, -1, &status); 2022 2023 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2024 REGEX_CHECK_STATUS; 2025 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 2026 static const int32_t matchStarts[] = {0, 2, 4, 8}; 2027 static const int32_t matchEnds[] = {10, 8, 6, 10}; 2028 int32_t i; 2029 for (i=0; i<4; i++) { 2030 int32_t actualStart = matcher->start(i, status); 2031 REGEX_CHECK_STATUS; 2032 if (actualStart != matchStarts[i]) { 2033 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 2034 __FILE__, __LINE__, i, matchStarts[i], actualStart); 2035 } 2036 int32_t actualEnd = matcher->end(i, status); 2037 REGEX_CHECK_STATUS; 2038 if (actualEnd != matchEnds[i]) { 2039 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n", 2040 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 2041 } 2042 } 2043 2044 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 2045 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 2046 2047 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2048 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2049 matcher->reset(); 2050 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 2051 2052 matcher->lookingAt(status); 2053 2054 UnicodeString dest; 2055 UText destText = UTEXT_INITIALIZER; 2056 utext_openUnicodeString(&destText, &dest, &status); 2057 UText *result; 2058 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2059 // Test shallow-clone API 2060 int64_t group_len; 2061 result = matcher->group((UText *)NULL, group_len, status); 2062 REGEX_CHECK_STATUS; 2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2064 utext_close(result); 2065 result = matcher->group(0, &destText, group_len, status); 2066 REGEX_CHECK_STATUS; 2067 REGEX_ASSERT(result == &destText); 2068 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2069 // destText is now immutable, reopen it 2070 utext_close(&destText); 2071 utext_openUnicodeString(&destText, &dest, &status); 2072 2073 int64_t length; 2074 result = matcher->group(0, NULL, length, status); 2075 REGEX_CHECK_STATUS; 2076 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2077 utext_close(result); 2078 result = matcher->group(0, &destText, length, status); 2079 REGEX_CHECK_STATUS; 2080 REGEX_ASSERT(result == &destText); 2081 REGEX_ASSERT(utext_getNativeIndex(result) == 0); 2082 REGEX_ASSERT(length == 10); 2083 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2084 2085 // Capture Group 1 == "234567" 2086 result = matcher->group(1, NULL, length, status); 2087 REGEX_CHECK_STATUS; 2088 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2089 REGEX_ASSERT(length == 6); 2090 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2091 utext_close(result); 2092 2093 result = matcher->group(1, &destText, length, status); 2094 REGEX_CHECK_STATUS; 2095 REGEX_ASSERT(result == &destText); 2096 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2097 REGEX_ASSERT(length == 6); 2098 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2099 utext_close(result); 2100 2101 // Capture Group 2 == "45" 2102 result = matcher->group(2, NULL, length, status); 2103 REGEX_CHECK_STATUS; 2104 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2105 REGEX_ASSERT(length == 2); 2106 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2107 utext_close(result); 2108 2109 result = matcher->group(2, &destText, length, status); 2110 REGEX_CHECK_STATUS; 2111 REGEX_ASSERT(result == &destText); 2112 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2113 REGEX_ASSERT(length == 2); 2114 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2115 utext_close(result); 2116 2117 // Capture Group 3 == "89" 2118 result = matcher->group(3, NULL, length, status); 2119 REGEX_CHECK_STATUS; 2120 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2121 REGEX_ASSERT(length == 2); 2122 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2123 utext_close(result); 2124 2125 result = matcher->group(3, &destText, length, status); 2126 REGEX_CHECK_STATUS; 2127 REGEX_ASSERT(result == &destText); 2128 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2129 REGEX_ASSERT(length == 2); 2130 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2131 utext_close(result); 2132 2133 // Capture Group number out of range. 2134 status = U_ZERO_ERROR; 2135 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2136 status = U_ZERO_ERROR; 2137 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2138 status = U_ZERO_ERROR; 2139 matcher->reset(); 2140 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2141 2142 delete matcher; 2143 delete pat; 2144 2145 utext_close(&destText); 2146 utext_close(&input); 2147 utext_close(&re); 2148 } 2149 2150 // 2151 // find 2152 // 2153 { 2154 int32_t flags=0; 2155 UParseError pe; 2156 UErrorCode status=U_ZERO_ERROR; 2157 UText re=UTEXT_INITIALIZER; 2158 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2159 utext_openUTF8(&re, str_abc, -1, &status); 2160 2161 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2162 REGEX_CHECK_STATUS; 2163 UText input = UTEXT_INITIALIZER; 2164 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2165 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2166 // 012345678901234567 2167 2168 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2169 REGEX_CHECK_STATUS; 2170 REGEX_ASSERT(matcher->find()); 2171 REGEX_ASSERT(matcher->start(status) == 1); 2172 REGEX_ASSERT(matcher->find()); 2173 REGEX_ASSERT(matcher->start(status) == 6); 2174 REGEX_ASSERT(matcher->find()); 2175 REGEX_ASSERT(matcher->start(status) == 12); 2176 REGEX_ASSERT(matcher->find() == FALSE); 2177 REGEX_ASSERT(matcher->find() == FALSE); 2178 2179 matcher->reset(); 2180 REGEX_ASSERT(matcher->find()); 2181 REGEX_ASSERT(matcher->start(status) == 1); 2182 2183 REGEX_ASSERT(matcher->find(0, status)); 2184 REGEX_ASSERT(matcher->start(status) == 1); 2185 REGEX_ASSERT(matcher->find(1, status)); 2186 REGEX_ASSERT(matcher->start(status) == 1); 2187 REGEX_ASSERT(matcher->find(2, status)); 2188 REGEX_ASSERT(matcher->start(status) == 6); 2189 REGEX_ASSERT(matcher->find(12, status)); 2190 REGEX_ASSERT(matcher->start(status) == 12); 2191 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2192 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2193 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2194 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2195 2196 status = U_ZERO_ERROR; 2197 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2198 status = U_ZERO_ERROR; 2199 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2200 2201 REGEX_ASSERT(matcher->groupCount() == 0); 2202 2203 delete matcher; 2204 delete pat; 2205 2206 utext_close(&input); 2207 utext_close(&re); 2208 } 2209 2210 2211 // 2212 // find, with \G in pattern (true if at the end of a previous match). 2213 // 2214 { 2215 int32_t flags=0; 2216 UParseError pe; 2217 UErrorCode status=U_ZERO_ERROR; 2218 UText re=UTEXT_INITIALIZER; 2219 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2220 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2221 2222 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2223 2224 REGEX_CHECK_STATUS; 2225 UText input = UTEXT_INITIALIZER; 2226 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2227 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2228 // 012345678901234567 2229 2230 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2231 REGEX_CHECK_STATUS; 2232 REGEX_ASSERT(matcher->find()); 2233 REGEX_ASSERT(matcher->start(status) == 0); 2234 REGEX_ASSERT(matcher->start(1, status) == -1); 2235 REGEX_ASSERT(matcher->start(2, status) == 1); 2236 2237 REGEX_ASSERT(matcher->find()); 2238 REGEX_ASSERT(matcher->start(status) == 4); 2239 REGEX_ASSERT(matcher->start(1, status) == 4); 2240 REGEX_ASSERT(matcher->start(2, status) == -1); 2241 REGEX_CHECK_STATUS; 2242 2243 delete matcher; 2244 delete pat; 2245 2246 utext_close(&input); 2247 utext_close(&re); 2248 } 2249 2250 // 2251 // find with zero length matches, match position should bump ahead 2252 // to prevent loops. 2253 // 2254 { 2255 int32_t i; 2256 UErrorCode status=U_ZERO_ERROR; 2257 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2258 // using an always-true look-ahead. 2259 REGEX_CHECK_STATUS; 2260 UText s = UTEXT_INITIALIZER; 2261 utext_openUTF8(&s, " ", -1, &status); 2262 m.reset(&s); 2263 for (i=0; ; i++) { 2264 if (m.find() == FALSE) { 2265 break; 2266 } 2267 REGEX_ASSERT(m.start(status) == i); 2268 REGEX_ASSERT(m.end(status) == i); 2269 } 2270 REGEX_ASSERT(i==5); 2271 2272 // Check that the bump goes over characters outside the BMP OK 2273 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2274 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2275 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2276 m.reset(&s); 2277 for (i=0; ; i+=4) { 2278 if (m.find() == FALSE) { 2279 break; 2280 } 2281 REGEX_ASSERT(m.start(status) == i); 2282 REGEX_ASSERT(m.end(status) == i); 2283 } 2284 REGEX_ASSERT(i==20); 2285 2286 utext_close(&s); 2287 } 2288 { 2289 // find() loop breaking test. 2290 // with pattern of /.?/, should see a series of one char matches, then a single 2291 // match of zero length at the end of the input string. 2292 int32_t i; 2293 UErrorCode status=U_ZERO_ERROR; 2294 RegexMatcher m(".?", 0, status); 2295 REGEX_CHECK_STATUS; 2296 UText s = UTEXT_INITIALIZER; 2297 utext_openUTF8(&s, " ", -1, &status); 2298 m.reset(&s); 2299 for (i=0; ; i++) { 2300 if (m.find() == FALSE) { 2301 break; 2302 } 2303 REGEX_ASSERT(m.start(status) == i); 2304 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2305 } 2306 REGEX_ASSERT(i==5); 2307 2308 utext_close(&s); 2309 } 2310 2311 2312 // 2313 // Matchers with no input string behave as if they had an empty input string. 2314 // 2315 2316 { 2317 UErrorCode status = U_ZERO_ERROR; 2318 RegexMatcher m(".?", 0, status); 2319 REGEX_CHECK_STATUS; 2320 REGEX_ASSERT(m.find()); 2321 REGEX_ASSERT(m.start(status) == 0); 2322 REGEX_ASSERT(m.input() == ""); 2323 } 2324 { 2325 UErrorCode status = U_ZERO_ERROR; 2326 RegexPattern *p = RegexPattern::compile(".", 0, status); 2327 RegexMatcher *m = p->matcher(status); 2328 REGEX_CHECK_STATUS; 2329 2330 REGEX_ASSERT(m->find() == FALSE); 2331 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2332 delete m; 2333 delete p; 2334 } 2335 2336 // 2337 // Regions 2338 // 2339 { 2340 UErrorCode status = U_ZERO_ERROR; 2341 UText testPattern = UTEXT_INITIALIZER; 2342 UText testText = UTEXT_INITIALIZER; 2343 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2344 REGEX_VERBOSE_TEXT(&testPattern); 2345 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2346 REGEX_VERBOSE_TEXT(&testText); 2347 2348 RegexMatcher m(&testPattern, &testText, 0, status); 2349 REGEX_CHECK_STATUS; 2350 REGEX_ASSERT(m.regionStart() == 0); 2351 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2352 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2353 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2354 2355 m.region(2,4, status); 2356 REGEX_CHECK_STATUS; 2357 REGEX_ASSERT(m.matches(status)); 2358 REGEX_ASSERT(m.start(status)==2); 2359 REGEX_ASSERT(m.end(status)==4); 2360 REGEX_CHECK_STATUS; 2361 2362 m.reset(); 2363 REGEX_ASSERT(m.regionStart() == 0); 2364 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2365 2366 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2367 REGEX_VERBOSE_TEXT(&testText); 2368 m.reset(&testText); 2369 REGEX_ASSERT(m.regionStart() == 0); 2370 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2371 2372 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2373 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2375 REGEX_ASSERT(&m == &m.reset()); 2376 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2377 2378 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2380 REGEX_ASSERT(&m == &m.reset()); 2381 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2382 2383 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2384 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2386 REGEX_ASSERT(&m == &m.reset()); 2387 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2388 2389 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2391 REGEX_ASSERT(&m == &m.reset()); 2392 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2393 2394 utext_close(&testText); 2395 utext_close(&testPattern); 2396 } 2397 2398 // 2399 // hitEnd() and requireEnd() 2400 // 2401 { 2402 UErrorCode status = U_ZERO_ERROR; 2403 UText testPattern = UTEXT_INITIALIZER; 2404 UText testText = UTEXT_INITIALIZER; 2405 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2406 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2407 utext_openUTF8(&testPattern, str_, -1, &status); 2408 utext_openUTF8(&testText, str_aabb, -1, &status); 2409 2410 RegexMatcher m1(&testPattern, &testText, 0, status); 2411 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2412 REGEX_ASSERT(m1.hitEnd() == TRUE); 2413 REGEX_ASSERT(m1.requireEnd() == FALSE); 2414 REGEX_CHECK_STATUS; 2415 2416 status = U_ZERO_ERROR; 2417 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2418 utext_openUTF8(&testPattern, str_a, -1, &status); 2419 RegexMatcher m2(&testPattern, &testText, 0, status); 2420 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2421 REGEX_ASSERT(m2.hitEnd() == FALSE); 2422 REGEX_ASSERT(m2.requireEnd() == FALSE); 2423 REGEX_CHECK_STATUS; 2424 2425 status = U_ZERO_ERROR; 2426 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2427 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status); 2428 RegexMatcher m3(&testPattern, &testText, 0, status); 2429 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2430 REGEX_ASSERT(m3.hitEnd() == TRUE); 2431 REGEX_ASSERT(m3.requireEnd() == TRUE); 2432 REGEX_CHECK_STATUS; 2433 2434 utext_close(&testText); 2435 utext_close(&testPattern); 2436 } 2437 } 2438 2439 2440 //--------------------------------------------------------------------------- 2441 // 2442 // API_Replace_UTF8 API test for class RegexMatcher, testing the 2443 // Replace family of functions. 2444 // 2445 //--------------------------------------------------------------------------- 2446 void RegexTest::API_Replace_UTF8() { 2447 // 2448 // Replace 2449 // 2450 int32_t flags=0; 2451 UParseError pe; 2452 UErrorCode status=U_ZERO_ERROR; 2453 2454 UText re=UTEXT_INITIALIZER; 2455 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 2456 REGEX_VERBOSE_TEXT(&re); 2457 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2458 REGEX_CHECK_STATUS; 2459 2460 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2461 // 012345678901234567 2462 UText dataText = UTEXT_INITIALIZER; 2463 utext_openUTF8(&dataText, data, -1, &status); 2464 REGEX_CHECK_STATUS; 2465 REGEX_VERBOSE_TEXT(&dataText); 2466 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText); 2467 2468 // 2469 // Plain vanilla matches. 2470 // 2471 UnicodeString dest; 2472 UText destText = UTEXT_INITIALIZER; 2473 utext_openUnicodeString(&destText, &dest, &status); 2474 UText *result; 2475 2476 UText replText = UTEXT_INITIALIZER; 2477 2478 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ 2479 utext_openUTF8(&replText, str_yz, -1, &status); 2480 REGEX_VERBOSE_TEXT(&replText); 2481 result = matcher->replaceFirst(&replText, NULL, status); 2482 REGEX_CHECK_STATUS; 2483 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ 2484 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2485 utext_close(result); 2486 result = matcher->replaceFirst(&replText, &destText, status); 2487 REGEX_CHECK_STATUS; 2488 REGEX_ASSERT(result == &destText); 2489 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2490 2491 result = matcher->replaceAll(&replText, NULL, status); 2492 REGEX_CHECK_STATUS; 2493 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ 2494 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2495 utext_close(result); 2496 2497 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2498 result = matcher->replaceAll(&replText, &destText, status); 2499 REGEX_CHECK_STATUS; 2500 REGEX_ASSERT(result == &destText); 2501 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2502 2503 // 2504 // Plain vanilla non-matches. 2505 // 2506 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */ 2507 utext_openUTF8(&dataText, str_abxabxabx, -1, &status); 2508 matcher->reset(&dataText); 2509 2510 result = matcher->replaceFirst(&replText, NULL, status); 2511 REGEX_CHECK_STATUS; 2512 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2513 utext_close(result); 2514 result = matcher->replaceFirst(&replText, &destText, status); 2515 REGEX_CHECK_STATUS; 2516 REGEX_ASSERT(result == &destText); 2517 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2518 2519 result = matcher->replaceAll(&replText, NULL, status); 2520 REGEX_CHECK_STATUS; 2521 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2522 utext_close(result); 2523 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2524 result = matcher->replaceAll(&replText, &destText, status); 2525 REGEX_CHECK_STATUS; 2526 REGEX_ASSERT(result == &destText); 2527 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2528 2529 // 2530 // Empty source string 2531 // 2532 utext_openUTF8(&dataText, NULL, 0, &status); 2533 matcher->reset(&dataText); 2534 2535 result = matcher->replaceFirst(&replText, NULL, status); 2536 REGEX_CHECK_STATUS; 2537 REGEX_ASSERT_UTEXT_UTF8("", result); 2538 utext_close(result); 2539 result = matcher->replaceFirst(&replText, &destText, status); 2540 REGEX_CHECK_STATUS; 2541 REGEX_ASSERT(result == &destText); 2542 REGEX_ASSERT_UTEXT_UTF8("", result); 2543 2544 result = matcher->replaceAll(&replText, NULL, status); 2545 REGEX_CHECK_STATUS; 2546 REGEX_ASSERT_UTEXT_UTF8("", result); 2547 utext_close(result); 2548 result = matcher->replaceAll(&replText, &destText, status); 2549 REGEX_CHECK_STATUS; 2550 REGEX_ASSERT(result == &destText); 2551 REGEX_ASSERT_UTEXT_UTF8("", result); 2552 2553 // 2554 // Empty substitution string 2555 // 2556 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 2557 matcher->reset(&dataText); 2558 2559 utext_openUTF8(&replText, NULL, 0, &status); 2560 result = matcher->replaceFirst(&replText, NULL, status); 2561 REGEX_CHECK_STATUS; 2562 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ 2563 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2564 utext_close(result); 2565 result = matcher->replaceFirst(&replText, &destText, status); 2566 REGEX_CHECK_STATUS; 2567 REGEX_ASSERT(result == &destText); 2568 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2569 2570 result = matcher->replaceAll(&replText, NULL, status); 2571 REGEX_CHECK_STATUS; 2572 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ 2573 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2574 utext_close(result); 2575 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2576 result = matcher->replaceAll(&replText, &destText, status); 2577 REGEX_CHECK_STATUS; 2578 REGEX_ASSERT(result == &destText); 2579 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2580 2581 // 2582 // match whole string 2583 // 2584 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2585 utext_openUTF8(&dataText, str_abc, -1, &status); 2586 matcher->reset(&dataText); 2587 2588 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ 2589 utext_openUTF8(&replText, str_xyz, -1, &status); 2590 result = matcher->replaceFirst(&replText, NULL, status); 2591 REGEX_CHECK_STATUS; 2592 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2593 utext_close(result); 2594 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2595 result = matcher->replaceFirst(&replText, &destText, status); 2596 REGEX_CHECK_STATUS; 2597 REGEX_ASSERT(result == &destText); 2598 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2599 2600 result = matcher->replaceAll(&replText, NULL, status); 2601 REGEX_CHECK_STATUS; 2602 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2603 utext_close(result); 2604 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2605 result = matcher->replaceAll(&replText, &destText, status); 2606 REGEX_CHECK_STATUS; 2607 REGEX_ASSERT(result == &destText); 2608 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2609 2610 // 2611 // Capture Group, simple case 2612 // 2613 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ 2614 utext_openUTF8(&re, str_add, -1, &status); 2615 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); 2616 REGEX_CHECK_STATUS; 2617 2618 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ 2619 utext_openUTF8(&dataText, str_abcdefg, -1, &status); 2620 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); 2621 REGEX_CHECK_STATUS; 2622 2623 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ 2624 utext_openUTF8(&replText, str_11, -1, &status); 2625 result = matcher2->replaceFirst(&replText, NULL, status); 2626 REGEX_CHECK_STATUS; 2627 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ 2628 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2629 utext_close(result); 2630 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2631 result = matcher2->replaceFirst(&replText, &destText, status); 2632 REGEX_CHECK_STATUS; 2633 REGEX_ASSERT(result == &destText); 2634 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2635 2636 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */ 2637 utext_openUTF8(&replText, str_v, -1, &status); 2638 REGEX_VERBOSE_TEXT(&replText); 2639 result = matcher2->replaceFirst(&replText, NULL, status); 2640 REGEX_CHECK_STATUS; 2641 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ 2642 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2643 utext_close(result); 2644 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2645 result = matcher2->replaceFirst(&replText, &destText, status); 2646 REGEX_CHECK_STATUS; 2647 REGEX_ASSERT(result == &destText); 2648 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2649 2650 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 2651 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 2652 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */ 2653 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2654 result = matcher2->replaceFirst(&replText, NULL, status); 2655 REGEX_CHECK_STATUS; 2656 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2657 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2658 utext_close(result); 2659 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2660 result = matcher2->replaceFirst(&replText, &destText, status); 2661 REGEX_CHECK_STATUS; 2662 REGEX_ASSERT(result == &destText); 2663 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2664 2665 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */ 2666 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2667 // 012345678901234567890123456 2668 supplDigitChars[22] = 0xF0; 2669 supplDigitChars[23] = 0x9D; 2670 supplDigitChars[24] = 0x9F; 2671 supplDigitChars[25] = 0x8F; 2672 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2673 2674 result = matcher2->replaceFirst(&replText, NULL, status); 2675 REGEX_CHECK_STATUS; 2676 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ 2677 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2678 utext_close(result); 2679 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2680 result = matcher2->replaceFirst(&replText, &destText, status); 2681 REGEX_CHECK_STATUS; 2682 REGEX_ASSERT(result == &destText); 2683 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2684 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */ 2685 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); 2686 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2687 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2688 utext_close(result); 2689 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2690 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2691 REGEX_ASSERT(result == &destText); 2692 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2693 2694 // 2695 // Replacement String with \u hex escapes 2696 // 2697 { 2698 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ 2699 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ 2700 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); 2701 utext_openUTF8(&replText, str_u0043, -1, &status); 2702 matcher->reset(&dataText); 2703 2704 result = matcher->replaceAll(&replText, NULL, status); 2705 REGEX_CHECK_STATUS; 2706 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ 2707 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2708 utext_close(result); 2709 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2710 result = matcher->replaceAll(&replText, &destText, status); 2711 REGEX_CHECK_STATUS; 2712 REGEX_ASSERT(result == &destText); 2713 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2714 } 2715 { 2716 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */ 2717 utext_openUTF8(&dataText, str_abc, -1, &status); 2718 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */ 2719 utext_openUTF8(&replText, str_U00010000, -1, &status); 2720 matcher->reset(&dataText); 2721 2722 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" 2723 // 0123456789 2724 expected[2] = 0xF0; 2725 expected[3] = 0x90; 2726 expected[4] = 0x80; 2727 expected[5] = 0x80; 2728 2729 result = matcher->replaceAll(&replText, NULL, status); 2730 REGEX_CHECK_STATUS; 2731 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2732 utext_close(result); 2733 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2734 result = matcher->replaceAll(&replText, &destText, status); 2735 REGEX_CHECK_STATUS; 2736 REGEX_ASSERT(result == &destText); 2737 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2738 } 2739 // TODO: need more through testing of capture substitutions. 2740 2741 // Bug 4057 2742 // 2743 { 2744 status = U_ZERO_ERROR; 2745 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */ 2746 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */ 2747 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ 2748 utext_openUTF8(&re, str_ssee, -1, &status); 2749 utext_openUTF8(&dataText, str_blah, -1, &status); 2750 utext_openUTF8(&replText, str_ooh, -1, &status); 2751 2752 RegexMatcher m(&re, 0, status); 2753 REGEX_CHECK_STATUS; 2754 2755 UnicodeString result; 2756 UText resultText = UTEXT_INITIALIZER; 2757 utext_openUnicodeString(&resultText, &result, &status); 2758 2759 // Multiple finds do NOT bump up the previous appendReplacement postion. 2760 m.reset(&dataText); 2761 m.find(); 2762 m.find(); 2763 m.appendReplacement(&resultText, &replText, status); 2764 REGEX_CHECK_STATUS; 2765 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2766 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText); 2767 2768 // After a reset into the interior of a string, appendReplacement still starts at beginning. 2769 status = U_ZERO_ERROR; 2770 result.truncate(0); 2771 utext_openUnicodeString(&resultText, &result, &status); 2772 m.reset(10, status); 2773 m.find(); 2774 m.find(); 2775 m.appendReplacement(&resultText, &replText, status); 2776 REGEX_CHECK_STATUS; 2777 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2778 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText); 2779 2780 // find() at interior of string, appendReplacement still starts at beginning. 2781 status = U_ZERO_ERROR; 2782 result.truncate(0); 2783 utext_openUnicodeString(&resultText, &result, &status); 2784 m.reset(); 2785 m.find(10, status); 2786 m.find(); 2787 m.appendReplacement(&resultText, &replText, status); 2788 REGEX_CHECK_STATUS; 2789 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2790 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText); 2791 2792 m.appendTail(&resultText, status); 2793 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ 2794 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); 2795 2796 utext_close(&resultText); 2797 } 2798 2799 delete matcher2; 2800 delete pat2; 2801 delete matcher; 2802 delete pat; 2803 2804 utext_close(&dataText); 2805 utext_close(&replText); 2806 utext_close(&destText); 2807 utext_close(&re); 2808 } 2809 2810 2811 //--------------------------------------------------------------------------- 2812 // 2813 // API_Pattern_UTF8 Test that the API for class RegexPattern is 2814 // present and nominally working. 2815 // 2816 //--------------------------------------------------------------------------- 2817 void RegexTest::API_Pattern_UTF8() { 2818 RegexPattern pata; // Test default constructor to not crash. 2819 RegexPattern patb; 2820 2821 REGEX_ASSERT(pata == patb); 2822 REGEX_ASSERT(pata == pata); 2823 2824 UText re1 = UTEXT_INITIALIZER; 2825 UText re2 = UTEXT_INITIALIZER; 2826 UErrorCode status = U_ZERO_ERROR; 2827 UParseError pe; 2828 2829 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ 2830 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ 2831 utext_openUTF8(&re1, str_abcalmz, -1, &status); 2832 utext_openUTF8(&re2, str_def, -1, &status); 2833 2834 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2835 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2836 REGEX_CHECK_STATUS; 2837 REGEX_ASSERT(*pat1 == *pat1); 2838 REGEX_ASSERT(*pat1 != pata); 2839 2840 // Assign 2841 patb = *pat1; 2842 REGEX_ASSERT(patb == *pat1); 2843 2844 // Copy Construct 2845 RegexPattern patc(*pat1); 2846 REGEX_ASSERT(patc == *pat1); 2847 REGEX_ASSERT(patb == patc); 2848 REGEX_ASSERT(pat1 != pat2); 2849 patb = *pat2; 2850 REGEX_ASSERT(patb != patc); 2851 REGEX_ASSERT(patb == *pat2); 2852 2853 // Compile with no flags. 2854 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2855 REGEX_ASSERT(*pat1a == *pat1); 2856 2857 REGEX_ASSERT(pat1a->flags() == 0); 2858 2859 // Compile with different flags should be not equal 2860 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2861 REGEX_CHECK_STATUS; 2862 2863 REGEX_ASSERT(*pat1b != *pat1a); 2864 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2865 REGEX_ASSERT(pat1a->flags() == 0); 2866 delete pat1b; 2867 2868 // clone 2869 RegexPattern *pat1c = pat1->clone(); 2870 REGEX_ASSERT(*pat1c == *pat1); 2871 REGEX_ASSERT(*pat1c != *pat2); 2872 2873 delete pat1c; 2874 delete pat1a; 2875 delete pat1; 2876 delete pat2; 2877 2878 utext_close(&re1); 2879 utext_close(&re2); 2880 2881 2882 // 2883 // Verify that a matcher created from a cloned pattern works. 2884 // (Jitterbug 3423) 2885 // 2886 { 2887 UErrorCode status = U_ZERO_ERROR; 2888 UText pattern = UTEXT_INITIALIZER; 2889 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2890 utext_openUTF8(&pattern, str_pL, -1, &status); 2891 2892 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2893 RegexPattern *pClone = pSource->clone(); 2894 delete pSource; 2895 RegexMatcher *mFromClone = pClone->matcher(status); 2896 REGEX_CHECK_STATUS; 2897 2898 UText input = UTEXT_INITIALIZER; 2899 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2900 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2901 mFromClone->reset(&input); 2902 REGEX_ASSERT(mFromClone->find() == TRUE); 2903 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2904 REGEX_ASSERT(mFromClone->find() == TRUE); 2905 REGEX_ASSERT(mFromClone->group(status) == "World"); 2906 REGEX_ASSERT(mFromClone->find() == FALSE); 2907 delete mFromClone; 2908 delete pClone; 2909 2910 utext_close(&input); 2911 utext_close(&pattern); 2912 } 2913 2914 // 2915 // matches convenience API 2916 // 2917 { 2918 UErrorCode status = U_ZERO_ERROR; 2919 UText pattern = UTEXT_INITIALIZER; 2920 UText input = UTEXT_INITIALIZER; 2921 2922 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2923 utext_openUTF8(&input, str_randominput, -1, &status); 2924 2925 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2926 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2927 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2928 REGEX_CHECK_STATUS; 2929 2930 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2931 utext_openUTF8(&pattern, str_abc, -1, &status); 2932 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2933 REGEX_CHECK_STATUS; 2934 2935 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2936 utext_openUTF8(&pattern, str_nput, -1, &status); 2937 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2938 REGEX_CHECK_STATUS; 2939 2940 utext_openUTF8(&pattern, str_randominput, -1, &status); 2941 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2942 REGEX_CHECK_STATUS; 2943 2944 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2945 utext_openUTF8(&pattern, str_u, -1, &status); 2946 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2947 REGEX_CHECK_STATUS; 2948 2949 utext_openUTF8(&input, str_abc, -1, &status); 2950 utext_openUTF8(&pattern, str_abc, -1, &status); 2951 status = U_INDEX_OUTOFBOUNDS_ERROR; 2952 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2953 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2954 2955 utext_close(&input); 2956 utext_close(&pattern); 2957 } 2958 2959 2960 // 2961 // Split() 2962 // 2963 status = U_ZERO_ERROR; 2964 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2965 utext_openUTF8(&re1, str_spaceplus, -1, &status); 2966 pat1 = RegexPattern::compile(&re1, pe, status); 2967 REGEX_CHECK_STATUS; 2968 UnicodeString fields[10]; 2969 2970 int32_t n; 2971 n = pat1->split("Now is the time", fields, 10, status); 2972 REGEX_CHECK_STATUS; 2973 REGEX_ASSERT(n==4); 2974 REGEX_ASSERT(fields[0]=="Now"); 2975 REGEX_ASSERT(fields[1]=="is"); 2976 REGEX_ASSERT(fields[2]=="the"); 2977 REGEX_ASSERT(fields[3]=="time"); 2978 REGEX_ASSERT(fields[4]==""); 2979 2980 n = pat1->split("Now is the time", fields, 2, status); 2981 REGEX_CHECK_STATUS; 2982 REGEX_ASSERT(n==2); 2983 REGEX_ASSERT(fields[0]=="Now"); 2984 REGEX_ASSERT(fields[1]=="is the time"); 2985 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2986 2987 fields[1] = "*"; 2988 status = U_ZERO_ERROR; 2989 n = pat1->split("Now is the time", fields, 1, status); 2990 REGEX_CHECK_STATUS; 2991 REGEX_ASSERT(n==1); 2992 REGEX_ASSERT(fields[0]=="Now is the time"); 2993 REGEX_ASSERT(fields[1]=="*"); 2994 status = U_ZERO_ERROR; 2995 2996 n = pat1->split(" Now is the time ", fields, 10, status); 2997 REGEX_CHECK_STATUS; 2998 REGEX_ASSERT(n==6); 2999 REGEX_ASSERT(fields[0]==""); 3000 REGEX_ASSERT(fields[1]=="Now"); 3001 REGEX_ASSERT(fields[2]=="is"); 3002 REGEX_ASSERT(fields[3]=="the"); 3003 REGEX_ASSERT(fields[4]=="time"); 3004 REGEX_ASSERT(fields[5]==""); 3005 REGEX_ASSERT(fields[6]==""); 3006 3007 fields[2] = "*"; 3008 n = pat1->split(" ", fields, 10, status); 3009 REGEX_CHECK_STATUS; 3010 REGEX_ASSERT(n==2); 3011 REGEX_ASSERT(fields[0]==""); 3012 REGEX_ASSERT(fields[1]==""); 3013 REGEX_ASSERT(fields[2]=="*"); 3014 3015 fields[0] = "foo"; 3016 n = pat1->split("", fields, 10, status); 3017 REGEX_CHECK_STATUS; 3018 REGEX_ASSERT(n==0); 3019 REGEX_ASSERT(fields[0]=="foo"); 3020 3021 delete pat1; 3022 3023 // split, with a pattern with (capture) 3024 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 3025 pat1 = RegexPattern::compile(&re1, pe, status); 3026 REGEX_CHECK_STATUS; 3027 3028 status = U_ZERO_ERROR; 3029 fields[6] = fields[7] = "*"; 3030 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 3031 REGEX_CHECK_STATUS; 3032 REGEX_ASSERT(n==7); 3033 REGEX_ASSERT(fields[0]==""); 3034 REGEX_ASSERT(fields[1]=="a"); 3035 REGEX_ASSERT(fields[2]=="Now is "); 3036 REGEX_ASSERT(fields[3]=="b"); 3037 REGEX_ASSERT(fields[4]=="the time"); 3038 REGEX_ASSERT(fields[5]=="c"); 3039 REGEX_ASSERT(fields[6]==""); 3040 REGEX_ASSERT(fields[7]=="*"); 3041 REGEX_ASSERT(status==U_ZERO_ERROR); 3042 3043 fields[6] = fields[7] = "*"; 3044 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 3045 REGEX_CHECK_STATUS; 3046 REGEX_ASSERT(n==7); 3047 REGEX_ASSERT(fields[0]==" "); 3048 REGEX_ASSERT(fields[1]=="a"); 3049 REGEX_ASSERT(fields[2]=="Now is "); 3050 REGEX_ASSERT(fields[3]=="b"); 3051 REGEX_ASSERT(fields[4]=="the time"); 3052 REGEX_ASSERT(fields[5]=="c"); 3053 REGEX_ASSERT(fields[6]==""); 3054 REGEX_ASSERT(fields[7]=="*"); 3055 3056 status = U_ZERO_ERROR; 3057 fields[6] = "foo"; 3058 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 3059 REGEX_CHECK_STATUS; 3060 REGEX_ASSERT(n==6); 3061 REGEX_ASSERT(fields[0]==" "); 3062 REGEX_ASSERT(fields[1]=="a"); 3063 REGEX_ASSERT(fields[2]=="Now is "); 3064 REGEX_ASSERT(fields[3]=="b"); 3065 REGEX_ASSERT(fields[4]=="the time"); 3066 REGEX_ASSERT(fields[5]==" "); 3067 REGEX_ASSERT(fields[6]=="foo"); 3068 3069 status = U_ZERO_ERROR; 3070 fields[5] = "foo"; 3071 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3072 REGEX_CHECK_STATUS; 3073 REGEX_ASSERT(n==5); 3074 REGEX_ASSERT(fields[0]==" "); 3075 REGEX_ASSERT(fields[1]=="a"); 3076 REGEX_ASSERT(fields[2]=="Now is "); 3077 REGEX_ASSERT(fields[3]=="b"); 3078 REGEX_ASSERT(fields[4]=="the time<c>"); 3079 REGEX_ASSERT(fields[5]=="foo"); 3080 3081 status = U_ZERO_ERROR; 3082 fields[5] = "foo"; 3083 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3084 REGEX_CHECK_STATUS; 3085 REGEX_ASSERT(n==5); 3086 REGEX_ASSERT(fields[0]==" "); 3087 REGEX_ASSERT(fields[1]=="a"); 3088 REGEX_ASSERT(fields[2]=="Now is "); 3089 REGEX_ASSERT(fields[3]=="b"); 3090 REGEX_ASSERT(fields[4]=="the time"); 3091 REGEX_ASSERT(fields[5]=="foo"); 3092 3093 status = U_ZERO_ERROR; 3094 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3095 REGEX_CHECK_STATUS; 3096 REGEX_ASSERT(n==4); 3097 REGEX_ASSERT(fields[0]==" "); 3098 REGEX_ASSERT(fields[1]=="a"); 3099 REGEX_ASSERT(fields[2]=="Now is "); 3100 REGEX_ASSERT(fields[3]=="the time<c>"); 3101 status = U_ZERO_ERROR; 3102 delete pat1; 3103 3104 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3105 pat1 = RegexPattern::compile(&re1, pe, status); 3106 REGEX_CHECK_STATUS; 3107 n = pat1->split("1-10,20", fields, 10, status); 3108 REGEX_CHECK_STATUS; 3109 REGEX_ASSERT(n==5); 3110 REGEX_ASSERT(fields[0]=="1"); 3111 REGEX_ASSERT(fields[1]=="-"); 3112 REGEX_ASSERT(fields[2]=="10"); 3113 REGEX_ASSERT(fields[3]==","); 3114 REGEX_ASSERT(fields[4]=="20"); 3115 delete pat1; 3116 3117 3118 // 3119 // split of a UText based string, with library allocating output UTexts. 3120 // 3121 { 3122 status = U_ZERO_ERROR; 3123 RegexMatcher matcher(UnicodeString("(:)"), 0, status); 3124 UnicodeString stringToSplit("first:second:third"); 3125 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); 3126 REGEX_CHECK_STATUS; 3127 3128 UText *splits[10] = {NULL}; 3129 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); 3130 REGEX_CHECK_STATUS; 3131 REGEX_ASSERT(numFields == 5); 3132 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); 3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); 3134 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); 3135 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); 3136 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); 3137 REGEX_ASSERT(splits[5] == NULL); 3138 3139 for (int i=0; i<UPRV_LENGTHOF(splits); i++) { 3140 if (splits[i]) { 3141 utext_close(splits[i]); 3142 splits[i] = NULL; 3143 } 3144 } 3145 utext_close(textToSplit); 3146 } 3147 3148 3149 // 3150 // RegexPattern::pattern() and patternText() 3151 // 3152 pat1 = new RegexPattern(); 3153 REGEX_ASSERT(pat1->pattern() == ""); 3154 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3155 delete pat1; 3156 const char *helloWorldInvariant = "(Hello, world)*"; 3157 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3158 pat1 = RegexPattern::compile(&re1, pe, status); 3159 REGEX_CHECK_STATUS; 3160 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); 3161 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3162 delete pat1; 3163 3164 utext_close(&re1); 3165 } 3166 3167 3168 //--------------------------------------------------------------------------- 3169 // 3170 // Extended A more thorough check for features of regex patterns 3171 // The test cases are in a separate data file, 3172 // source/tests/testdata/regextst.txt 3173 // A description of the test data format is included in that file. 3174 // 3175 //--------------------------------------------------------------------------- 3176 3177 const char * 3178 RegexTest::getPath(char buffer[2048], const char *filename) { 3179 UErrorCode status=U_ZERO_ERROR; 3180 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3181 if (U_FAILURE(status)) { 3182 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3183 return NULL; 3184 } 3185 3186 strcpy(buffer, testDataDirectory); 3187 strcat(buffer, filename); 3188 return buffer; 3189 } 3190 3191 void RegexTest::Extended() { 3192 char tdd[2048]; 3193 const char *srcPath; 3194 UErrorCode status = U_ZERO_ERROR; 3195 int32_t lineNum = 0; 3196 3197 // 3198 // Open and read the test data file. 3199 // 3200 srcPath=getPath(tdd, "regextst.txt"); 3201 if(srcPath==NULL) { 3202 return; /* something went wrong, error already output */ 3203 } 3204 3205 int32_t len; 3206 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3207 if (U_FAILURE(status)) { 3208 return; /* something went wrong, error already output */ 3209 } 3210 3211 // 3212 // Put the test data into a UnicodeString 3213 // 3214 UnicodeString testString(FALSE, testData, len); 3215 3216 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3217 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3218 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3219 3220 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3221 UnicodeString testPattern; // The pattern for test from the test file. 3222 UnicodeString testFlags; // the flags for a test. 3223 UnicodeString matchString; // The marked up string to be used as input 3224 3225 if (U_FAILURE(status)){ 3226 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); 3227 delete [] testData; 3228 return; 3229 } 3230 3231 // 3232 // Loop over the test data file, once per line. 3233 // 3234 while (lineMat.find()) { 3235 lineNum++; 3236 if (U_FAILURE(status)) { 3237 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3238 } 3239 3240 status = U_ZERO_ERROR; 3241 UnicodeString testLine = lineMat.group(1, status); 3242 if (testLine.length() == 0) { 3243 continue; 3244 } 3245 3246 // 3247 // Parse the test line. Skip blank and comment only lines. 3248 // Separate out the three main fields - pattern, flags, target. 3249 // 3250 3251 commentMat.reset(testLine); 3252 if (commentMat.lookingAt(status)) { 3253 // This line is a comment, or blank. 3254 continue; 3255 } 3256 3257 // 3258 // Pull out the pattern field, remove it from the test file line. 3259 // 3260 quotedStuffMat.reset(testLine); 3261 if (quotedStuffMat.lookingAt(status)) { 3262 testPattern = quotedStuffMat.group(2, status); 3263 testLine.remove(0, quotedStuffMat.end(0, status)); 3264 } else { 3265 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3266 continue; 3267 } 3268 3269 3270 // 3271 // Pull out the flags from the test file line. 3272 // 3273 flagsMat.reset(testLine); 3274 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3275 testFlags = flagsMat.group(1, status); 3276 if (flagsMat.group(2, status).length() > 0) { 3277 errln("Bad Match flag at line %d. Scanning %c\n", 3278 lineNum, flagsMat.group(2, status).charAt(0)); 3279 continue; 3280 } 3281 testLine.remove(0, flagsMat.end(0, status)); 3282 3283 // 3284 // Pull out the match string, as a whole. 3285 // We'll process the <tags> later. 3286 // 3287 quotedStuffMat.reset(testLine); 3288 if (quotedStuffMat.lookingAt(status)) { 3289 matchString = quotedStuffMat.group(2, status); 3290 testLine.remove(0, quotedStuffMat.end(0, status)); 3291 } else { 3292 errln("Bad match string at test file line %d", lineNum); 3293 continue; 3294 } 3295 3296 // 3297 // The only thing left from the input line should be an optional trailing comment. 3298 // 3299 commentMat.reset(testLine); 3300 if (commentMat.lookingAt(status) == FALSE) { 3301 errln("Line %d: unexpected characters at end of test line.", lineNum); 3302 continue; 3303 } 3304 3305 // 3306 // Run the test 3307 // 3308 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3309 } 3310 3311 delete [] testData; 3312 3313 } 3314 3315 3316 3317 //--------------------------------------------------------------------------- 3318 // 3319 // regex_find(pattern, flags, inputString, lineNumber) 3320 // 3321 // Function to run a single test from the Extended (data driven) tests. 3322 // See file test/testdata/regextst.txt for a description of the 3323 // pattern and inputString fields, and the allowed flags. 3324 // lineNumber is the source line in regextst.txt of the test. 3325 // 3326 //--------------------------------------------------------------------------- 3327 3328 3329 // Set a value into a UVector at position specified by a decimal number in 3330 // a UnicodeString. This is a utility function needed by the actual test function, 3331 // which follows. 3332 static void set(UVector &vec, int32_t val, UnicodeString index) { 3333 UErrorCode status=U_ZERO_ERROR; 3334 int32_t idx = 0; 3335 for (int32_t i=0; i<index.length(); i++) { 3336 int32_t d=u_charDigitValue(index.charAt(i)); 3337 if (d<0) {return;} 3338 idx = idx*10 + d; 3339 } 3340 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3341 vec.setElementAt(val, idx); 3342 } 3343 3344 static void setInt(UVector &vec, int32_t val, int32_t idx) { 3345 UErrorCode status=U_ZERO_ERROR; 3346 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3347 vec.setElementAt(val, idx); 3348 } 3349 3350 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3351 { 3352 UBool couldFind = TRUE; 3353 UTEXT_SETNATIVEINDEX(utext, 0); 3354 int32_t i = 0; 3355 while (i < unistrOffset) { 3356 UChar32 c = UTEXT_NEXT32(utext); 3357 if (c != U_SENTINEL) { 3358 i += U16_LENGTH(c); 3359 } else { 3360 couldFind = FALSE; 3361 break; 3362 } 3363 } 3364 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3365 return couldFind; 3366 } 3367 3368 3369 void RegexTest::regex_find(const UnicodeString &pattern, 3370 const UnicodeString &flags, 3371 const UnicodeString &inputString, 3372 const char *srcPath, 3373 int32_t line) { 3374 UnicodeString unEscapedInput; 3375 UnicodeString deTaggedInput; 3376 3377 int32_t patternUTF8Length, inputUTF8Length; 3378 char *patternChars = NULL, *inputChars = NULL; 3379 UText patternText = UTEXT_INITIALIZER; 3380 UText inputText = UTEXT_INITIALIZER; 3381 UConverter *UTF8Converter = NULL; 3382 3383 UErrorCode status = U_ZERO_ERROR; 3384 UParseError pe; 3385 RegexPattern *parsePat = NULL; 3386 RegexMatcher *parseMatcher = NULL; 3387 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3388 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3389 UVector groupStarts(status); 3390 UVector groupEnds(status); 3391 UVector groupStartsUTF8(status); 3392 UVector groupEndsUTF8(status); 3393 UBool isMatch = FALSE, isUTF8Match = FALSE; 3394 UBool failed = FALSE; 3395 int32_t numFinds; 3396 int32_t i; 3397 UBool useMatchesFunc = FALSE; 3398 UBool useLookingAtFunc = FALSE; 3399 int32_t regionStart = -1; 3400 int32_t regionEnd = -1; 3401 int32_t regionStartUTF8 = -1; 3402 int32_t regionEndUTF8 = -1; 3403 3404 3405 // 3406 // Compile the caller's pattern 3407 // 3408 uint32_t bflags = 0; 3409 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3410 bflags |= UREGEX_CASE_INSENSITIVE; 3411 } 3412 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3413 bflags |= UREGEX_COMMENTS; 3414 } 3415 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3416 bflags |= UREGEX_DOTALL; 3417 } 3418 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3419 bflags |= UREGEX_MULTILINE; 3420 } 3421 3422 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3423 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3424 } 3425 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3426 bflags |= UREGEX_UNIX_LINES; 3427 } 3428 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag 3429 bflags |= UREGEX_LITERAL; 3430 } 3431 3432 3433 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3434 if (status != U_ZERO_ERROR) { 3435 #if UCONFIG_NO_BREAK_ITERATION==1 3436 // 'v' test flag means that the test pattern should not compile if ICU was configured 3437 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3438 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3439 goto cleanupAndReturn; 3440 } 3441 #endif 3442 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3443 // Expected pattern compilation error. 3444 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3445 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3446 } 3447 goto cleanupAndReturn; 3448 } else { 3449 // Unexpected pattern compilation error. 3450 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3451 goto cleanupAndReturn; 3452 } 3453 } 3454 3455 UTF8Converter = ucnv_open("UTF8", &status); 3456 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3457 3458 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3459 status = U_ZERO_ERROR; // buffer overflow 3460 patternChars = new char[patternUTF8Length+1]; 3461 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3462 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3463 3464 if (status == U_ZERO_ERROR) { 3465 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3466 3467 if (status != U_ZERO_ERROR) { 3468 #if UCONFIG_NO_BREAK_ITERATION==1 3469 // 'v' test flag means that the test pattern should not compile if ICU was configured 3470 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3471 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3472 goto cleanupAndReturn; 3473 } 3474 #endif 3475 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3476 // Expected pattern compilation error. 3477 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3478 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3479 } 3480 goto cleanupAndReturn; 3481 } else { 3482 // Unexpected pattern compilation error. 3483 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status)); 3484 goto cleanupAndReturn; 3485 } 3486 } 3487 } 3488 3489 if (UTF8Pattern == NULL) { 3490 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3491 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3492 status = U_ZERO_ERROR; 3493 } 3494 3495 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3496 callerPattern->dumpPattern(); 3497 } 3498 3499 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3500 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3501 goto cleanupAndReturn; 3502 } 3503 3504 3505 // 3506 // Number of times find() should be called on the test string, default to 1 3507 // 3508 numFinds = 1; 3509 for (i=2; i<=9; i++) { 3510 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3511 if (numFinds != 1) { 3512 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3513 goto cleanupAndReturn; 3514 } 3515 numFinds = i; 3516 } 3517 } 3518 3519 // 'M' flag. Use matches() instead of find() 3520 if (flags.indexOf((UChar)0x4d) >= 0) { 3521 useMatchesFunc = TRUE; 3522 } 3523 if (flags.indexOf((UChar)0x4c) >= 0) { 3524 useLookingAtFunc = TRUE; 3525 } 3526 3527 // 3528 // Find the tags in the input data, remove them, and record the group boundary 3529 // positions. 3530 // 3531 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3532 REGEX_CHECK_STATUS_L(line); 3533 3534 unEscapedInput = inputString.unescape(); 3535 parseMatcher = parsePat->matcher(unEscapedInput, status); 3536 REGEX_CHECK_STATUS_L(line); 3537 while(parseMatcher->find()) { 3538 parseMatcher->appendReplacement(deTaggedInput, "", status); 3539 REGEX_CHECK_STATUS; 3540 UnicodeString groupNum = parseMatcher->group(2, status); 3541 if (groupNum == "r") { 3542 // <r> or </r>, a region specification within the string 3543 if (parseMatcher->group(1, status) == "/") { 3544 regionEnd = deTaggedInput.length(); 3545 } else { 3546 regionStart = deTaggedInput.length(); 3547 } 3548 } else { 3549 // <digits> or </digits>, a group match boundary tag. 3550 if (parseMatcher->group(1, status) == "/") { 3551 set(groupEnds, deTaggedInput.length(), groupNum); 3552 } else { 3553 set(groupStarts, deTaggedInput.length(), groupNum); 3554 } 3555 } 3556 } 3557 parseMatcher->appendTail(deTaggedInput); 3558 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3559 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3560 errln("mismatched <r> tags"); 3561 failed = TRUE; 3562 goto cleanupAndReturn; 3563 } 3564 3565 // 3566 // Configure the matcher according to the flags specified with this test. 3567 // 3568 matcher = callerPattern->matcher(deTaggedInput, status); 3569 REGEX_CHECK_STATUS_L(line); 3570 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3571 matcher->setTrace(TRUE); 3572 } 3573 3574 if (UTF8Pattern != NULL) { 3575 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3576 status = U_ZERO_ERROR; // buffer overflow 3577 inputChars = new char[inputUTF8Length+1]; 3578 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3579 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3580 3581 if (status == U_ZERO_ERROR) { 3582 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3583 REGEX_CHECK_STATUS_L(line); 3584 } 3585 3586 if (UTF8Matcher == NULL) { 3587 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3588 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3589 status = U_ZERO_ERROR; 3590 } 3591 } 3592 3593 // 3594 // Generate native indices for UTF8 versions of region and capture group info 3595 // 3596 if (UTF8Matcher != NULL) { 3597 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3598 UTF8Matcher->setTrace(TRUE); 3599 } 3600 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3601 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3602 3603 // Fill out the native index UVector info. 3604 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3605 for (i=0; i<groupStarts.size(); i++) { 3606 int32_t start = groupStarts.elementAti(i); 3607 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3608 if (start >= 0) { 3609 int32_t startUTF8; 3610 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3611 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start); 3612 failed = TRUE; 3613 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3614 } 3615 setInt(groupStartsUTF8, startUTF8, i); 3616 } 3617 3618 int32_t end = groupEnds.elementAti(i); 3619 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3620 if (end >= 0) { 3621 int32_t endUTF8; 3622 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3623 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3624 failed = TRUE; 3625 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3626 } 3627 setInt(groupEndsUTF8, endUTF8, i); 3628 } 3629 } 3630 } 3631 3632 if (regionStart>=0) { 3633 matcher->region(regionStart, regionEnd, status); 3634 REGEX_CHECK_STATUS_L(line); 3635 if (UTF8Matcher != NULL) { 3636 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3637 REGEX_CHECK_STATUS_L(line); 3638 } 3639 } 3640 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3641 matcher->useAnchoringBounds(FALSE); 3642 if (UTF8Matcher != NULL) { 3643 UTF8Matcher->useAnchoringBounds(FALSE); 3644 } 3645 } 3646 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3647 matcher->useTransparentBounds(TRUE); 3648 if (UTF8Matcher != NULL) { 3649 UTF8Matcher->useTransparentBounds(TRUE); 3650 } 3651 } 3652 3653 3654 3655 // 3656 // Do a find on the de-tagged input using the caller's pattern 3657 // TODO: error on count>1 and not find(). 3658 // error on both matches() and lookingAt(). 3659 // 3660 for (i=0; i<numFinds; i++) { 3661 if (useMatchesFunc) { 3662 isMatch = matcher->matches(status); 3663 if (UTF8Matcher != NULL) { 3664 isUTF8Match = UTF8Matcher->matches(status); 3665 } 3666 } else if (useLookingAtFunc) { 3667 isMatch = matcher->lookingAt(status); 3668 if (UTF8Matcher != NULL) { 3669 isUTF8Match = UTF8Matcher->lookingAt(status); 3670 } 3671 } else { 3672 isMatch = matcher->find(); 3673 if (UTF8Matcher != NULL) { 3674 isUTF8Match = UTF8Matcher->find(); 3675 } 3676 } 3677 } 3678 matcher->setTrace(FALSE); 3679 if (UTF8Matcher) { 3680 UTF8Matcher->setTrace(FALSE); 3681 } 3682 if (U_FAILURE(status)) { 3683 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); 3684 } 3685 3686 // 3687 // Match up the groups from the find() with the groups from the tags 3688 // 3689 3690 // number of tags should match number of groups from find operation. 3691 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3692 // G option in test means that capture group data is not available in the 3693 // expected results, so the check needs to be suppressed. 3694 if (isMatch == FALSE && groupStarts.size() != 0) { 3695 dataerrln("Error at line %d: Match expected, but none found.", line); 3696 failed = TRUE; 3697 goto cleanupAndReturn; 3698 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3699 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3700 failed = TRUE; 3701 goto cleanupAndReturn; 3702 } 3703 if (isMatch && groupStarts.size() == 0) { 3704 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status)); 3705 failed = TRUE; 3706 } 3707 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) { 3708 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status)); 3709 failed = TRUE; 3710 } 3711 3712 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3713 // Only check for match / no match. Don't check capture groups. 3714 goto cleanupAndReturn; 3715 } 3716 3717 REGEX_CHECK_STATUS_L(line); 3718 for (i=0; i<=matcher->groupCount(); i++) { 3719 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 3720 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3721 if (matcher->start(i, status) != expectedStart) { 3722 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3723 line, i, expectedStart, matcher->start(i, status)); 3724 failed = TRUE; 3725 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3726 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3727 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3728 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3729 failed = TRUE; 3730 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3731 } 3732 3733 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3734 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3735 if (matcher->end(i, status) != expectedEnd) { 3736 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3737 line, i, expectedEnd, matcher->end(i, status)); 3738 failed = TRUE; 3739 // Error on end position; keep going; real error is probably yet to come as group 3740 // end positions work from end of the input data towards the front. 3741 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3742 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3743 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3744 failed = TRUE; 3745 // Error on end position; keep going; real error is probably yet to come as group 3746 // end positions work from end of the input data towards the front. 3747 } 3748 } 3749 if ( matcher->groupCount()+1 < groupStarts.size()) { 3750 errln("Error at line %d: Expected %d capture groups, found %d.", 3751 line, groupStarts.size()-1, matcher->groupCount()); 3752 failed = TRUE; 3753 } 3754 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3755 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3756 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3757 failed = TRUE; 3758 } 3759 3760 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3761 matcher->requireEnd() == TRUE) { 3762 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3763 failed = TRUE; 3764 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3765 UTF8Matcher->requireEnd() == TRUE) { 3766 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3767 failed = TRUE; 3768 } 3769 3770 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3771 matcher->requireEnd() == FALSE) { 3772 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3773 failed = TRUE; 3774 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3775 UTF8Matcher->requireEnd() == FALSE) { 3776 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3777 failed = TRUE; 3778 } 3779 3780 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3781 matcher->hitEnd() == TRUE) { 3782 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3783 failed = TRUE; 3784 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3785 UTF8Matcher->hitEnd() == TRUE) { 3786 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); 3787 failed = TRUE; 3788 } 3789 3790 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3791 matcher->hitEnd() == FALSE) { 3792 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3793 failed = TRUE; 3794 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3795 UTF8Matcher->hitEnd() == FALSE) { 3796 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3797 failed = TRUE; 3798 } 3799 3800 3801 cleanupAndReturn: 3802 if (failed) { 3803 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3804 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3805 // callerPattern->dump(); 3806 } 3807 delete parseMatcher; 3808 delete parsePat; 3809 delete UTF8Matcher; 3810 delete UTF8Pattern; 3811 delete matcher; 3812 delete callerPattern; 3813 3814 utext_close(&inputText); 3815 delete[] inputChars; 3816 utext_close(&patternText); 3817 delete[] patternChars; 3818 ucnv_close(UTF8Converter); 3819 } 3820 3821 3822 3823 3824 //--------------------------------------------------------------------------- 3825 // 3826 // Errors Check for error handling in patterns. 3827 // 3828 //--------------------------------------------------------------------------- 3829 void RegexTest::Errors() { 3830 // \escape sequences that aren't implemented yet. 3831 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3832 3833 // Missing close parentheses 3834 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3835 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3836 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3837 3838 // Extra close paren 3839 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3840 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3841 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3842 3843 // Look-ahead, Look-behind 3844 // TODO: add tests for unbounded length look-behinds. 3845 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3846 3847 // Attempt to use non-default flags 3848 { 3849 UParseError pe; 3850 UErrorCode status = U_ZERO_ERROR; 3851 int32_t flags = UREGEX_CANON_EQ | 3852 UREGEX_COMMENTS | UREGEX_DOTALL | 3853 UREGEX_MULTILINE; 3854 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3855 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3856 delete pat1; 3857 } 3858 3859 3860 // Quantifiers are allowed only after something that can be quantified. 3861 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3862 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3863 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3864 3865 // Mal-formed {min,max} quantifiers 3866 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3867 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3868 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3869 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3870 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3871 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3872 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3873 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3874 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3875 3876 // Ticket 5389 3877 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3878 3879 // Invalid Back Reference \0 3880 // For ICU 3.8 and earlier 3881 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3882 // 3883 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3884 3885 } 3886 3887 3888 //------------------------------------------------------------------------------- 3889 // 3890 // Read a text data file, convert it to UChars, and return the data 3891 // in one big UChar * buffer, which the caller must delete. 3892 // 3893 //-------------------------------------------------------------------------------- 3894 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3895 const char *defEncoding, UErrorCode &status) { 3896 UChar *retPtr = NULL; 3897 char *fileBuf = NULL; 3898 UConverter* conv = NULL; 3899 FILE *f = NULL; 3900 3901 ulen = 0; 3902 if (U_FAILURE(status)) { 3903 return retPtr; 3904 } 3905 3906 // 3907 // Open the file. 3908 // 3909 f = fopen(fileName, "rb"); 3910 if (f == 0) { 3911 dataerrln("Error opening test data file %s\n", fileName); 3912 status = U_FILE_ACCESS_ERROR; 3913 return NULL; 3914 } 3915 // 3916 // Read it in 3917 // 3918 int32_t fileSize; 3919 int32_t amt_read; 3920 3921 fseek( f, 0, SEEK_END); 3922 fileSize = ftell(f); 3923 fileBuf = new char[fileSize]; 3924 fseek(f, 0, SEEK_SET); 3925 amt_read = fread(fileBuf, 1, fileSize, f); 3926 if (amt_read != fileSize || fileSize <= 0) { 3927 errln("Error reading test data file."); 3928 goto cleanUpAndReturn; 3929 } 3930 3931 // 3932 // Look for a Unicode Signature (BOM) on the data just read 3933 // 3934 int32_t signatureLength; 3935 const char * fileBufC; 3936 const char* encoding; 3937 3938 fileBufC = fileBuf; 3939 encoding = ucnv_detectUnicodeSignature( 3940 fileBuf, fileSize, &signatureLength, &status); 3941 if(encoding!=NULL ){ 3942 fileBufC += signatureLength; 3943 fileSize -= signatureLength; 3944 } else { 3945 encoding = defEncoding; 3946 if (strcmp(encoding, "utf-8") == 0) { 3947 errln("file %s is missing its BOM", fileName); 3948 } 3949 } 3950 3951 // 3952 // Open a converter to take the rule file to UTF-16 3953 // 3954 conv = ucnv_open(encoding, &status); 3955 if (U_FAILURE(status)) { 3956 goto cleanUpAndReturn; 3957 } 3958 3959 // 3960 // Convert the rules to UChar. 3961 // Preflight first to determine required buffer size. 3962 // 3963 ulen = ucnv_toUChars(conv, 3964 NULL, // dest, 3965 0, // destCapacity, 3966 fileBufC, 3967 fileSize, 3968 &status); 3969 if (status == U_BUFFER_OVERFLOW_ERROR) { 3970 // Buffer Overflow is expected from the preflight operation. 3971 status = U_ZERO_ERROR; 3972 3973 retPtr = new UChar[ulen+1]; 3974 ucnv_toUChars(conv, 3975 retPtr, // dest, 3976 ulen+1, 3977 fileBufC, 3978 fileSize, 3979 &status); 3980 } 3981 3982 cleanUpAndReturn: 3983 fclose(f); 3984 delete[] fileBuf; 3985 ucnv_close(conv); 3986 if (U_FAILURE(status)) { 3987 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3988 delete []retPtr; 3989 retPtr = 0; 3990 ulen = 0; 3991 }; 3992 return retPtr; 3993 } 3994 3995 3996 //------------------------------------------------------------------------------- 3997 // 3998 // PerlTests - Run Perl's regular expression tests 3999 // The input file for this test is re_tests, the standard regular 4000 // expression test data distributed with the Perl source code. 4001 // 4002 // Here is Perl's description of the test data file: 4003 // 4004 // # The tests are in a separate file 't/op/re_tests'. 4005 // # Each line in that file is a separate test. 4006 // # There are five columns, separated by tabs. 4007 // # 4008 // # Column 1 contains the pattern, optionally enclosed in C<''>. 4009 // # Modifiers can be put after the closing C<'>. 4010 // # 4011 // # Column 2 contains the string to be matched. 4012 // # 4013 // # Column 3 contains the expected result: 4014 // # y expect a match 4015 // # n expect no match 4016 // # c expect an error 4017 // # B test exposes a known bug in Perl, should be skipped 4018 // # b test exposes a known bug in Perl, should be skipped if noamp 4019 // # 4020 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 4021 // # 4022 // # Column 4 contains a string, usually C<$&>. 4023 // # 4024 // # Column 5 contains the expected result of double-quote 4025 // # interpolating that string after the match, or start of error message. 4026 // # 4027 // # Column 6, if present, contains a reason why the test is skipped. 4028 // # This is printed with "skipped", for harness to pick up. 4029 // # 4030 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 4031 // # 4032 // # If you want to add a regular expression test that can't be expressed 4033 // # in this format, don't add it here: put it in op/pat.t instead. 4034 // 4035 // For ICU, if field 3 contains an 'i', the test will be skipped. 4036 // The test exposes is some known incompatibility between ICU and Perl regexps. 4037 // (The i is in addition to whatever was there before.) 4038 // 4039 //------------------------------------------------------------------------------- 4040 void RegexTest::PerlTests() { 4041 char tdd[2048]; 4042 const char *srcPath; 4043 UErrorCode status = U_ZERO_ERROR; 4044 UParseError pe; 4045 4046 // 4047 // Open and read the test data file. 4048 // 4049 srcPath=getPath(tdd, "re_tests.txt"); 4050 if(srcPath==NULL) { 4051 return; /* something went wrong, error already output */ 4052 } 4053 4054 int32_t len; 4055 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4056 if (U_FAILURE(status)) { 4057 return; /* something went wrong, error already output */ 4058 } 4059 4060 // 4061 // Put the test data into a UnicodeString 4062 // 4063 UnicodeString testDataString(FALSE, testData, len); 4064 4065 // 4066 // Regex to break the input file into lines, and strip the new lines. 4067 // One line per match, capture group one is the desired data. 4068 // 4069 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4070 if (U_FAILURE(status)) { 4071 dataerrln("RegexPattern::compile() error"); 4072 return; 4073 } 4074 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4075 4076 // 4077 // Regex to split a test file line into fields. 4078 // There are six fields, separated by tabs. 4079 // 4080 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4081 4082 // 4083 // Regex to identify test patterns with flag settings, and to separate them. 4084 // Test patterns with flags look like 'pattern'i 4085 // Test patterns without flags are not quoted: pattern 4086 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4087 // 4088 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4089 RegexMatcher* flagMat = flagPat->matcher(status); 4090 4091 // 4092 // The Perl tests reference several perl-isms, which are evaluated/substituted 4093 // in the test data. Not being perl, this must be done explicitly. Here 4094 // are string constants and REs for these constructs. 4095 // 4096 UnicodeString nulnulSrc("${nulnul}"); 4097 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4098 nulnul = nulnul.unescape(); 4099 4100 UnicodeString ffffSrc("${ffff}"); 4101 UnicodeString ffff("\\uffff", -1, US_INV); 4102 ffff = ffff.unescape(); 4103 4104 // regexp for $-[0], $+[2], etc. 4105 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4106 RegexMatcher *groupsMat = groupsPat->matcher(status); 4107 4108 // regexp for $0, $1, $2, etc. 4109 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4110 RegexMatcher *cgMat = cgPat->matcher(status); 4111 4112 4113 // 4114 // Main Loop for the Perl Tests, runs once per line from the 4115 // test data file. 4116 // 4117 int32_t lineNum = 0; 4118 int32_t skippedUnimplementedCount = 0; 4119 while (lineMat->find()) { 4120 lineNum++; 4121 4122 // 4123 // Get a line, break it into its fields, do the Perl 4124 // variable substitutions. 4125 // 4126 UnicodeString line = lineMat->group(1, status); 4127 UnicodeString fields[7]; 4128 fieldPat->split(line, fields, 7, status); 4129 4130 flagMat->reset(fields[0]); 4131 flagMat->matches(status); 4132 UnicodeString pattern = flagMat->group(2, status); 4133 pattern.findAndReplace("${bang}", "!"); 4134 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4135 pattern.findAndReplace(ffffSrc, ffff); 4136 4137 // 4138 // Identify patterns that include match flag settings, 4139 // split off the flags, remove the extra quotes. 4140 // 4141 UnicodeString flagStr = flagMat->group(3, status); 4142 if (U_FAILURE(status)) { 4143 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4144 return; 4145 } 4146 int32_t flags = 0; 4147 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4148 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4149 const UChar UChar_m = 0x6d; 4150 const UChar UChar_x = 0x78; 4151 const UChar UChar_y = 0x79; 4152 if (flagStr.indexOf(UChar_i) != -1) { 4153 flags |= UREGEX_CASE_INSENSITIVE; 4154 } 4155 if (flagStr.indexOf(UChar_m) != -1) { 4156 flags |= UREGEX_MULTILINE; 4157 } 4158 if (flagStr.indexOf(UChar_x) != -1) { 4159 flags |= UREGEX_COMMENTS; 4160 } 4161 4162 // 4163 // Compile the test pattern. 4164 // 4165 status = U_ZERO_ERROR; 4166 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4167 if (status == U_REGEX_UNIMPLEMENTED) { 4168 // 4169 // Test of a feature that is planned for ICU, but not yet implemented. 4170 // skip the test. 4171 skippedUnimplementedCount++; 4172 delete testPat; 4173 status = U_ZERO_ERROR; 4174 continue; 4175 } 4176 4177 if (U_FAILURE(status)) { 4178 // Some tests are supposed to generate errors. 4179 // Only report an error for tests that are supposed to succeed. 4180 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4181 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4182 { 4183 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4184 } 4185 status = U_ZERO_ERROR; 4186 delete testPat; 4187 continue; 4188 } 4189 4190 if (fields[2].indexOf(UChar_i) >= 0) { 4191 // ICU should skip this test. 4192 delete testPat; 4193 continue; 4194 } 4195 4196 if (fields[2].indexOf(UChar_c) >= 0) { 4197 // This pattern should have caused a compilation error, but didn't/ 4198 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4199 delete testPat; 4200 continue; 4201 } 4202 4203 // 4204 // replace the Perl variables that appear in some of the 4205 // match data strings. 4206 // 4207 UnicodeString matchString = fields[1]; 4208 matchString.findAndReplace(nulnulSrc, nulnul); 4209 matchString.findAndReplace(ffffSrc, ffff); 4210 4211 // Replace any \n in the match string with an actual new-line char. 4212 // Don't do full unescape, as this unescapes more than Perl does, which 4213 // causes other spurious failures in the tests. 4214 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4215 4216 4217 4218 // 4219 // Run the test, check for expected match/don't match result. 4220 // 4221 RegexMatcher *testMat = testPat->matcher(matchString, status); 4222 UBool found = testMat->find(); 4223 UBool expected = FALSE; 4224 if (fields[2].indexOf(UChar_y) >=0) { 4225 expected = TRUE; 4226 } 4227 if (expected != found) { 4228 errln("line %d: Expected %smatch, got %smatch", 4229 lineNum, expected?"":"no ", found?"":"no " ); 4230 continue; 4231 } 4232 4233 // Don't try to check expected results if there is no match. 4234 // (Some have stuff in the expected fields) 4235 if (!found) { 4236 delete testMat; 4237 delete testPat; 4238 continue; 4239 } 4240 4241 // 4242 // Interpret the Perl expression from the fourth field of the data file, 4243 // building up an ICU string from the results of the ICU match. 4244 // The Perl expression will contain references to the results of 4245 // a regex match, including the matched string, capture group strings, 4246 // group starting and ending indicies, etc. 4247 // 4248 UnicodeString resultString; 4249 UnicodeString perlExpr = fields[3]; 4250 #if SUPPORT_MUTATING_INPUT_STRING 4251 groupsMat->reset(perlExpr); 4252 cgMat->reset(perlExpr); 4253 #endif 4254 4255 while (perlExpr.length() > 0) { 4256 #if !SUPPORT_MUTATING_INPUT_STRING 4257 // Perferred usage. Reset after any modification to input string. 4258 groupsMat->reset(perlExpr); 4259 cgMat->reset(perlExpr); 4260 #endif 4261 4262 if (perlExpr.startsWith("$&")) { 4263 resultString.append(testMat->group(status)); 4264 perlExpr.remove(0, 2); 4265 } 4266 4267 else if (groupsMat->lookingAt(status)) { 4268 // $-[0] $+[2] etc. 4269 UnicodeString digitString = groupsMat->group(2, status); 4270 int32_t t = 0; 4271 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4272 UnicodeString plusOrMinus = groupsMat->group(1, status); 4273 int32_t matchPosition; 4274 if (plusOrMinus.compare("+") == 0) { 4275 matchPosition = testMat->end(groupNum, status); 4276 } else { 4277 matchPosition = testMat->start(groupNum, status); 4278 } 4279 if (matchPosition != -1) { 4280 ICU_Utility::appendNumber(resultString, matchPosition); 4281 } 4282 perlExpr.remove(0, groupsMat->end(status)); 4283 } 4284 4285 else if (cgMat->lookingAt(status)) { 4286 // $1, $2, $3, etc. 4287 UnicodeString digitString = cgMat->group(1, status); 4288 int32_t t = 0; 4289 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4290 if (U_SUCCESS(status)) { 4291 resultString.append(testMat->group(groupNum, status)); 4292 status = U_ZERO_ERROR; 4293 } 4294 perlExpr.remove(0, cgMat->end(status)); 4295 } 4296 4297 else if (perlExpr.startsWith("@-")) { 4298 int32_t i; 4299 for (i=0; i<=testMat->groupCount(); i++) { 4300 if (i>0) { 4301 resultString.append(" "); 4302 } 4303 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4304 } 4305 perlExpr.remove(0, 2); 4306 } 4307 4308 else if (perlExpr.startsWith("@+")) { 4309 int32_t i; 4310 for (i=0; i<=testMat->groupCount(); i++) { 4311 if (i>0) { 4312 resultString.append(" "); 4313 } 4314 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4315 } 4316 perlExpr.remove(0, 2); 4317 } 4318 4319 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4320 // or as an escaped sequence (e.g. \n) 4321 if (perlExpr.length() > 1) { 4322 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4323 } 4324 UChar c = perlExpr.charAt(0); 4325 switch (c) { 4326 case 'n': c = '\n'; break; 4327 // add any other escape sequences that show up in the test expected results. 4328 } 4329 resultString.append(c); 4330 perlExpr.remove(0, 1); 4331 } 4332 4333 else { 4334 // Any characters from the perl expression that we don't explicitly 4335 // recognize before here are assumed to be literals and copied 4336 // as-is to the expected results. 4337 resultString.append(perlExpr.charAt(0)); 4338 perlExpr.remove(0, 1); 4339 } 4340 4341 if (U_FAILURE(status)) { 4342 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4343 break; 4344 } 4345 } 4346 4347 // 4348 // Expected Results Compare 4349 // 4350 UnicodeString expectedS(fields[4]); 4351 expectedS.findAndReplace(nulnulSrc, nulnul); 4352 expectedS.findAndReplace(ffffSrc, ffff); 4353 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4354 4355 4356 if (expectedS.compare(resultString) != 0) { 4357 err("Line %d: Incorrect perl expression results.", lineNum); 4358 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4359 } 4360 4361 delete testMat; 4362 delete testPat; 4363 } 4364 4365 // 4366 // All done. Clean up allocated stuff. 4367 // 4368 delete cgMat; 4369 delete cgPat; 4370 4371 delete groupsMat; 4372 delete groupsPat; 4373 4374 delete flagMat; 4375 delete flagPat; 4376 4377 delete lineMat; 4378 delete linePat; 4379 4380 delete fieldPat; 4381 delete [] testData; 4382 4383 4384 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4385 4386 } 4387 4388 4389 //------------------------------------------------------------------------------- 4390 // 4391 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4392 // (instead of using UnicodeStrings) to test the alternate engine. 4393 // The input file for this test is re_tests, the standard regular 4394 // expression test data distributed with the Perl source code. 4395 // See PerlTests() for more information. 4396 // 4397 //------------------------------------------------------------------------------- 4398 void RegexTest::PerlTestsUTF8() { 4399 char tdd[2048]; 4400 const char *srcPath; 4401 UErrorCode status = U_ZERO_ERROR; 4402 UParseError pe; 4403 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4404 UText patternText = UTEXT_INITIALIZER; 4405 char *patternChars = NULL; 4406 int32_t patternLength; 4407 int32_t patternCapacity = 0; 4408 UText inputText = UTEXT_INITIALIZER; 4409 char *inputChars = NULL; 4410 int32_t inputLength; 4411 int32_t inputCapacity = 0; 4412 4413 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4414 4415 // 4416 // Open and read the test data file. 4417 // 4418 srcPath=getPath(tdd, "re_tests.txt"); 4419 if(srcPath==NULL) { 4420 return; /* something went wrong, error already output */ 4421 } 4422 4423 int32_t len; 4424 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4425 if (U_FAILURE(status)) { 4426 return; /* something went wrong, error already output */ 4427 } 4428 4429 // 4430 // Put the test data into a UnicodeString 4431 // 4432 UnicodeString testDataString(FALSE, testData, len); 4433 4434 // 4435 // Regex to break the input file into lines, and strip the new lines. 4436 // One line per match, capture group one is the desired data. 4437 // 4438 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4439 if (U_FAILURE(status)) { 4440 dataerrln("RegexPattern::compile() error"); 4441 return; 4442 } 4443 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4444 4445 // 4446 // Regex to split a test file line into fields. 4447 // There are six fields, separated by tabs. 4448 // 4449 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4450 4451 // 4452 // Regex to identify test patterns with flag settings, and to separate them. 4453 // Test patterns with flags look like 'pattern'i 4454 // Test patterns without flags are not quoted: pattern 4455 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4456 // 4457 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4458 RegexMatcher* flagMat = flagPat->matcher(status); 4459 4460 // 4461 // The Perl tests reference several perl-isms, which are evaluated/substituted 4462 // in the test data. Not being perl, this must be done explicitly. Here 4463 // are string constants and REs for these constructs. 4464 // 4465 UnicodeString nulnulSrc("${nulnul}"); 4466 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4467 nulnul = nulnul.unescape(); 4468 4469 UnicodeString ffffSrc("${ffff}"); 4470 UnicodeString ffff("\\uffff", -1, US_INV); 4471 ffff = ffff.unescape(); 4472 4473 // regexp for $-[0], $+[2], etc. 4474 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4475 RegexMatcher *groupsMat = groupsPat->matcher(status); 4476 4477 // regexp for $0, $1, $2, etc. 4478 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4479 RegexMatcher *cgMat = cgPat->matcher(status); 4480 4481 4482 // 4483 // Main Loop for the Perl Tests, runs once per line from the 4484 // test data file. 4485 // 4486 int32_t lineNum = 0; 4487 int32_t skippedUnimplementedCount = 0; 4488 while (lineMat->find()) { 4489 lineNum++; 4490 4491 // 4492 // Get a line, break it into its fields, do the Perl 4493 // variable substitutions. 4494 // 4495 UnicodeString line = lineMat->group(1, status); 4496 UnicodeString fields[7]; 4497 fieldPat->split(line, fields, 7, status); 4498 4499 flagMat->reset(fields[0]); 4500 flagMat->matches(status); 4501 UnicodeString pattern = flagMat->group(2, status); 4502 pattern.findAndReplace("${bang}", "!"); 4503 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4504 pattern.findAndReplace(ffffSrc, ffff); 4505 4506 // 4507 // Identify patterns that include match flag settings, 4508 // split off the flags, remove the extra quotes. 4509 // 4510 UnicodeString flagStr = flagMat->group(3, status); 4511 if (U_FAILURE(status)) { 4512 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4513 return; 4514 } 4515 int32_t flags = 0; 4516 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4517 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4518 const UChar UChar_m = 0x6d; 4519 const UChar UChar_x = 0x78; 4520 const UChar UChar_y = 0x79; 4521 if (flagStr.indexOf(UChar_i) != -1) { 4522 flags |= UREGEX_CASE_INSENSITIVE; 4523 } 4524 if (flagStr.indexOf(UChar_m) != -1) { 4525 flags |= UREGEX_MULTILINE; 4526 } 4527 if (flagStr.indexOf(UChar_x) != -1) { 4528 flags |= UREGEX_COMMENTS; 4529 } 4530 4531 // 4532 // Put the pattern in a UTF-8 UText 4533 // 4534 status = U_ZERO_ERROR; 4535 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4536 if (status == U_BUFFER_OVERFLOW_ERROR) { 4537 status = U_ZERO_ERROR; 4538 delete[] patternChars; 4539 patternCapacity = patternLength + 1; 4540 patternChars = new char[patternCapacity]; 4541 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4542 } 4543 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4544 4545 // 4546 // Compile the test pattern. 4547 // 4548 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4549 if (status == U_REGEX_UNIMPLEMENTED) { 4550 // 4551 // Test of a feature that is planned for ICU, but not yet implemented. 4552 // skip the test. 4553 skippedUnimplementedCount++; 4554 delete testPat; 4555 status = U_ZERO_ERROR; 4556 continue; 4557 } 4558 4559 if (U_FAILURE(status)) { 4560 // Some tests are supposed to generate errors. 4561 // Only report an error for tests that are supposed to succeed. 4562 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4563 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4564 { 4565 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4566 } 4567 status = U_ZERO_ERROR; 4568 delete testPat; 4569 continue; 4570 } 4571 4572 if (fields[2].indexOf(UChar_i) >= 0) { 4573 // ICU should skip this test. 4574 delete testPat; 4575 continue; 4576 } 4577 4578 if (fields[2].indexOf(UChar_c) >= 0) { 4579 // This pattern should have caused a compilation error, but didn't/ 4580 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4581 delete testPat; 4582 continue; 4583 } 4584 4585 4586 // 4587 // replace the Perl variables that appear in some of the 4588 // match data strings. 4589 // 4590 UnicodeString matchString = fields[1]; 4591 matchString.findAndReplace(nulnulSrc, nulnul); 4592 matchString.findAndReplace(ffffSrc, ffff); 4593 4594 // Replace any \n in the match string with an actual new-line char. 4595 // Don't do full unescape, as this unescapes more than Perl does, which 4596 // causes other spurious failures in the tests. 4597 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4598 4599 // 4600 // Put the input in a UTF-8 UText 4601 // 4602 status = U_ZERO_ERROR; 4603 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4604 if (status == U_BUFFER_OVERFLOW_ERROR) { 4605 status = U_ZERO_ERROR; 4606 delete[] inputChars; 4607 inputCapacity = inputLength + 1; 4608 inputChars = new char[inputCapacity]; 4609 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4610 } 4611 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4612 4613 // 4614 // Run the test, check for expected match/don't match result. 4615 // 4616 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4617 UBool found = testMat->find(); 4618 UBool expected = FALSE; 4619 if (fields[2].indexOf(UChar_y) >=0) { 4620 expected = TRUE; 4621 } 4622 if (expected != found) { 4623 errln("line %d: Expected %smatch, got %smatch", 4624 lineNum, expected?"":"no ", found?"":"no " ); 4625 continue; 4626 } 4627 4628 // Don't try to check expected results if there is no match. 4629 // (Some have stuff in the expected fields) 4630 if (!found) { 4631 delete testMat; 4632 delete testPat; 4633 continue; 4634 } 4635 4636 // 4637 // Interpret the Perl expression from the fourth field of the data file, 4638 // building up an ICU string from the results of the ICU match. 4639 // The Perl expression will contain references to the results of 4640 // a regex match, including the matched string, capture group strings, 4641 // group starting and ending indicies, etc. 4642 // 4643 UnicodeString resultString; 4644 UnicodeString perlExpr = fields[3]; 4645 4646 while (perlExpr.length() > 0) { 4647 groupsMat->reset(perlExpr); 4648 cgMat->reset(perlExpr); 4649 4650 if (perlExpr.startsWith("$&")) { 4651 resultString.append(testMat->group(status)); 4652 perlExpr.remove(0, 2); 4653 } 4654 4655 else if (groupsMat->lookingAt(status)) { 4656 // $-[0] $+[2] etc. 4657 UnicodeString digitString = groupsMat->group(2, status); 4658 int32_t t = 0; 4659 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4660 UnicodeString plusOrMinus = groupsMat->group(1, status); 4661 int32_t matchPosition; 4662 if (plusOrMinus.compare("+") == 0) { 4663 matchPosition = testMat->end(groupNum, status); 4664 } else { 4665 matchPosition = testMat->start(groupNum, status); 4666 } 4667 if (matchPosition != -1) { 4668 ICU_Utility::appendNumber(resultString, matchPosition); 4669 } 4670 perlExpr.remove(0, groupsMat->end(status)); 4671 } 4672 4673 else if (cgMat->lookingAt(status)) { 4674 // $1, $2, $3, etc. 4675 UnicodeString digitString = cgMat->group(1, status); 4676 int32_t t = 0; 4677 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4678 if (U_SUCCESS(status)) { 4679 resultString.append(testMat->group(groupNum, status)); 4680 status = U_ZERO_ERROR; 4681 } 4682 perlExpr.remove(0, cgMat->end(status)); 4683 } 4684 4685 else if (perlExpr.startsWith("@-")) { 4686 int32_t i; 4687 for (i=0; i<=testMat->groupCount(); i++) { 4688 if (i>0) { 4689 resultString.append(" "); 4690 } 4691 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4692 } 4693 perlExpr.remove(0, 2); 4694 } 4695 4696 else if (perlExpr.startsWith("@+")) { 4697 int32_t i; 4698 for (i=0; i<=testMat->groupCount(); i++) { 4699 if (i>0) { 4700 resultString.append(" "); 4701 } 4702 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4703 } 4704 perlExpr.remove(0, 2); 4705 } 4706 4707 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4708 // or as an escaped sequence (e.g. \n) 4709 if (perlExpr.length() > 1) { 4710 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4711 } 4712 UChar c = perlExpr.charAt(0); 4713 switch (c) { 4714 case 'n': c = '\n'; break; 4715 // add any other escape sequences that show up in the test expected results. 4716 } 4717 resultString.append(c); 4718 perlExpr.remove(0, 1); 4719 } 4720 4721 else { 4722 // Any characters from the perl expression that we don't explicitly 4723 // recognize before here are assumed to be literals and copied 4724 // as-is to the expected results. 4725 resultString.append(perlExpr.charAt(0)); 4726 perlExpr.remove(0, 1); 4727 } 4728 4729 if (U_FAILURE(status)) { 4730 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4731 break; 4732 } 4733 } 4734 4735 // 4736 // Expected Results Compare 4737 // 4738 UnicodeString expectedS(fields[4]); 4739 expectedS.findAndReplace(nulnulSrc, nulnul); 4740 expectedS.findAndReplace(ffffSrc, ffff); 4741 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4742 4743 4744 if (expectedS.compare(resultString) != 0) { 4745 err("Line %d: Incorrect perl expression results.", lineNum); 4746 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4747 } 4748 4749 delete testMat; 4750 delete testPat; 4751 } 4752 4753 // 4754 // All done. Clean up allocated stuff. 4755 // 4756 delete cgMat; 4757 delete cgPat; 4758 4759 delete groupsMat; 4760 delete groupsPat; 4761 4762 delete flagMat; 4763 delete flagPat; 4764 4765 delete lineMat; 4766 delete linePat; 4767 4768 delete fieldPat; 4769 delete [] testData; 4770 4771 utext_close(&patternText); 4772 utext_close(&inputText); 4773 4774 delete [] patternChars; 4775 delete [] inputChars; 4776 4777 4778 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4779 4780 } 4781 4782 4783 //-------------------------------------------------------------- 4784 // 4785 // Bug6149 Verify limits to heap expansion for backtrack stack. 4786 // Use this pattern, 4787 // "(a?){1,8000000}" 4788 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled. 4789 // This test is likely to be fragile, as further optimizations stop 4790 // more cases of pointless looping in the match engine. 4791 // 4792 //--------------------------------------------------------------- 4793 void RegexTest::Bug6149() { 4794 UnicodeString pattern("(a?){1,8000000}"); 4795 UnicodeString s("xyz"); 4796 uint32_t flags = 0; 4797 UErrorCode status = U_ZERO_ERROR; 4798 4799 RegexMatcher matcher(pattern, s, flags, status); 4800 UBool result = false; 4801 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4802 REGEX_ASSERT(result == FALSE); 4803 } 4804 4805 4806 // 4807 // Callbacks() Test the callback function. 4808 // When set, callbacks occur periodically during matching operations, 4809 // giving the application code the ability to abort the operation 4810 // before it's normal completion. 4811 // 4812 4813 struct callBackContext { 4814 RegexTest *test; 4815 int32_t maxCalls; 4816 int32_t numCalls; 4817 int32_t lastSteps; 4818 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4819 }; 4820 4821 U_CDECL_BEGIN 4822 static UBool U_CALLCONV 4823 testCallBackFn(const void *context, int32_t steps) { 4824 callBackContext *info = (callBackContext *)context; 4825 if (info->lastSteps+1 != steps) { 4826 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 4827 } 4828 info->lastSteps = steps; 4829 info->numCalls++; 4830 return (info->numCalls < info->maxCalls); 4831 } 4832 U_CDECL_END 4833 4834 void RegexTest::Callbacks() { 4835 { 4836 // Getter returns NULLs if no callback has been set 4837 4838 // The variables that the getter will fill in. 4839 // Init to non-null values so that the action of the getter can be seen. 4840 const void *returnedContext = &returnedContext; 4841 URegexMatchCallback *returnedFn = &testCallBackFn; 4842 4843 UErrorCode status = U_ZERO_ERROR; 4844 RegexMatcher matcher("x", 0, status); 4845 REGEX_CHECK_STATUS; 4846 matcher.getMatchCallback(returnedFn, returnedContext, status); 4847 REGEX_CHECK_STATUS; 4848 REGEX_ASSERT(returnedFn == NULL); 4849 REGEX_ASSERT(returnedContext == NULL); 4850 } 4851 4852 { 4853 // Set and Get work 4854 callBackContext cbInfo = {this, 0, 0, 0}; 4855 const void *returnedContext; 4856 URegexMatchCallback *returnedFn; 4857 UErrorCode status = U_ZERO_ERROR; 4858 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 4859 REGEX_CHECK_STATUS; 4860 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4861 REGEX_CHECK_STATUS; 4862 matcher.getMatchCallback(returnedFn, returnedContext, status); 4863 REGEX_CHECK_STATUS; 4864 REGEX_ASSERT(returnedFn == testCallBackFn); 4865 REGEX_ASSERT(returnedContext == &cbInfo); 4866 4867 // A short-running match shouldn't invoke the callback 4868 status = U_ZERO_ERROR; 4869 cbInfo.reset(1); 4870 UnicodeString s = "xxx"; 4871 matcher.reset(s); 4872 REGEX_ASSERT(matcher.matches(status)); 4873 REGEX_CHECK_STATUS; 4874 REGEX_ASSERT(cbInfo.numCalls == 0); 4875 4876 // A medium-length match that runs long enough to invoke the 4877 // callback, but not so long that the callback aborts it. 4878 status = U_ZERO_ERROR; 4879 cbInfo.reset(4); 4880 s = "aaaaaaaaaaaaaaaaaaab"; 4881 matcher.reset(s); 4882 REGEX_ASSERT(matcher.matches(status)==FALSE); 4883 REGEX_CHECK_STATUS; 4884 REGEX_ASSERT(cbInfo.numCalls > 0); 4885 4886 // A longer running match that the callback function will abort. 4887 status = U_ZERO_ERROR; 4888 cbInfo.reset(4); 4889 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4890 matcher.reset(s); 4891 REGEX_ASSERT(matcher.matches(status)==FALSE); 4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4893 REGEX_ASSERT(cbInfo.numCalls == 4); 4894 4895 // A longer running find that the callback function will abort. 4896 status = U_ZERO_ERROR; 4897 cbInfo.reset(4); 4898 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4899 matcher.reset(s); 4900 REGEX_ASSERT(matcher.find(status)==FALSE); 4901 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4902 REGEX_ASSERT(cbInfo.numCalls == 4); 4903 } 4904 4905 4906 } 4907 4908 4909 // 4910 // FindProgressCallbacks() Test the find "progress" callback function. 4911 // When set, the find progress callback will be invoked during a find operations 4912 // after each return from a match attempt, giving the application the opportunity 4913 // to terminate a long-running find operation before it's normal completion. 4914 // 4915 4916 struct progressCallBackContext { 4917 RegexTest *test; 4918 int64_t lastIndex; 4919 int32_t maxCalls; 4920 int32_t numCalls; 4921 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; 4922 }; 4923 4924 // call-back function for find(). 4925 // Return TRUE to continue the find(). 4926 // Return FALSE to stop the find(). 4927 U_CDECL_BEGIN 4928 static UBool U_CALLCONV 4929 testProgressCallBackFn(const void *context, int64_t matchIndex) { 4930 progressCallBackContext *info = (progressCallBackContext *)context; 4931 info->numCalls++; 4932 info->lastIndex = matchIndex; 4933 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls); 4934 return (info->numCalls < info->maxCalls); 4935 } 4936 U_CDECL_END 4937 4938 void RegexTest::FindProgressCallbacks() { 4939 { 4940 // Getter returns NULLs if no callback has been set 4941 4942 // The variables that the getter will fill in. 4943 // Init to non-null values so that the action of the getter can be seen. 4944 const void *returnedContext = &returnedContext; 4945 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; 4946 4947 UErrorCode status = U_ZERO_ERROR; 4948 RegexMatcher matcher("x", 0, status); 4949 REGEX_CHECK_STATUS; 4950 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4951 REGEX_CHECK_STATUS; 4952 REGEX_ASSERT(returnedFn == NULL); 4953 REGEX_ASSERT(returnedContext == NULL); 4954 } 4955 4956 { 4957 // Set and Get work 4958 progressCallBackContext cbInfo = {this, 0, 0, 0}; 4959 const void *returnedContext; 4960 URegexFindProgressCallback *returnedFn; 4961 UErrorCode status = U_ZERO_ERROR; 4962 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); 4963 REGEX_CHECK_STATUS; 4964 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); 4965 REGEX_CHECK_STATUS; 4966 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4967 REGEX_CHECK_STATUS; 4968 REGEX_ASSERT(returnedFn == testProgressCallBackFn); 4969 REGEX_ASSERT(returnedContext == &cbInfo); 4970 4971 // A find that matches on the initial position does NOT invoke the callback. 4972 status = U_ZERO_ERROR; 4973 cbInfo.reset(100); 4974 UnicodeString s = "aaxxx"; 4975 matcher.reset(s); 4976 #if 0 4977 matcher.setTrace(TRUE); 4978 #endif 4979 REGEX_ASSERT(matcher.find(0, status)); 4980 REGEX_CHECK_STATUS; 4981 REGEX_ASSERT(cbInfo.numCalls == 0); 4982 4983 // A medium running find() that causes matcher.find() to invoke our callback for each index, 4984 // but not so many times that we interrupt the operation. 4985 status = U_ZERO_ERROR; 4986 s = "aaaaaaaaaaaaaaaaaaab"; 4987 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string 4988 matcher.reset(s); 4989 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4990 REGEX_CHECK_STATUS; 4991 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); 4992 4993 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. 4994 status = U_ZERO_ERROR; 4995 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; 4996 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string 4997 matcher.reset(s1); 4998 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 5000 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); 5001 5002 // Now a match that will succeed, but after an interruption 5003 status = U_ZERO_ERROR; 5004 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; 5005 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string 5006 matcher.reset(s2); 5007 REGEX_ASSERT(matcher.find(0, status)==FALSE); 5008 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 5009 // Now retry the match from where left off 5010 cbInfo.maxCalls = 100; // No callback limit 5011 status = U_ZERO_ERROR; 5012 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); 5013 REGEX_CHECK_STATUS; 5014 } 5015 5016 5017 } 5018 5019 5020 //--------------------------------------------------------------------------- 5021 // 5022 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 5023 // UTexts. The pure-C implementation of UText 5024 // has no mutable backing stores, but we can 5025 // use UnicodeString here to test the functionality. 5026 // 5027 //--------------------------------------------------------------------------- 5028 void RegexTest::PreAllocatedUTextCAPI () { 5029 UErrorCode status = U_ZERO_ERROR; 5030 URegularExpression *re; 5031 UText patternText = UTEXT_INITIALIZER; 5032 UnicodeString buffer; 5033 UText bufferText = UTEXT_INITIALIZER; 5034 5035 utext_openUnicodeString(&bufferText, &buffer, &status); 5036 5037 /* 5038 * getText() and getUText() 5039 */ 5040 { 5041 UText text1 = UTEXT_INITIALIZER; 5042 UText text2 = UTEXT_INITIALIZER; 5043 UChar text2Chars[20]; 5044 UText *resultText; 5045 5046 status = U_ZERO_ERROR; 5047 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 5048 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 5049 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 5050 utext_openUChars(&text2, text2Chars, -1, &status); 5051 5052 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 5053 re = uregex_openUText(&patternText, 0, NULL, &status); 5054 5055 /* First set a UText */ 5056 uregex_setUText(re, &text1, &status); 5057 resultText = uregex_getUText(re, &bufferText, &status); 5058 REGEX_CHECK_STATUS; 5059 REGEX_ASSERT(resultText == &bufferText); 5060 utext_setNativeIndex(resultText, 0); 5061 utext_setNativeIndex(&text1, 0); 5062 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5063 5064 resultText = uregex_getUText(re, &bufferText, &status); 5065 REGEX_CHECK_STATUS; 5066 REGEX_ASSERT(resultText == &bufferText); 5067 utext_setNativeIndex(resultText, 0); 5068 utext_setNativeIndex(&text1, 0); 5069 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5070 5071 /* Then set a UChar * */ 5072 uregex_setText(re, text2Chars, 7, &status); 5073 resultText = uregex_getUText(re, &bufferText, &status); 5074 REGEX_CHECK_STATUS; 5075 REGEX_ASSERT(resultText == &bufferText); 5076 utext_setNativeIndex(resultText, 0); 5077 utext_setNativeIndex(&text2, 0); 5078 REGEX_ASSERT(testUTextEqual(resultText, &text2)); 5079 5080 uregex_close(re); 5081 utext_close(&text1); 5082 utext_close(&text2); 5083 } 5084 5085 /* 5086 * group() 5087 */ 5088 { 5089 UChar text1[80]; 5090 UText *actual; 5091 UBool result; 5092 int64_t length = 0; 5093 5094 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); 5095 // 012345678901234567890123456789012345678901234567 5096 // 0 1 2 3 4 5097 5098 status = U_ZERO_ERROR; 5099 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 5100 REGEX_CHECK_STATUS; 5101 5102 uregex_setText(re, text1, -1, &status); 5103 result = uregex_find(re, 0, &status); 5104 REGEX_ASSERT(result==TRUE); 5105 5106 /* Capture Group 0, the full match. Should succeed. "abc interior def" */ 5107 status = U_ZERO_ERROR; 5108 actual = uregex_groupUText(re, 0, &bufferText, &length, &status); 5109 REGEX_CHECK_STATUS; 5110 REGEX_ASSERT(actual == &bufferText); 5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 6); 5112 REGEX_ASSERT(length == 16); 5113 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5114 5115 /* Capture group #1. Should succeed, matching " interior ". */ 5116 status = U_ZERO_ERROR; 5117 actual = uregex_groupUText(re, 1, &bufferText, &length, &status); 5118 REGEX_CHECK_STATUS; 5119 REGEX_ASSERT(actual == &bufferText); 5120 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " 5121 REGEX_ASSERT(length == 10); 5122 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5123 5124 /* Capture group out of range. Error. */ 5125 status = U_ZERO_ERROR; 5126 actual = uregex_groupUText(re, 2, &bufferText, &length, &status); 5127 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5128 REGEX_ASSERT(actual == &bufferText); 5129 uregex_close(re); 5130 5131 } 5132 5133 /* 5134 * replaceFirst() 5135 */ 5136 { 5137 UChar text1[80]; 5138 UChar text2[80]; 5139 UText replText = UTEXT_INITIALIZER; 5140 UText *result; 5141 status = U_ZERO_ERROR; 5142 utext_openUnicodeString(&bufferText, &buffer, &status); 5143 5144 status = U_ZERO_ERROR; 5145 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); 5146 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); 5147 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5148 5149 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5150 REGEX_CHECK_STATUS; 5151 5152 /* Normal case, with match */ 5153 uregex_setText(re, text1, -1, &status); 5154 REGEX_CHECK_STATUS; 5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5156 REGEX_CHECK_STATUS; 5157 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5158 REGEX_CHECK_STATUS; 5159 REGEX_ASSERT(result == &bufferText); 5160 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5161 5162 /* No match. Text should copy to output with no changes. */ 5163 uregex_setText(re, text2, -1, &status); 5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5166 REGEX_CHECK_STATUS; 5167 REGEX_ASSERT(result == &bufferText); 5168 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5169 5170 /* Unicode escapes */ 5171 uregex_setText(re, text1, -1, &status); 5172 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); 5173 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5174 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5175 REGEX_CHECK_STATUS; 5176 REGEX_ASSERT(result == &bufferText); 5177 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5178 5179 uregex_close(re); 5180 utext_close(&replText); 5181 } 5182 5183 5184 /* 5185 * replaceAll() 5186 */ 5187 { 5188 UChar text1[80]; 5189 UChar text2[80]; 5190 UText replText = UTEXT_INITIALIZER; 5191 UText *result; 5192 5193 status = U_ZERO_ERROR; 5194 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5195 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5196 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5197 5198 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5199 REGEX_CHECK_STATUS; 5200 5201 /* Normal case, with match */ 5202 uregex_setText(re, text1, -1, &status); 5203 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5204 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5205 REGEX_CHECK_STATUS; 5206 REGEX_ASSERT(result == &bufferText); 5207 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5208 5209 /* No match. Text should copy to output with no changes. */ 5210 uregex_setText(re, text2, -1, &status); 5211 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5212 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5213 REGEX_CHECK_STATUS; 5214 REGEX_ASSERT(result == &bufferText); 5215 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5216 5217 uregex_close(re); 5218 utext_close(&replText); 5219 } 5220 5221 5222 /* 5223 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5224 * so we don't need to test it here. 5225 */ 5226 5227 utext_close(&bufferText); 5228 utext_close(&patternText); 5229 } 5230 5231 5232 //-------------------------------------------------------------- 5233 // 5234 // NamedCapture Check basic named capture group functionality 5235 // 5236 //-------------------------------------------------------------- 5237 void RegexTest::NamedCapture() { 5238 UErrorCode status = U_ZERO_ERROR; 5239 RegexPattern *pat = RegexPattern::compile(UnicodeString( 5240 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status); 5241 REGEX_CHECK_STATUS; 5242 int32_t group = pat->groupNumberFromName("five", -1, status); 5243 REGEX_CHECK_STATUS; 5244 REGEX_ASSERT(5 == group); 5245 group = pat->groupNumberFromName("three", -1, status); 5246 REGEX_CHECK_STATUS; 5247 REGEX_ASSERT(3 == group); 5248 5249 status = U_ZERO_ERROR; 5250 group = pat->groupNumberFromName(UnicodeString("six"), status); 5251 REGEX_CHECK_STATUS; 5252 REGEX_ASSERT(6 == group); 5253 5254 status = U_ZERO_ERROR; 5255 group = pat->groupNumberFromName(UnicodeString("nosuch"), status); 5256 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5257 5258 status = U_ZERO_ERROR; 5259 5260 // After copying a pattern, named capture should still work in the copy. 5261 RegexPattern *copiedPat = new RegexPattern(*pat); 5262 REGEX_ASSERT(*copiedPat == *pat); 5263 delete pat; pat = NULL; // Delete original, copy should have no references back to it. 5264 5265 group = copiedPat->groupNumberFromName("five", -1, status); 5266 REGEX_CHECK_STATUS; 5267 REGEX_ASSERT(5 == group); 5268 group = copiedPat->groupNumberFromName("three", -1, status); 5269 REGEX_CHECK_STATUS; 5270 REGEX_ASSERT(3 == group); 5271 delete copiedPat; 5272 5273 // ReplaceAll with named capture group. 5274 status = U_ZERO_ERROR; 5275 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>"); 5276 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status); 5277 REGEX_CHECK_STATUS; 5278 // m.pattern().dumpPattern(); 5279 UnicodeString replacedText = m->replaceAll("'${mid}'", status); 5280 REGEX_CHECK_STATUS; 5281 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); 5282 delete m; 5283 5284 // ReplaceAll, allowed capture group numbers. 5285 text = UnicodeString("abcmxyz"); 5286 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status); 5287 REGEX_CHECK_STATUS; 5288 5289 status = U_ZERO_ERROR; 5290 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. 5291 REGEX_CHECK_STATUS; 5292 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText); 5293 5294 status = U_ZERO_ERROR; 5295 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. 5296 REGEX_CHECK_STATUS; 5297 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5298 5299 status = U_ZERO_ERROR; 5300 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. 5301 REGEX_CHECK_STATUS; 5302 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5303 5304 status = U_ZERO_ERROR; 5305 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. 5306 REGEX_CHECK_STATUS; 5307 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText); 5308 5309 status = U_ZERO_ERROR; 5310 replacedText = m->replaceAll(UnicodeString("<$3>"), status); 5311 REGEX_CHECK_STATUS; 5312 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText); 5313 5314 status = U_ZERO_ERROR; 5315 replacedText = m->replaceAll(UnicodeString("<$4>"), status); 5316 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5317 5318 status = U_ZERO_ERROR; 5319 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, 5320 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. 5321 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText); 5322 5323 status = U_ZERO_ERROR; 5324 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits 5325 REGEX_CHECK_STATUS; // that push group num out of range. 5326 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1. 5327 5328 status = U_ZERO_ERROR; 5329 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); 5330 REGEX_CHECK_STATUS; 5331 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText); 5332 5333 status = U_ZERO_ERROR; 5334 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); 5335 REGEX_CHECK_STATUS; 5336 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); 5337 5338 status = U_ZERO_ERROR; 5339 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); 5340 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5341 5342 status = U_ZERO_ERROR; 5343 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); 5344 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5345 5346 status = U_ZERO_ERROR; 5347 replacedText = m->replaceAll(UnicodeString("<${one"), status); 5348 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5349 5350 status = U_ZERO_ERROR; 5351 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); 5352 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5353 5354 delete m; 5355 5356 // Repeat the above replaceAll() tests using the plain C API, which 5357 // has a separate implementation internally. 5358 // TODO: factor out the test data. 5359 5360 status = U_ZERO_ERROR; 5361 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status); 5362 REGEX_CHECK_STATUS; 5363 text = UnicodeString("abcmxyz"); 5364 uregex_setText(re, text.getBuffer(), text.length(), &status); 5365 REGEX_CHECK_STATUS; 5366 5367 UChar resultBuf[100]; 5368 int32_t resultLength; 5369 UnicodeString repl; 5370 5371 status = U_ZERO_ERROR; 5372 repl = UnicodeString("<$0>"); 5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5374 REGEX_CHECK_STATUS; 5375 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength)); 5376 5377 status = U_ZERO_ERROR; 5378 repl = UnicodeString("<$1>"); 5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5380 REGEX_CHECK_STATUS; 5381 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5382 5383 status = U_ZERO_ERROR; 5384 repl = UnicodeString("<${one}>"); 5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5386 REGEX_CHECK_STATUS; 5387 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5388 5389 status = U_ZERO_ERROR; 5390 repl = UnicodeString("<$2>"); 5391 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5392 REGEX_CHECK_STATUS; 5393 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength)); 5394 5395 status = U_ZERO_ERROR; 5396 repl = UnicodeString("<$3>"); 5397 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5398 REGEX_CHECK_STATUS; 5399 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength)); 5400 5401 status = U_ZERO_ERROR; 5402 repl = UnicodeString("<$4>"); 5403 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5404 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5405 5406 status = U_ZERO_ERROR; 5407 repl = UnicodeString("<$04>"); 5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5409 REGEX_CHECK_STATUS; 5410 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength)); 5411 5412 status = U_ZERO_ERROR; 5413 repl = UnicodeString("<$000016>"); 5414 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5415 REGEX_CHECK_STATUS; 5416 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength)); 5417 5418 status = U_ZERO_ERROR; 5419 repl = UnicodeString("<$3$2$1${one}>"); 5420 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5421 REGEX_CHECK_STATUS; 5422 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength)); 5423 5424 status = U_ZERO_ERROR; 5425 repl = UnicodeString("$3$2$1${one}"); 5426 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5427 REGEX_CHECK_STATUS; 5428 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); 5429 5430 status = U_ZERO_ERROR; 5431 repl = UnicodeString("<${noSuchName}>"); 5432 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5433 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5434 5435 status = U_ZERO_ERROR; 5436 repl = UnicodeString("<${invalid-name}>"); 5437 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5438 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5439 5440 status = U_ZERO_ERROR; 5441 repl = UnicodeString("<${one"); 5442 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5443 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5444 5445 status = U_ZERO_ERROR; 5446 repl = UnicodeString("$not a capture group"); 5447 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5448 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5449 5450 uregex_close(re); 5451 } 5452 5453 //-------------------------------------------------------------- 5454 // 5455 // NamedCaptureLimits Patterns with huge numbers of named capture groups. 5456 // The point is not so much what the exact limit is, 5457 // but that a largish number doesn't hit bad non-linear performance, 5458 // and that exceeding the limit fails cleanly. 5459 // 5460 //-------------------------------------------------------------- 5461 void RegexTest::NamedCaptureLimits() { 5462 if (quick) { 5463 logln("Skipping test. Runs in exhuastive mode only."); 5464 return; 5465 } 5466 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. 5467 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. 5468 char nnbuf[100]; 5469 UnicodeString pattern; 5470 int32_t nn; 5471 5472 for (nn=1; nn<goodLimit; nn++) { 5473 sprintf(nnbuf, "(?<nn%d>)", nn); 5474 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5475 } 5476 UErrorCode status = U_ZERO_ERROR; 5477 RegexPattern *pat = RegexPattern::compile(pattern, 0, status); 5478 REGEX_CHECK_STATUS; 5479 for (nn=1; nn<goodLimit; nn++) { 5480 sprintf(nnbuf, "nn%d", nn); 5481 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status); 5482 REGEX_ASSERT(nn == groupNum); 5483 if (nn != groupNum) { 5484 break; 5485 } 5486 } 5487 delete pat; 5488 5489 pattern.remove(); 5490 for (nn=1; nn<failLimit; nn++) { 5491 sprintf(nnbuf, "(?<nn%d>)", nn); 5492 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5493 } 5494 status = U_ZERO_ERROR; 5495 pat = RegexPattern::compile(pattern, 0, status); 5496 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); 5497 delete pat; 5498 } 5499 5500 5501 //-------------------------------------------------------------- 5502 // 5503 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5504 // 5505 //--------------------------------------------------------------- 5506 void RegexTest::Bug7651() { 5507 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5508 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5509 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 5510 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5511 UnicodeString s("#ff @abcd This is test"); 5512 RegexPattern *REPattern = NULL; 5513 RegexMatcher *REMatcher = NULL; 5514 UErrorCode status = U_ZERO_ERROR; 5515 UParseError pe; 5516 5517 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5518 REGEX_CHECK_STATUS; 5519 REMatcher = REPattern->matcher(s, status); 5520 REGEX_CHECK_STATUS; 5521 REGEX_ASSERT(REMatcher->find()); 5522 REGEX_ASSERT(REMatcher->start(status) == 0); 5523 delete REPattern; 5524 delete REMatcher; 5525 status = U_ZERO_ERROR; 5526 5527 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5528 REGEX_CHECK_STATUS; 5529 REMatcher = REPattern->matcher(s, status); 5530 REGEX_CHECK_STATUS; 5531 REGEX_ASSERT(REMatcher->find()); 5532 REGEX_ASSERT(REMatcher->start(status) == 0); 5533 delete REPattern; 5534 delete REMatcher; 5535 status = U_ZERO_ERROR; 5536 } 5537 5538 void RegexTest::Bug7740() { 5539 UErrorCode status = U_ZERO_ERROR; 5540 UnicodeString pattern = "(a)"; 5541 UnicodeString text = "abcdef"; 5542 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5543 REGEX_CHECK_STATUS; 5544 REGEX_ASSERT(m->lookingAt(status)); 5545 REGEX_CHECK_STATUS; 5546 status = U_ILLEGAL_ARGUMENT_ERROR; 5547 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5548 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5549 REGEX_ASSERT(s == ""); 5550 delete m; 5551 } 5552 5553 // Bug 8479: was crashing whith a Bogus UnicodeString as input. 5554 5555 void RegexTest::Bug8479() { 5556 UErrorCode status = U_ZERO_ERROR; 5557 5558 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5559 REGEX_CHECK_STATUS; 5560 if (U_SUCCESS(status)) 5561 { 5562 UnicodeString str; 5563 str.setToBogus(); 5564 pMatcher->reset(str); 5565 status = U_ZERO_ERROR; 5566 pMatcher->matches(status); 5567 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5568 delete pMatcher; 5569 } 5570 } 5571 5572 5573 // Bug 7029 5574 void RegexTest::Bug7029() { 5575 UErrorCode status = U_ZERO_ERROR; 5576 5577 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5578 UnicodeString text = "abc.def"; 5579 UnicodeString splits[10]; 5580 REGEX_CHECK_STATUS; 5581 int32_t numFields = pMatcher->split(text, splits, 10, status); 5582 REGEX_CHECK_STATUS; 5583 REGEX_ASSERT(numFields == 8); 5584 delete pMatcher; 5585 } 5586 5587 // Bug 9283 5588 // This test is checking for the existance of any supplemental characters that case-fold 5589 // to a bmp character. 5590 // 5591 // At the time of this writing there are none. If any should appear in a subsequent release 5592 // of Unicode, the code in regular expressions compilation that determines the longest 5593 // posssible match for a literal string will need to be enhanced. 5594 // 5595 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() 5596 // for details on what to do in case of a failure of this test. 5597 // 5598 void RegexTest::Bug9283() { 5599 #if !UCONFIG_NO_NORMALIZATION 5600 UErrorCode status = U_ZERO_ERROR; 5601 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); 5602 REGEX_CHECK_STATUS; 5603 int32_t index; 5604 UChar32 c; 5605 for (index=0; ; index++) { 5606 c = supplementalsWithCaseFolding.charAt(index); 5607 if (c == -1) { 5608 break; 5609 } 5610 UnicodeString cf = UnicodeString(c).foldCase(); 5611 REGEX_ASSERT(cf.length() >= 2); 5612 } 5613 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 5614 } 5615 5616 5617 void RegexTest::CheckInvBufSize() { 5618 if(inv_next>=INV_BUFSIZ) { 5619 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5620 __FILE__, INV_BUFSIZ, inv_next); 5621 } else { 5622 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5623 } 5624 } 5625 5626 5627 void RegexTest::Bug10459() { 5628 UErrorCode status = U_ZERO_ERROR; 5629 UnicodeString patternString("(txt)"); 5630 UnicodeString txtString("txt"); 5631 5632 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); 5633 REGEX_CHECK_STATUS; 5634 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); 5635 REGEX_CHECK_STATUS; 5636 5637 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); 5638 REGEX_CHECK_STATUS; 5639 5640 uregex_setUText(icu_re, utext_txt, &status); 5641 REGEX_CHECK_STATUS; 5642 5643 // The bug was that calling uregex_group() before doing a matching operation 5644 // was causing a segfault. Only for Regular Expressions created from UText. 5645 // It should set an U_REGEX_INVALID_STATE. 5646 5647 UChar buf[100]; 5648 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); 5649 REGEX_ASSERT(status == U_REGEX_INVALID_STATE); 5650 REGEX_ASSERT(len == 0); 5651 5652 uregex_close(icu_re); 5653 utext_close(utext_pat); 5654 utext_close(utext_txt); 5655 } 5656 5657 void RegexTest::TestCaseInsensitiveStarters() { 5658 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't 5659 // become stale because of new Unicode characters. 5660 // If it is stale, rerun the generation tool 5661 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 5662 // and replace the embedded data in i18n/regexcmp.cpp 5663 5664 for (UChar32 cp=0; cp<=0x10ffff; cp++) { 5665 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { 5666 continue; 5667 } 5668 UnicodeSet s(cp, cp); 5669 s.closeOver(USET_CASE_INSENSITIVE); 5670 UnicodeSetIterator setIter(s); 5671 while (setIter.next()) { 5672 if (!setIter.isString()) { 5673 continue; 5674 } 5675 const UnicodeString &str = setIter.getString(); 5676 UChar32 firstChar = str.char32At(0); 5677 UnicodeSet starters; 5678 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); 5679 if (!starters.contains(cp)) { 5680 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); 5681 return; 5682 } 5683 } 5684 } 5685 } 5686 5687 5688 void RegexTest::TestBug11049() { 5689 // Original bug report: pattern with match start consisting of one of several individual characters, 5690 // and the text being matched ending with a supplementary character. find() would read past the 5691 // end of the input text when searching for potential match starting points. 5692 5693 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will 5694 // detect the bad read. 5695 5696 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); 5697 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); 5698 5699 // Test again with a pattern starting with a single character, 5700 // which takes a different code path than starting with an OR expression, 5701 // but with similar logic. 5702 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); 5703 TestCase11049("C", "string matches at end C", TRUE, __LINE__); 5704 } 5705 5706 // Run a single test case from TestBug11049(). Internal function. 5707 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { 5708 UErrorCode status = U_ZERO_ERROR; 5709 UnicodeString patternString = UnicodeString(pattern).unescape(); 5710 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5711 5712 UnicodeString dataString = UnicodeString(data).unescape(); 5713 UChar *exactBuffer = new UChar[dataString.length()]; 5714 dataString.extract(exactBuffer, dataString.length(), status); 5715 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); 5716 5717 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status)); 5718 REGEX_CHECK_STATUS; 5719 matcher->reset(ut); 5720 UBool result = matcher->find(); 5721 if (result != expectMatch) { 5722 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5723 __FILE__, lineNumber, expectMatch, result, pattern, data); 5724 } 5725 5726 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see 5727 // off-by-one on find() with match at the last code point. 5728 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 5729 // because string.unescape() will only shrink it. 5730 char * utf8Buffer = new char[uprv_strlen(data)+1]; 5731 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); 5732 REGEX_CHECK_STATUS; 5733 ut = utext_openUTF8(ut, utf8Buffer, -1, &status); 5734 REGEX_CHECK_STATUS; 5735 matcher->reset(ut); 5736 result = matcher->find(); 5737 if (result != expectMatch) { 5738 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5739 __FILE__, lineNumber, expectMatch, result, pattern, data); 5740 } 5741 delete [] utf8Buffer; 5742 5743 utext_close(ut); 5744 delete [] exactBuffer; 5745 } 5746 5747 5748 void RegexTest::TestBug11371() { 5749 if (quick) { 5750 logln("Skipping test. Runs in exhuastive mode only."); 5751 return; 5752 } 5753 UErrorCode status = U_ZERO_ERROR; 5754 UnicodeString patternString; 5755 5756 for (int i=0; i<8000000; i++) { 5757 patternString.append(UnicodeString("()")); 5758 } 5759 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5760 if (status != U_REGEX_PATTERN_TOO_BIG) { 5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5762 __FILE__, __LINE__, u_errorName(status)); 5763 } 5764 5765 status = U_ZERO_ERROR; 5766 patternString = "("; 5767 for (int i=0; i<20000000; i++) { 5768 patternString.append(UnicodeString("A++")); 5769 } 5770 patternString.append(UnicodeString("){0}B++")); 5771 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status)); 5772 if (status != U_REGEX_PATTERN_TOO_BIG) { 5773 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5774 __FILE__, __LINE__, u_errorName(status)); 5775 } 5776 5777 // Pattern with too much string data, such that string indexes overflow operand data field size 5778 // in compiled instruction. 5779 status = U_ZERO_ERROR; 5780 patternString = ""; 5781 while (patternString.length() < 0x00ffffff) { 5782 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); 5783 } 5784 patternString.append(UnicodeString("X? trailing string")); 5785 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status)); 5786 if (status != U_REGEX_PATTERN_TOO_BIG) { 5787 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5788 __FILE__, __LINE__, u_errorName(status)); 5789 } 5790 } 5791 5792 void RegexTest::TestBug11480() { 5793 // C API, get capture group of a group that does not participate in the match. 5794 // (Returns a zero length string, with nul termination, 5795 // indistinguishable from a group with a zero length match.) 5796 5797 UErrorCode status = U_ZERO_ERROR; 5798 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); 5799 REGEX_CHECK_STATUS; 5800 UnicodeString text = UNICODE_STRING_SIMPLE("A"); 5801 uregex_setText(re, text.getBuffer(), text.length(), &status); 5802 REGEX_CHECK_STATUS; 5803 REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); 5804 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; 5805 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); 5806 REGEX_ASSERT(length == 0); 5807 REGEX_ASSERT(buf[0] == 13); 5808 REGEX_ASSERT(buf[1] == 0); 5809 REGEX_ASSERT(buf[2] == 13); 5810 uregex_close(re); 5811 5812 // UText C++ API, length of match is 0 for non-participating matches. 5813 UText ut = UTEXT_INITIALIZER; 5814 utext_openUnicodeString(&ut, &text, &status); 5815 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); 5816 REGEX_CHECK_STATUS; 5817 matcher.reset(&ut); 5818 REGEX_ASSERT(matcher.lookingAt(0, status)); 5819 5820 // UText C++ API, Capture group 1 matches "A", position 0, length 1. 5821 int64_t groupLen = -666; 5822 UText group = UTEXT_INITIALIZER; 5823 matcher.group(1, &group, groupLen, status); 5824 REGEX_CHECK_STATUS; 5825 REGEX_ASSERT(groupLen == 1); 5826 REGEX_ASSERT(utext_getNativeIndex(&group) == 0); 5827 5828 // Capture group 2, the (B), does not participate in the match. 5829 matcher.group(2, &group, groupLen, status); 5830 REGEX_CHECK_STATUS; 5831 REGEX_ASSERT(groupLen == 0); 5832 REGEX_ASSERT(matcher.start(2, status) == -1); 5833 REGEX_CHECK_STATUS; 5834 } 5835 5836 5837 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5838