1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 // 8 // regextst.cpp 9 // 10 // ICU Regular Expressions test, part of intltest. 11 // 12 13 /* 14 NOTE!! 15 16 PLEASE be careful about ASCII assumptions in this test. 17 This test is one of the worst repeat offenders. 18 If you have questions, contact someone on the ICU PMC 19 who has access to an EBCDIC system. 20 21 */ 22 23 #include "intltest.h" 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 25 26 #include "unicode/regex.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ucnv.h" 29 #include "unicode/uniset.h" 30 #include "unicode/uregex.h" 31 #include "unicode/ustring.h" 32 #include "regextst.h" 33 #include "uvector.h" 34 #include "util.h" 35 #include <stdlib.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include "cstring.h" 39 #include "uinvchar.h" 40 41 #define SUPPORT_MUTATING_INPUT_STRING 0 42 43 //--------------------------------------------------------------------------- 44 // 45 // Test class boilerplate 46 // 47 //--------------------------------------------------------------------------- 48 RegexTest::RegexTest() 49 { 50 } 51 52 53 RegexTest::~RegexTest() 54 { 55 } 56 57 58 59 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 60 { 61 if (exec) logln("TestSuite RegexTest: "); 62 switch (index) { 63 64 case 0: name = "Basic"; 65 if (exec) Basic(); 66 break; 67 case 1: name = "API_Match"; 68 if (exec) API_Match(); 69 break; 70 case 2: name = "API_Replace"; 71 if (exec) API_Replace(); 72 break; 73 case 3: name = "API_Pattern"; 74 if (exec) API_Pattern(); 75 break; 76 case 4: 77 #if !UCONFIG_NO_FILE_IO 78 name = "Extended"; 79 if (exec) Extended(); 80 #else 81 name = "skip"; 82 #endif 83 break; 84 case 5: name = "Errors"; 85 if (exec) Errors(); 86 
break; 87 case 6: name = "PerlTests"; 88 if (exec) PerlTests(); 89 break; 90 case 7: name = "Callbacks"; 91 if (exec) Callbacks(); 92 break; 93 case 8: name = "FindProgressCallbacks"; 94 if (exec) FindProgressCallbacks(); 95 break; 96 case 9: name = "Bug 6149"; 97 if (exec) Bug6149(); 98 break; 99 case 10: name = "UTextBasic"; 100 if (exec) UTextBasic(); 101 break; 102 case 11: name = "API_Match_UTF8"; 103 if (exec) API_Match_UTF8(); 104 break; 105 case 12: name = "API_Replace_UTF8"; 106 if (exec) API_Replace_UTF8(); 107 break; 108 case 13: name = "API_Pattern_UTF8"; 109 if (exec) API_Pattern_UTF8(); 110 break; 111 case 14: name = "PerlTestsUTF8"; 112 if (exec) PerlTestsUTF8(); 113 break; 114 case 15: name = "PreAllocatedUTextCAPI"; 115 if (exec) PreAllocatedUTextCAPI(); 116 break; 117 case 16: name = "Bug 7651"; 118 if (exec) Bug7651(); 119 break; 120 case 17: name = "Bug 7740"; 121 if (exec) Bug7740(); 122 break; 123 case 18: name = "Bug 8479"; 124 if (exec) Bug8479(); 125 break; 126 case 19: name = "Bug 7029"; 127 if (exec) Bug7029(); 128 break; 129 case 20: name = "CheckInvBufSize"; 130 if (exec) CheckInvBufSize(); 131 break; 132 case 21: name = "Bug 9283"; 133 if (exec) Bug9283(); 134 break; 135 case 22: name = "Bug10459"; 136 if (exec) Bug10459(); 137 break; 138 139 default: name = ""; 140 break; //needed to end loop 141 } 142 } 143 144 145 146 /** 147 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 148 * into ASCII. 149 * @see utext_openUTF8 150 */ 151 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 152 153 //--------------------------------------------------------------------------- 154 // 155 // Error Checking / Reporting macros used in all of the tests. 
156 // 157 //--------------------------------------------------------------------------- 158 159 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 160 int64_t oldIndex = utext_getNativeIndex(text); 161 utext_setNativeIndex(text, 0); 162 char *bufPtr = buf; 163 UChar32 c = utext_next32From(text, 0); 164 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 165 if (0x000020<=c && c<0x00007e) { 166 *bufPtr = c; 167 } else { 168 #if 0 169 sprintf(bufPtr,"U+%04X", c); 170 bufPtr+= strlen(bufPtr)-1; 171 #else 172 *bufPtr = '%'; 173 #endif 174 } 175 bufPtr++; 176 c = UTEXT_NEXT32(text); 177 } 178 *bufPtr = 0; 179 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 180 char *ebuf = (char*)malloc(bufLen); 181 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 182 uprv_strncpy(buf, ebuf, bufLen); 183 free((void*)ebuf); 184 #endif 185 utext_setNativeIndex(text, oldIndex); 186 } 187 188 189 static char ASSERT_BUF[1024]; 190 191 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { 192 if(message.length()==0) { 193 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 194 } else { 195 UnicodeString buf; 196 IntlTest::prettify(message,buf); 197 if(buf.length()==0) { 198 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 199 } else { 200 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 201 if(ASSERT_BUF[0]==0) { 202 ASSERT_BUF[0]=0; 203 for(int32_t i=0;i<buf.length();i++) { 204 UChar ch = buf[i]; 205 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 206 } 207 } 208 } 209 } 210 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 211 return ASSERT_BUF; 212 } 213 214 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 215 216 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 217 218 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. 
status=%s", \ 219 __FILE__, __LINE__, u_errorName(status)); return;}} 220 221 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 222 223 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 224 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 225 __LINE__, u_errorName(errcode), u_errorName(status));};} 226 227 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 228 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} 229 230 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 231 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 232 233 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};} 234 235 236 static UBool testUTextEqual(UText *uta, UText *utb) { 237 UChar32 ca = 0; 238 UChar32 cb = 0; 239 utext_setNativeIndex(uta, 0); 240 utext_setNativeIndex(utb, 0); 241 do { 242 ca = utext_next32(uta); 243 cb = utext_next32(utb); 244 if (ca != cb) { 245 break; 246 } 247 } while (ca != U_SENTINEL); 248 return ca == cb; 249 } 250 251 252 /** 253 * @param expected expected text in UTF-8 (not platform) codepage 254 */ 255 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 256 UErrorCode status = U_ZERO_ERROR; 257 UText expectedText = UTEXT_INITIALIZER; 258 utext_openUTF8(&expectedText, expected, -1, &status); 259 if(U_FAILURE(status)) { 260 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 261 return; 262 } 263 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 264 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) 
returned 0.", file, line, strlen(expected)); 265 return; 266 } 267 utext_setNativeIndex(actual, 0); 268 if (!testUTextEqual(&expectedText, actual)) { 269 char buf[201 /*21*/]; 270 char expectedBuf[201]; 271 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 272 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 273 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 274 } 275 utext_close(&expectedText); 276 } 277 /** 278 * @param expected invariant (platform local text) input 279 */ 280 281 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 282 UErrorCode status = U_ZERO_ERROR; 283 UText expectedText = UTEXT_INITIALIZER; 284 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 285 if(U_FAILURE(status)) { 286 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 287 return; 288 } 289 utext_setNativeIndex(actual, 0); 290 if (!testUTextEqual(&expectedText, actual)) { 291 char buf[201 /*21*/]; 292 char expectedBuf[201]; 293 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 294 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 295 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 296 } 297 utext_close(&expectedText); 298 } 299 300 /** 301 * Assumes utf-8 input 302 */ 303 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 304 /** 305 * Assumes Invariant input 306 */ 307 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), 
__FILE__, __LINE__) 308 309 /** 310 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 311 * passed into utext_openUTF8. An error will be given if 312 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 313 */ 314 315 #define INV_BUFSIZ 2048 /* increase this if too small */ 316 317 static int64_t inv_next=0; 318 319 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY 320 static char inv_buf[INV_BUFSIZ]; 321 #endif 322 323 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 324 if(length==-1) length=strlen(inv); 325 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 326 inv_next+=length; 327 return utext_openUTF8(ut, inv, length, status); 328 #else 329 if(inv_next+length+1>INV_BUFSIZ) { 330 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 331 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 332 *status = U_MEMORY_ALLOCATION_ERROR; 333 return NULL; 334 } 335 336 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 337 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 338 inv_next+=length; 339 340 #if 0 341 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 342 #endif 343 344 return utext_openUTF8(ut, (const char*)buf, length, status); 345 #endif 346 } 347 348 349 //--------------------------------------------------------------------------- 350 // 351 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 352 // for the LookingAt() and Match() functions. 353 // 354 // usage: 355 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 356 // 357 // The expected results are UBool - TRUE or FALSE. 358 // The input text is unescaped. The pattern is not. 
359 // 360 // 361 //--------------------------------------------------------------------------- 362 363 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 364 365 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 366 const UnicodeString pattern(pat, -1, US_INV); 367 const UnicodeString inputText(text, -1, US_INV); 368 UErrorCode status = U_ZERO_ERROR; 369 UParseError pe; 370 RegexPattern *REPattern = NULL; 371 RegexMatcher *REMatcher = NULL; 372 UBool retVal = TRUE; 373 374 UnicodeString patString(pat, -1, US_INV); 375 REPattern = RegexPattern::compile(patString, 0, pe, status); 376 if (U_FAILURE(status)) { 377 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 378 line, u_errorName(status)); 379 return FALSE; 380 } 381 if (line==376) { REPattern->dumpPattern();} 382 383 UnicodeString inputString(inputText); 384 UnicodeString unEscapedInput = inputString.unescape(); 385 REMatcher = REPattern->matcher(unEscapedInput, status); 386 if (U_FAILURE(status)) { 387 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 388 line, u_errorName(status)); 389 return FALSE; 390 } 391 392 UBool actualmatch; 393 actualmatch = REMatcher->lookingAt(status); 394 if (U_FAILURE(status)) { 395 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 396 line, u_errorName(status)); 397 retVal = FALSE; 398 } 399 if (actualmatch != looking) { 400 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 401 retVal = FALSE; 402 } 403 404 status = U_ZERO_ERROR; 405 actualmatch = REMatcher->matches(status); 406 if (U_FAILURE(status)) { 407 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 408 line, u_errorName(status)); 409 retVal = FALSE; 410 } 411 if (actualmatch != match) { 412 errln("RegexTest: wrong return from matches() at line %d.\n", line); 413 retVal = FALSE; 414 } 415 416 if (retVal == FALSE) { 417 REPattern->dumpPattern(); 418 } 419 420 delete REPattern; 421 delete REMatcher; 422 return retVal; 423 } 424 425 426 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 427 UText pattern = UTEXT_INITIALIZER; 428 int32_t inputUTF8Length; 429 char *textChars = NULL; 430 UText inputText = UTEXT_INITIALIZER; 431 UErrorCode status = U_ZERO_ERROR; 432 UParseError pe; 433 RegexPattern *REPattern = NULL; 434 RegexMatcher *REMatcher = NULL; 435 UBool retVal = TRUE; 436 437 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 438 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 439 if (U_FAILURE(status)) { 440 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 441 line, u_errorName(status)); 442 return FALSE; 443 } 444 445 UnicodeString inputString(text, -1, US_INV); 446 UnicodeString unEscapedInput = inputString.unescape(); 447 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 448 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 449 450 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 451 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 452 // UTF-8 does not allow unpaired surrogates, so this could actually happen 453 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 454 return TRUE; // not a failure of the Regex engine 455 } 456 status = U_ZERO_ERROR; // buffer overflow 457 textChars = new char[inputUTF8Length+1]; 458 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 459 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 460 461 REMatcher = &REPattern->matcher(status)->reset(&inputText); 462 if (U_FAILURE(status)) { 463 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 464 line, u_errorName(status)); 465 return FALSE; 466 } 467 468 UBool actualmatch; 469 actualmatch = REMatcher->lookingAt(status); 470 if (U_FAILURE(status)) { 471 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 472 line, u_errorName(status)); 473 retVal = FALSE; 474 } 475 if (actualmatch != looking) { 476 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 477 retVal = FALSE; 478 } 479 480 status = U_ZERO_ERROR; 481 actualmatch = REMatcher->matches(status); 482 if (U_FAILURE(status)) { 483 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 484 line, u_errorName(status)); 485 retVal = FALSE; 486 } 487 if (actualmatch != match) { 488 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 489 retVal = FALSE; 490 } 491 492 if (retVal == FALSE) { 493 REPattern->dumpPattern(); 494 } 495 496 delete REPattern; 497 delete REMatcher; 498 utext_close(&inputText); 499 utext_close(&pattern); 500 delete[] textChars; 501 return retVal; 502 } 503 504 505 506 //--------------------------------------------------------------------------- 507 // 508 // REGEX_ERR Macro + invocation function to simplify writing tests 509 // regex tests for incorrect patterns 510 // 511 // usage: 512 // REGEX_ERR("pattern", expected error line, column, expected status); 513 // 514 //--------------------------------------------------------------------------- 515 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 516 517 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 518 UErrorCode expectedStatus, int32_t line) { 519 UnicodeString pattern(pat); 520 521 UErrorCode status = U_ZERO_ERROR; 522 UParseError pe; 523 RegexPattern *callerPattern = NULL; 524 525 // 526 // Compile the caller's pattern 527 // 528 UnicodeString patString(pat); 529 callerPattern = RegexPattern::compile(patString, 0, pe, status); 530 if (status != expectedStatus) { 531 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 532 } else { 533 if (status != U_ZERO_ERROR) { 534 if (pe.line != errLine || pe.offset != errCol) { 535 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 536 line, errLine, errCol, pe.line, pe.offset); 537 } 538 } 539 } 540 541 delete callerPattern; 542 543 // 544 // Compile again, using a UTF-8-based UText 545 // 546 UText patternText = UTEXT_INITIALIZER; 547 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 548 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 549 if (status != expectedStatus) { 550 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 551 } else { 552 if (status != U_ZERO_ERROR) { 553 if (pe.line != errLine || pe.offset != errCol) { 554 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 555 line, errLine, errCol, pe.line, pe.offset); 556 } 557 } 558 } 559 560 delete callerPattern; 561 utext_close(&patternText); 562 } 563 564 565 566 //--------------------------------------------------------------------------- 567 // 568 // Basic Check for basic functionality of regex pattern matching. 569 // Avoid the use of REGEX_FIND test macro, which has 570 // substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------
// Every case below is one REGEX_TESTLM(pattern, input, lookingAt expected,
// matches expected) invocation.  The macro runs the case through both
// doRegexLMTest() and doRegexLMTestUTF8() and passes __LINE__ along, so each
// failure message identifies the individual case by its source line.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//      UTextBasic   Check for quirks that are specific to the UText
//                   implementation.
//
//---------------------------------------------------------------------------
void RegexTest::UTextBasic() {
    // "abc" spelled as numeric code values rather than as a character literal.
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    UErrorCode status = U_ZERO_ERROR;
    UText pattern = UTEXT_INITIALIZER;
    utext_openUTF8(&pattern, str_abc, -1, &status);
    RegexMatcher matcher(&pattern, 0, status);
    REGEX_CHECK_STATUS;

    // Attach a UText input and check that inputText() reads back as "abc".
    UText input = UTEXT_INITIALIZER;
    utext_openUTF8(&input, str_abc, -1, &status);
    REGEX_CHECK_STATUS;
    matcher.reset(&input);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    // Reset the matcher with its own input text; the text must be unchanged.
    matcher.reset(matcher.inputText());
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    utext_close(&pattern);
    utext_close(&input);
}


//---------------------------------------------------------------------------
//
//      API_Match   Test that the API for class RegexMatcher
//                  is present and nominally working, but excluding functions
//                  implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
    UParseError         pe;
    UErrorCode          status=U_ZERO_ERROR;
    int32_t             flags = 0;

    //
    // Debug - slide failing test cases early
    //
#if 0
    {
    }
    return;
#endif

    //
    // Simple pattern compilation
    //
    {
        UnicodeString       re("abc");
        RegexPattern        *pat2;
        pat2 = RegexPattern::compile(re, flags, pe, status);
        REGEX_CHECK_STATUS;

        UnicodeString inStr1 = "abcdef this is a test";
        UnicodeString instr2 = "not abc";
        UnicodeString empty = "";


        //
        // Matcher creation and reset.
788 // 789 RegexMatcher *m1 = pat2->matcher(inStr1, status); 790 REGEX_CHECK_STATUS; 791 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 792 REGEX_ASSERT(m1->input() == inStr1); 793 m1->reset(instr2); 794 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 795 REGEX_ASSERT(m1->input() == instr2); 796 m1->reset(inStr1); 797 REGEX_ASSERT(m1->input() == inStr1); 798 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 799 m1->reset(empty); 800 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 801 REGEX_ASSERT(m1->input() == empty); 802 REGEX_ASSERT(&m1->pattern() == pat2); 803 804 // 805 // reset(pos, status) 806 // 807 m1->reset(inStr1); 808 m1->reset(4, status); 809 REGEX_CHECK_STATUS; 810 REGEX_ASSERT(m1->input() == inStr1); 811 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 812 813 m1->reset(-1, status); 814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 815 status = U_ZERO_ERROR; 816 817 m1->reset(0, status); 818 REGEX_CHECK_STATUS; 819 status = U_ZERO_ERROR; 820 821 int32_t len = m1->input().length(); 822 m1->reset(len-1, status); 823 REGEX_CHECK_STATUS; 824 status = U_ZERO_ERROR; 825 826 m1->reset(len, status); 827 REGEX_CHECK_STATUS; 828 status = U_ZERO_ERROR; 829 830 m1->reset(len+1, status); 831 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 832 status = U_ZERO_ERROR; 833 834 // 835 // match(pos, status) 836 // 837 m1->reset(instr2); 838 REGEX_ASSERT(m1->matches(4, status) == TRUE); 839 m1->reset(); 840 REGEX_ASSERT(m1->matches(3, status) == FALSE); 841 m1->reset(); 842 REGEX_ASSERT(m1->matches(5, status) == FALSE); 843 REGEX_ASSERT(m1->matches(4, status) == TRUE); 844 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 845 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 846 847 // Match() at end of string should fail, but should not 848 // be an error. 849 status = U_ZERO_ERROR; 850 len = m1->input().length(); 851 REGEX_ASSERT(m1->matches(len, status) == FALSE); 852 REGEX_CHECK_STATUS; 853 854 // Match beyond end of string should fail with an error. 
855 status = U_ZERO_ERROR; 856 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 858 859 // Successful match at end of string. 860 { 861 status = U_ZERO_ERROR; 862 RegexMatcher m("A?", 0, status); // will match zero length string. 863 REGEX_CHECK_STATUS; 864 m.reset(inStr1); 865 len = inStr1.length(); 866 REGEX_ASSERT(m.matches(len, status) == TRUE); 867 REGEX_CHECK_STATUS; 868 m.reset(empty); 869 REGEX_ASSERT(m.matches(0, status) == TRUE); 870 REGEX_CHECK_STATUS; 871 } 872 873 874 // 875 // lookingAt(pos, status) 876 // 877 status = U_ZERO_ERROR; 878 m1->reset(instr2); // "not abc" 879 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 880 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 881 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 882 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 883 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 885 status = U_ZERO_ERROR; 886 len = m1->input().length(); 887 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 888 REGEX_CHECK_STATUS; 889 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 890 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 891 892 delete m1; 893 delete pat2; 894 } 895 896 897 // 898 // Capture Group. 
899 // RegexMatcher::start(); 900 // RegexMatcher::end(); 901 // RegexMatcher::groupCount(); 902 // 903 { 904 int32_t flags=0; 905 UParseError pe; 906 UErrorCode status=U_ZERO_ERROR; 907 908 UnicodeString re("01(23(45)67)(.*)"); 909 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 910 REGEX_CHECK_STATUS; 911 UnicodeString data = "0123456789"; 912 913 RegexMatcher *matcher = pat->matcher(data, status); 914 REGEX_CHECK_STATUS; 915 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 916 static const int32_t matchStarts[] = {0, 2, 4, 8}; 917 static const int32_t matchEnds[] = {10, 8, 6, 10}; 918 int32_t i; 919 for (i=0; i<4; i++) { 920 int32_t actualStart = matcher->start(i, status); 921 REGEX_CHECK_STATUS; 922 if (actualStart != matchStarts[i]) { 923 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 924 __LINE__, i, matchStarts[i], actualStart); 925 } 926 int32_t actualEnd = matcher->end(i, status); 927 REGEX_CHECK_STATUS; 928 if (actualEnd != matchEnds[i]) { 929 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 930 __LINE__, i, matchEnds[i], actualEnd); 931 } 932 } 933 934 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 935 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 936 937 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 938 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 939 matcher->reset(); 940 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 941 942 matcher->lookingAt(status); 943 REGEX_ASSERT(matcher->group(status) == "0123456789"); 944 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 945 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 946 REGEX_ASSERT(matcher->group(2, status) == "45" ); 947 REGEX_ASSERT(matcher->group(3, status) == "89" ); 948 REGEX_CHECK_STATUS; 949 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 950 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 951 matcher->reset(); 952 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 953 954 delete matcher; 955 delete pat; 956 957 } 958 959 // 960 // find 961 // 962 { 963 int32_t flags=0; 964 UParseError pe; 965 UErrorCode status=U_ZERO_ERROR; 966 967 UnicodeString re("abc"); 968 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 969 REGEX_CHECK_STATUS; 970 UnicodeString data = ".abc..abc...abc.."; 971 // 012345678901234567 972 973 RegexMatcher *matcher = pat->matcher(data, status); 974 REGEX_CHECK_STATUS; 975 REGEX_ASSERT(matcher->find()); 976 REGEX_ASSERT(matcher->start(status) == 1); 977 REGEX_ASSERT(matcher->find()); 978 REGEX_ASSERT(matcher->start(status) == 6); 979 REGEX_ASSERT(matcher->find()); 980 REGEX_ASSERT(matcher->start(status) == 12); 981 REGEX_ASSERT(matcher->find() == FALSE); 982 REGEX_ASSERT(matcher->find() == FALSE); 983 984 matcher->reset(); 985 REGEX_ASSERT(matcher->find()); 986 REGEX_ASSERT(matcher->start(status) == 1); 987 988 
REGEX_ASSERT(matcher->find(0, status)); 989 REGEX_ASSERT(matcher->start(status) == 1); 990 REGEX_ASSERT(matcher->find(1, status)); 991 REGEX_ASSERT(matcher->start(status) == 1); 992 REGEX_ASSERT(matcher->find(2, status)); 993 REGEX_ASSERT(matcher->start(status) == 6); 994 REGEX_ASSERT(matcher->find(12, status)); 995 REGEX_ASSERT(matcher->start(status) == 12); 996 REGEX_ASSERT(matcher->find(13, status) == FALSE); 997 REGEX_ASSERT(matcher->find(16, status) == FALSE); 998 REGEX_ASSERT(matcher->find(17, status) == FALSE); 999 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1000 1001 status = U_ZERO_ERROR; 1002 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1003 status = U_ZERO_ERROR; 1004 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 1005 1006 REGEX_ASSERT(matcher->groupCount() == 0); 1007 1008 delete matcher; 1009 delete pat; 1010 } 1011 1012 1013 // 1014 // find, with \G in pattern (true if at the end of a previous match). 1015 // 1016 { 1017 int32_t flags=0; 1018 UParseError pe; 1019 UErrorCode status=U_ZERO_ERROR; 1020 1021 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 1022 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1023 REGEX_CHECK_STATUS; 1024 UnicodeString data = ".abcabc.abc.."; 1025 // 012345678901234567 1026 1027 RegexMatcher *matcher = pat->matcher(data, status); 1028 REGEX_CHECK_STATUS; 1029 REGEX_ASSERT(matcher->find()); 1030 REGEX_ASSERT(matcher->start(status) == 0); 1031 REGEX_ASSERT(matcher->start(1, status) == -1); 1032 REGEX_ASSERT(matcher->start(2, status) == 1); 1033 1034 REGEX_ASSERT(matcher->find()); 1035 REGEX_ASSERT(matcher->start(status) == 4); 1036 REGEX_ASSERT(matcher->start(1, status) == 4); 1037 REGEX_ASSERT(matcher->start(2, status) == -1); 1038 REGEX_CHECK_STATUS; 1039 1040 delete matcher; 1041 delete pat; 1042 } 1043 1044 // 1045 // find with zero length matches, match position should bump ahead 1046 // to prevent loops. 
1047 // 1048 { 1049 int32_t i; 1050 UErrorCode status=U_ZERO_ERROR; 1051 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1052 // using an always-true look-ahead. 1053 REGEX_CHECK_STATUS; 1054 UnicodeString s(" "); 1055 m.reset(s); 1056 for (i=0; ; i++) { 1057 if (m.find() == FALSE) { 1058 break; 1059 } 1060 REGEX_ASSERT(m.start(status) == i); 1061 REGEX_ASSERT(m.end(status) == i); 1062 } 1063 REGEX_ASSERT(i==5); 1064 1065 // Check that the bump goes over surrogate pairs OK 1066 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1067 s = s.unescape(); 1068 m.reset(s); 1069 for (i=0; ; i+=2) { 1070 if (m.find() == FALSE) { 1071 break; 1072 } 1073 REGEX_ASSERT(m.start(status) == i); 1074 REGEX_ASSERT(m.end(status) == i); 1075 } 1076 REGEX_ASSERT(i==10); 1077 } 1078 { 1079 // find() loop breaking test. 1080 // with pattern of /.?/, should see a series of one char matches, then a single 1081 // match of zero length at the end of the input string. 1082 int32_t i; 1083 UErrorCode status=U_ZERO_ERROR; 1084 RegexMatcher m(".?", 0, status); 1085 REGEX_CHECK_STATUS; 1086 UnicodeString s(" "); 1087 m.reset(s); 1088 for (i=0; ; i++) { 1089 if (m.find() == FALSE) { 1090 break; 1091 } 1092 REGEX_ASSERT(m.start(status) == i); 1093 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1094 } 1095 REGEX_ASSERT(i==5); 1096 } 1097 1098 1099 // 1100 // Matchers with no input string behave as if they had an empty input string. 
1101 // 1102 1103 { 1104 UErrorCode status = U_ZERO_ERROR; 1105 RegexMatcher m(".?", 0, status); 1106 REGEX_CHECK_STATUS; 1107 REGEX_ASSERT(m.find()); 1108 REGEX_ASSERT(m.start(status) == 0); 1109 REGEX_ASSERT(m.input() == ""); 1110 } 1111 { 1112 UErrorCode status = U_ZERO_ERROR; 1113 RegexPattern *p = RegexPattern::compile(".", 0, status); 1114 RegexMatcher *m = p->matcher(status); 1115 REGEX_CHECK_STATUS; 1116 1117 REGEX_ASSERT(m->find() == FALSE); 1118 REGEX_ASSERT(m->input() == ""); 1119 delete m; 1120 delete p; 1121 } 1122 1123 // 1124 // Regions 1125 // 1126 { 1127 UErrorCode status = U_ZERO_ERROR; 1128 UnicodeString testString("This is test data"); 1129 RegexMatcher m(".*", testString, 0, status); 1130 REGEX_CHECK_STATUS; 1131 REGEX_ASSERT(m.regionStart() == 0); 1132 REGEX_ASSERT(m.regionEnd() == testString.length()); 1133 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1134 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1135 1136 m.region(2,4, status); 1137 REGEX_CHECK_STATUS; 1138 REGEX_ASSERT(m.matches(status)); 1139 REGEX_ASSERT(m.start(status)==2); 1140 REGEX_ASSERT(m.end(status)==4); 1141 REGEX_CHECK_STATUS; 1142 1143 m.reset(); 1144 REGEX_ASSERT(m.regionStart() == 0); 1145 REGEX_ASSERT(m.regionEnd() == testString.length()); 1146 1147 UnicodeString shorterString("short"); 1148 m.reset(shorterString); 1149 REGEX_ASSERT(m.regionStart() == 0); 1150 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1151 1152 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1153 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1154 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1155 REGEX_ASSERT(&m == &m.reset()); 1156 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1157 1158 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1159 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1160 REGEX_ASSERT(&m == &m.reset()); 1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1162 1163 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1164 REGEX_ASSERT(&m == 
&m.useTransparentBounds(TRUE)); 1165 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1166 REGEX_ASSERT(&m == &m.reset()); 1167 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1168 1169 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1170 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1171 REGEX_ASSERT(&m == &m.reset()); 1172 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1173 1174 } 1175 1176 // 1177 // hitEnd() and requireEnd() 1178 // 1179 { 1180 UErrorCode status = U_ZERO_ERROR; 1181 UnicodeString testString("aabb"); 1182 RegexMatcher m1(".*", testString, 0, status); 1183 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1184 REGEX_ASSERT(m1.hitEnd() == TRUE); 1185 REGEX_ASSERT(m1.requireEnd() == FALSE); 1186 REGEX_CHECK_STATUS; 1187 1188 status = U_ZERO_ERROR; 1189 RegexMatcher m2("a*", testString, 0, status); 1190 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1191 REGEX_ASSERT(m2.hitEnd() == FALSE); 1192 REGEX_ASSERT(m2.requireEnd() == FALSE); 1193 REGEX_CHECK_STATUS; 1194 1195 status = U_ZERO_ERROR; 1196 RegexMatcher m3(".*$", testString, 0, status); 1197 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1198 REGEX_ASSERT(m3.hitEnd() == TRUE); 1199 REGEX_ASSERT(m3.requireEnd() == TRUE); 1200 REGEX_CHECK_STATUS; 1201 } 1202 1203 1204 // 1205 // Compilation error on reset with UChar * 1206 // These were a hazard that people were stumbling over with runtime errors. 1207 // Changed them to compiler errors by adding private methods that more closely 1208 // matched the incorrect use of the functions. 1209 // 1210 #if 0 1211 { 1212 UErrorCode status = U_ZERO_ERROR; 1213 UChar ucharString[20]; 1214 RegexMatcher m(".", 0, status); 1215 m.reset(ucharString); // should not compile. 1216 1217 RegexPattern *p = RegexPattern::compile(".", 0, status); 1218 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1219 1220 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1221 } 1222 #endif 1223 1224 // 1225 // Time Outs. 
1226 // Note: These tests will need to be changed when the regexp engine is 1227 // able to detect and cut short the exponential time behavior on 1228 // this type of match. 1229 // 1230 { 1231 UErrorCode status = U_ZERO_ERROR; 1232 // Enough 'a's in the string to cause the match to time out. 1233 // (Each on additonal 'a' doubles the time) 1234 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1235 RegexMatcher matcher("(a+)+b", testString, 0, status); 1236 REGEX_CHECK_STATUS; 1237 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1238 matcher.setTimeLimit(100, status); 1239 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1240 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1241 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1242 } 1243 { 1244 UErrorCode status = U_ZERO_ERROR; 1245 // Few enough 'a's to slip in under the time limit. 1246 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1247 RegexMatcher matcher("(a+)+b", testString, 0, status); 1248 REGEX_CHECK_STATUS; 1249 matcher.setTimeLimit(100, status); 1250 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1251 REGEX_CHECK_STATUS; 1252 } 1253 1254 // 1255 // Stack Limits 1256 // 1257 { 1258 UErrorCode status = U_ZERO_ERROR; 1259 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1260 1261 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1262 // of the '+', and makes the stack frames larger. 
1263 RegexMatcher matcher("(A)+A$", testString, 0, status); 1264 1265 // With the default stack, this match should fail to run 1266 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1267 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1268 1269 // With unlimited stack, it should run 1270 status = U_ZERO_ERROR; 1271 matcher.setStackLimit(0, status); 1272 REGEX_CHECK_STATUS; 1273 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1274 REGEX_CHECK_STATUS; 1275 REGEX_ASSERT(matcher.getStackLimit() == 0); 1276 1277 // With a limited stack, it the match should fail 1278 status = U_ZERO_ERROR; 1279 matcher.setStackLimit(10000, status); 1280 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1281 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1282 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1283 } 1284 1285 // A pattern that doesn't save state should work with 1286 // a minimal sized stack 1287 { 1288 UErrorCode status = U_ZERO_ERROR; 1289 UnicodeString testString = "abc"; 1290 RegexMatcher matcher("abc", testString, 0, status); 1291 REGEX_CHECK_STATUS; 1292 matcher.setStackLimit(30, status); 1293 REGEX_CHECK_STATUS; 1294 REGEX_ASSERT(matcher.matches(status) == TRUE); 1295 REGEX_CHECK_STATUS; 1296 REGEX_ASSERT(matcher.getStackLimit() == 30); 1297 1298 // Negative stack sizes should fail 1299 status = U_ZERO_ERROR; 1300 matcher.setStackLimit(1000, status); 1301 REGEX_CHECK_STATUS; 1302 matcher.setStackLimit(-1, status); 1303 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1304 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1305 } 1306 1307 1308 } 1309 1310 1311 1312 1313 1314 1315 //--------------------------------------------------------------------------- 1316 // 1317 // API_Replace API test for class RegexMatcher, testing the 1318 // Replace family of functions. 
1319 // 1320 //--------------------------------------------------------------------------- 1321 void RegexTest::API_Replace() { 1322 // 1323 // Replace 1324 // 1325 int32_t flags=0; 1326 UParseError pe; 1327 UErrorCode status=U_ZERO_ERROR; 1328 1329 UnicodeString re("abc"); 1330 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1331 REGEX_CHECK_STATUS; 1332 UnicodeString data = ".abc..abc...abc.."; 1333 // 012345678901234567 1334 RegexMatcher *matcher = pat->matcher(data, status); 1335 1336 // 1337 // Plain vanilla matches. 1338 // 1339 UnicodeString dest; 1340 dest = matcher->replaceFirst("yz", status); 1341 REGEX_CHECK_STATUS; 1342 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1343 1344 dest = matcher->replaceAll("yz", status); 1345 REGEX_CHECK_STATUS; 1346 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1347 1348 // 1349 // Plain vanilla non-matches. 1350 // 1351 UnicodeString d2 = ".abx..abx...abx.."; 1352 matcher->reset(d2); 1353 dest = matcher->replaceFirst("yz", status); 1354 REGEX_CHECK_STATUS; 1355 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1356 1357 dest = matcher->replaceAll("yz", status); 1358 REGEX_CHECK_STATUS; 1359 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1360 1361 // 1362 // Empty source string 1363 // 1364 UnicodeString d3 = ""; 1365 matcher->reset(d3); 1366 dest = matcher->replaceFirst("yz", status); 1367 REGEX_CHECK_STATUS; 1368 REGEX_ASSERT(dest == ""); 1369 1370 dest = matcher->replaceAll("yz", status); 1371 REGEX_CHECK_STATUS; 1372 REGEX_ASSERT(dest == ""); 1373 1374 // 1375 // Empty substitution string 1376 // 1377 matcher->reset(data); // ".abc..abc...abc.." 
1378 dest = matcher->replaceFirst("", status); 1379 REGEX_CHECK_STATUS; 1380 REGEX_ASSERT(dest == "...abc...abc.."); 1381 1382 dest = matcher->replaceAll("", status); 1383 REGEX_CHECK_STATUS; 1384 REGEX_ASSERT(dest == "........"); 1385 1386 // 1387 // match whole string 1388 // 1389 UnicodeString d4 = "abc"; 1390 matcher->reset(d4); 1391 dest = matcher->replaceFirst("xyz", status); 1392 REGEX_CHECK_STATUS; 1393 REGEX_ASSERT(dest == "xyz"); 1394 1395 dest = matcher->replaceAll("xyz", status); 1396 REGEX_CHECK_STATUS; 1397 REGEX_ASSERT(dest == "xyz"); 1398 1399 // 1400 // Capture Group, simple case 1401 // 1402 UnicodeString re2("a(..)"); 1403 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1404 REGEX_CHECK_STATUS; 1405 UnicodeString d5 = "abcdefg"; 1406 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1407 REGEX_CHECK_STATUS; 1408 dest = matcher2->replaceFirst("$1$1", status); 1409 REGEX_CHECK_STATUS; 1410 REGEX_ASSERT(dest == "bcbcdefg"); 1411 1412 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1413 REGEX_CHECK_STATUS; 1414 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1415 1416 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1417 REGEX_CHECK_STATUS; 1418 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1419 1420 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1421 replacement = replacement.unescape(); 1422 dest = matcher2->replaceFirst(replacement, status); 1423 REGEX_CHECK_STATUS; 1424 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1425 1426 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1427 1428 1429 // 1430 // Replacement String with \u hex escapes 1431 // 1432 { 1433 UnicodeString src = "abc 1 abc 2 abc 3"; 1434 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1435 matcher->reset(src); 1436 UnicodeString result = 
matcher->replaceAll(substitute, status); 1437 REGEX_CHECK_STATUS; 1438 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1439 } 1440 { 1441 UnicodeString src = "abc !"; 1442 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1443 matcher->reset(src); 1444 UnicodeString result = matcher->replaceAll(substitute, status); 1445 REGEX_CHECK_STATUS; 1446 UnicodeString expected = UnicodeString("--"); 1447 expected.append((UChar32)0x10000); 1448 expected.append("-- !"); 1449 REGEX_ASSERT(result == expected); 1450 } 1451 // TODO: need more through testing of capture substitutions. 1452 1453 // Bug 4057 1454 // 1455 { 1456 status = U_ZERO_ERROR; 1457 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1458 RegexMatcher m("ss(.*?)ee", 0, status); 1459 REGEX_CHECK_STATUS; 1460 UnicodeString result; 1461 1462 // Multiple finds do NOT bump up the previous appendReplacement postion. 1463 m.reset(s); 1464 m.find(); 1465 m.find(); 1466 m.appendReplacement(result, "ooh", status); 1467 REGEX_CHECK_STATUS; 1468 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1469 1470 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1471 status = U_ZERO_ERROR; 1472 result.truncate(0); 1473 m.reset(10, status); 1474 m.find(); 1475 m.find(); 1476 m.appendReplacement(result, "ooh", status); 1477 REGEX_CHECK_STATUS; 1478 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1479 1480 // find() at interior of string, appendReplacemnt still starts at beginning. 
1481 status = U_ZERO_ERROR; 1482 result.truncate(0); 1483 m.reset(); 1484 m.find(10, status); 1485 m.find(); 1486 m.appendReplacement(result, "ooh", status); 1487 REGEX_CHECK_STATUS; 1488 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1489 1490 m.appendTail(result); 1491 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1492 1493 } 1494 1495 delete matcher2; 1496 delete pat2; 1497 delete matcher; 1498 delete pat; 1499 } 1500 1501 1502 //--------------------------------------------------------------------------- 1503 // 1504 // API_Pattern Test that the API for class RegexPattern is 1505 // present and nominally working. 1506 // 1507 //--------------------------------------------------------------------------- 1508 void RegexTest::API_Pattern() { 1509 RegexPattern pata; // Test default constructor to not crash. 1510 RegexPattern patb; 1511 1512 REGEX_ASSERT(pata == patb); 1513 REGEX_ASSERT(pata == pata); 1514 1515 UnicodeString re1("abc[a-l][m-z]"); 1516 UnicodeString re2("def"); 1517 UErrorCode status = U_ZERO_ERROR; 1518 UParseError pe; 1519 1520 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1521 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1522 REGEX_CHECK_STATUS; 1523 REGEX_ASSERT(*pat1 == *pat1); 1524 REGEX_ASSERT(*pat1 != pata); 1525 1526 // Assign 1527 patb = *pat1; 1528 REGEX_ASSERT(patb == *pat1); 1529 1530 // Copy Construct 1531 RegexPattern patc(*pat1); 1532 REGEX_ASSERT(patc == *pat1); 1533 REGEX_ASSERT(patb == patc); 1534 REGEX_ASSERT(pat1 != pat2); 1535 patb = *pat2; 1536 REGEX_ASSERT(patb != patc); 1537 REGEX_ASSERT(patb == *pat2); 1538 1539 // Compile with no flags. 
1540 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1541 REGEX_ASSERT(*pat1a == *pat1); 1542 1543 REGEX_ASSERT(pat1a->flags() == 0); 1544 1545 // Compile with different flags should be not equal 1546 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1547 REGEX_CHECK_STATUS; 1548 1549 REGEX_ASSERT(*pat1b != *pat1a); 1550 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1551 REGEX_ASSERT(pat1a->flags() == 0); 1552 delete pat1b; 1553 1554 // clone 1555 RegexPattern *pat1c = pat1->clone(); 1556 REGEX_ASSERT(*pat1c == *pat1); 1557 REGEX_ASSERT(*pat1c != *pat2); 1558 1559 delete pat1c; 1560 delete pat1a; 1561 delete pat1; 1562 delete pat2; 1563 1564 1565 // 1566 // Verify that a matcher created from a cloned pattern works. 1567 // (Jitterbug 3423) 1568 // 1569 { 1570 UErrorCode status = U_ZERO_ERROR; 1571 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1572 RegexPattern *pClone = pSource->clone(); 1573 delete pSource; 1574 RegexMatcher *mFromClone = pClone->matcher(status); 1575 REGEX_CHECK_STATUS; 1576 UnicodeString s = "Hello World"; 1577 mFromClone->reset(s); 1578 REGEX_ASSERT(mFromClone->find() == TRUE); 1579 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1580 REGEX_ASSERT(mFromClone->find() == TRUE); 1581 REGEX_ASSERT(mFromClone->group(status) == "World"); 1582 REGEX_ASSERT(mFromClone->find() == FALSE); 1583 delete mFromClone; 1584 delete pClone; 1585 } 1586 1587 // 1588 // matches convenience API 1589 // 1590 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1591 REGEX_CHECK_STATUS; 1592 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1593 REGEX_CHECK_STATUS; 1594 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1595 REGEX_CHECK_STATUS; 1596 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1597 REGEX_CHECK_STATUS; 1598 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1599 REGEX_CHECK_STATUS; 1600 status = U_INDEX_OUTOFBOUNDS_ERROR; 1601 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1602 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1603 1604 1605 // 1606 // Split() 1607 // 1608 status = U_ZERO_ERROR; 1609 pat1 = RegexPattern::compile(" +", pe, status); 1610 REGEX_CHECK_STATUS; 1611 UnicodeString fields[10]; 1612 1613 int32_t n; 1614 n = pat1->split("Now is the time", fields, 10, status); 1615 REGEX_CHECK_STATUS; 1616 REGEX_ASSERT(n==4); 1617 REGEX_ASSERT(fields[0]=="Now"); 1618 REGEX_ASSERT(fields[1]=="is"); 1619 REGEX_ASSERT(fields[2]=="the"); 1620 REGEX_ASSERT(fields[3]=="time"); 1621 REGEX_ASSERT(fields[4]==""); 1622 1623 n = pat1->split("Now is the time", fields, 2, status); 1624 REGEX_CHECK_STATUS; 1625 REGEX_ASSERT(n==2); 1626 REGEX_ASSERT(fields[0]=="Now"); 1627 REGEX_ASSERT(fields[1]=="is the time"); 1628 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1629 1630 fields[1] = "*"; 1631 status = U_ZERO_ERROR; 1632 n = pat1->split("Now is the time", fields, 1, status); 1633 REGEX_CHECK_STATUS; 1634 REGEX_ASSERT(n==1); 1635 REGEX_ASSERT(fields[0]=="Now is the time"); 1636 REGEX_ASSERT(fields[1]=="*"); 1637 status = U_ZERO_ERROR; 1638 1639 n = pat1->split(" Now is the time ", fields, 10, status); 1640 REGEX_CHECK_STATUS; 1641 REGEX_ASSERT(n==6); 1642 REGEX_ASSERT(fields[0]==""); 1643 REGEX_ASSERT(fields[1]=="Now"); 1644 REGEX_ASSERT(fields[2]=="is"); 1645 REGEX_ASSERT(fields[3]=="the"); 1646 REGEX_ASSERT(fields[4]=="time"); 1647 REGEX_ASSERT(fields[5]==""); 1648 1649 n = pat1->split(" ", fields, 10, status); 1650 REGEX_CHECK_STATUS; 1651 REGEX_ASSERT(n==2); 1652 REGEX_ASSERT(fields[0]==""); 1653 REGEX_ASSERT(fields[1]==""); 1654 1655 fields[0] = "foo"; 1656 n = pat1->split("", fields, 10, status); 1657 REGEX_CHECK_STATUS; 1658 REGEX_ASSERT(n==0); 1659 REGEX_ASSERT(fields[0]=="foo"); 1660 1661 delete 
pat1; 1662 1663 // split, with a pattern with (capture) 1664 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1665 REGEX_CHECK_STATUS; 1666 1667 status = U_ZERO_ERROR; 1668 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1669 REGEX_CHECK_STATUS; 1670 REGEX_ASSERT(n==7); 1671 REGEX_ASSERT(fields[0]==""); 1672 REGEX_ASSERT(fields[1]=="a"); 1673 REGEX_ASSERT(fields[2]=="Now is "); 1674 REGEX_ASSERT(fields[3]=="b"); 1675 REGEX_ASSERT(fields[4]=="the time"); 1676 REGEX_ASSERT(fields[5]=="c"); 1677 REGEX_ASSERT(fields[6]==""); 1678 REGEX_ASSERT(status==U_ZERO_ERROR); 1679 1680 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1681 REGEX_CHECK_STATUS; 1682 REGEX_ASSERT(n==7); 1683 REGEX_ASSERT(fields[0]==" "); 1684 REGEX_ASSERT(fields[1]=="a"); 1685 REGEX_ASSERT(fields[2]=="Now is "); 1686 REGEX_ASSERT(fields[3]=="b"); 1687 REGEX_ASSERT(fields[4]=="the time"); 1688 REGEX_ASSERT(fields[5]=="c"); 1689 REGEX_ASSERT(fields[6]==""); 1690 1691 status = U_ZERO_ERROR; 1692 fields[6] = "foo"; 1693 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1694 REGEX_CHECK_STATUS; 1695 REGEX_ASSERT(n==6); 1696 REGEX_ASSERT(fields[0]==" "); 1697 REGEX_ASSERT(fields[1]=="a"); 1698 REGEX_ASSERT(fields[2]=="Now is "); 1699 REGEX_ASSERT(fields[3]=="b"); 1700 REGEX_ASSERT(fields[4]=="the time"); 1701 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 
1702 REGEX_ASSERT(fields[6]=="foo"); 1703 1704 status = U_ZERO_ERROR; 1705 fields[5] = "foo"; 1706 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1707 REGEX_CHECK_STATUS; 1708 REGEX_ASSERT(n==5); 1709 REGEX_ASSERT(fields[0]==" "); 1710 REGEX_ASSERT(fields[1]=="a"); 1711 REGEX_ASSERT(fields[2]=="Now is "); 1712 REGEX_ASSERT(fields[3]=="b"); 1713 REGEX_ASSERT(fields[4]=="the time<c>"); 1714 REGEX_ASSERT(fields[5]=="foo"); 1715 1716 status = U_ZERO_ERROR; 1717 fields[5] = "foo"; 1718 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1719 REGEX_CHECK_STATUS; 1720 REGEX_ASSERT(n==5); 1721 REGEX_ASSERT(fields[0]==" "); 1722 REGEX_ASSERT(fields[1]=="a"); 1723 REGEX_ASSERT(fields[2]=="Now is "); 1724 REGEX_ASSERT(fields[3]=="b"); 1725 REGEX_ASSERT(fields[4]=="the time"); 1726 REGEX_ASSERT(fields[5]=="foo"); 1727 1728 status = U_ZERO_ERROR; 1729 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1730 REGEX_CHECK_STATUS; 1731 REGEX_ASSERT(n==4); 1732 REGEX_ASSERT(fields[0]==" "); 1733 REGEX_ASSERT(fields[1]=="a"); 1734 REGEX_ASSERT(fields[2]=="Now is "); 1735 REGEX_ASSERT(fields[3]=="the time<c>"); 1736 status = U_ZERO_ERROR; 1737 delete pat1; 1738 1739 pat1 = RegexPattern::compile("([-,])", pe, status); 1740 REGEX_CHECK_STATUS; 1741 n = pat1->split("1-10,20", fields, 10, status); 1742 REGEX_CHECK_STATUS; 1743 REGEX_ASSERT(n==5); 1744 REGEX_ASSERT(fields[0]=="1"); 1745 REGEX_ASSERT(fields[1]=="-"); 1746 REGEX_ASSERT(fields[2]=="10"); 1747 REGEX_ASSERT(fields[3]==","); 1748 REGEX_ASSERT(fields[4]=="20"); 1749 delete pat1; 1750 1751 // Test split of string with empty trailing fields 1752 pat1 = RegexPattern::compile(",", pe, status); 1753 REGEX_CHECK_STATUS; 1754 n = pat1->split("a,b,c,", fields, 10, status); 1755 REGEX_CHECK_STATUS; 1756 REGEX_ASSERT(n==4); 1757 REGEX_ASSERT(fields[0]=="a"); 1758 REGEX_ASSERT(fields[1]=="b"); 1759 REGEX_ASSERT(fields[2]=="c"); 1760 REGEX_ASSERT(fields[3]==""); 1761 1762 n = pat1->split("a,,,", 
fields, 10, status); 1763 REGEX_CHECK_STATUS; 1764 REGEX_ASSERT(n==4); 1765 REGEX_ASSERT(fields[0]=="a"); 1766 REGEX_ASSERT(fields[1]==""); 1767 REGEX_ASSERT(fields[2]==""); 1768 REGEX_ASSERT(fields[3]==""); 1769 delete pat1; 1770 1771 // Split Separator with zero length match. 1772 pat1 = RegexPattern::compile(":?", pe, status); 1773 REGEX_CHECK_STATUS; 1774 n = pat1->split("abc", fields, 10, status); 1775 REGEX_CHECK_STATUS; 1776 REGEX_ASSERT(n==5); 1777 REGEX_ASSERT(fields[0]==""); 1778 REGEX_ASSERT(fields[1]=="a"); 1779 REGEX_ASSERT(fields[2]=="b"); 1780 REGEX_ASSERT(fields[3]=="c"); 1781 REGEX_ASSERT(fields[4]==""); 1782 1783 delete pat1; 1784 1785 // 1786 // RegexPattern::pattern() 1787 // 1788 pat1 = new RegexPattern(); 1789 REGEX_ASSERT(pat1->pattern() == ""); 1790 delete pat1; 1791 1792 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1793 REGEX_CHECK_STATUS; 1794 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1795 delete pat1; 1796 1797 1798 // 1799 // classID functions 1800 // 1801 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1802 REGEX_CHECK_STATUS; 1803 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1804 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1805 UnicodeString Hello("Hello, world."); 1806 RegexMatcher *m = pat1->matcher(Hello, status); 1807 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1808 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1809 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1810 delete m; 1811 delete pat1; 1812 1813 } 1814 1815 //--------------------------------------------------------------------------- 1816 // 1817 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1818 // is present and working, but excluding functions 1819 // implementing replace operations. 
1820 // 1821 //--------------------------------------------------------------------------- 1822 void RegexTest::API_Match_UTF8() { 1823 UParseError pe; 1824 UErrorCode status=U_ZERO_ERROR; 1825 int32_t flags = 0; 1826 1827 // 1828 // Debug - slide failing test cases early 1829 // 1830 #if 0 1831 { 1832 } 1833 return; 1834 #endif 1835 1836 // 1837 // Simple pattern compilation 1838 // 1839 { 1840 UText re = UTEXT_INITIALIZER; 1841 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1842 REGEX_VERBOSE_TEXT(&re); 1843 RegexPattern *pat2; 1844 pat2 = RegexPattern::compile(&re, flags, pe, status); 1845 REGEX_CHECK_STATUS; 1846 1847 UText input1 = UTEXT_INITIALIZER; 1848 UText input2 = UTEXT_INITIALIZER; 1849 UText empty = UTEXT_INITIALIZER; 1850 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1851 REGEX_VERBOSE_TEXT(&input1); 1852 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1853 REGEX_VERBOSE_TEXT(&input2); 1854 utext_openUChars(&empty, NULL, 0, &status); 1855 1856 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1857 int32_t input2Len = strlen("not abc"); 1858 1859 1860 // 1861 // Matcher creation and reset. 
1862 // 1863 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1864 REGEX_CHECK_STATUS; 1865 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1866 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1867 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1868 m1->reset(&input2); 1869 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1870 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1871 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1872 m1->reset(&input1); 1873 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1874 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1875 m1->reset(&empty); 1876 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1877 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1878 1879 // 1880 // reset(pos, status) 1881 // 1882 m1->reset(&input1); 1883 m1->reset(4, status); 1884 REGEX_CHECK_STATUS; 1885 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1886 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1887 1888 m1->reset(-1, status); 1889 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1890 status = U_ZERO_ERROR; 1891 1892 m1->reset(0, status); 1893 REGEX_CHECK_STATUS; 1894 status = U_ZERO_ERROR; 1895 1896 m1->reset(input1Len-1, status); 1897 REGEX_CHECK_STATUS; 1898 status = U_ZERO_ERROR; 1899 1900 m1->reset(input1Len, status); 1901 REGEX_CHECK_STATUS; 1902 status = U_ZERO_ERROR; 1903 1904 m1->reset(input1Len+1, status); 1905 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1906 status = U_ZERO_ERROR; 1907 1908 // 1909 // match(pos, status) 1910 // 1911 m1->reset(&input2); 1912 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1913 m1->reset(); 1914 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1915 m1->reset(); 1916 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1917 REGEX_ASSERT(m1->matches(4, status) == 
TRUE); 1918 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1920 1921 // Match() at end of string should fail, but should not 1922 // be an error. 1923 status = U_ZERO_ERROR; 1924 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1925 REGEX_CHECK_STATUS; 1926 1927 // Match beyond end of string should fail with an error. 1928 status = U_ZERO_ERROR; 1929 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1930 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1931 1932 // Successful match at end of string. 1933 { 1934 status = U_ZERO_ERROR; 1935 RegexMatcher m("A?", 0, status); // will match zero length string. 1936 REGEX_CHECK_STATUS; 1937 m.reset(&input1); 1938 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1939 REGEX_CHECK_STATUS; 1940 m.reset(&empty); 1941 REGEX_ASSERT(m.matches(0, status) == TRUE); 1942 REGEX_CHECK_STATUS; 1943 } 1944 1945 1946 // 1947 // lookingAt(pos, status) 1948 // 1949 status = U_ZERO_ERROR; 1950 m1->reset(&input2); // "not abc" 1951 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1952 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1953 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1954 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1955 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1956 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1957 status = U_ZERO_ERROR; 1958 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1959 REGEX_CHECK_STATUS; 1960 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1961 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1962 1963 delete m1; 1964 delete pat2; 1965 1966 utext_close(&re); 1967 utext_close(&input1); 1968 utext_close(&input2); 1969 utext_close(&empty); 1970 } 1971 1972 1973 // 1974 // Capture Group. 
1975 // RegexMatcher::start(); 1976 // RegexMatcher::end(); 1977 // RegexMatcher::groupCount(); 1978 // 1979 { 1980 int32_t flags=0; 1981 UParseError pe; 1982 UErrorCode status=U_ZERO_ERROR; 1983 UText re=UTEXT_INITIALIZER; 1984 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 1985 utext_openUTF8(&re, str_01234567_pat, -1, &status); 1986 1987 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1988 REGEX_CHECK_STATUS; 1989 1990 UText input = UTEXT_INITIALIZER; 1991 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1992 utext_openUTF8(&input, str_0123456789, -1, &status); 1993 1994 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 1995 REGEX_CHECK_STATUS; 1996 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 1997 static const int32_t matchStarts[] = {0, 2, 4, 8}; 1998 static const int32_t matchEnds[] = {10, 8, 6, 10}; 1999 int32_t i; 2000 for (i=0; i<4; i++) { 2001 int32_t actualStart = matcher->start(i, status); 2002 REGEX_CHECK_STATUS; 2003 if (actualStart != matchStarts[i]) { 2004 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 2005 __FILE__, __LINE__, i, matchStarts[i], actualStart); 2006 } 2007 int32_t actualEnd = matcher->end(i, status); 2008 REGEX_CHECK_STATUS; 2009 if (actualEnd != matchEnds[i]) { 2010 errln("RegexTest failure at %s:%d index %d. 
Expected %d, got %d\n", 2011 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 2012 } 2013 } 2014 2015 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 2016 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 2017 2018 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2019 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2020 matcher->reset(); 2021 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 2022 2023 matcher->lookingAt(status); 2024 2025 UnicodeString dest; 2026 UText destText = UTEXT_INITIALIZER; 2027 utext_openUnicodeString(&destText, &dest, &status); 2028 UText *result; 2029 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2030 // Test shallow-clone API 2031 int64_t group_len; 2032 result = matcher->group((UText *)NULL, group_len, status); 2033 REGEX_CHECK_STATUS; 2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2035 utext_close(result); 2036 result = matcher->group(0, &destText, group_len, status); 2037 REGEX_CHECK_STATUS; 2038 REGEX_ASSERT(result == &destText); 2039 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2040 // destText is now immutable, reopen it 2041 utext_close(&destText); 2042 utext_openUnicodeString(&destText, &dest, &status); 2043 2044 result = matcher->group(0, NULL, status); 2045 REGEX_CHECK_STATUS; 2046 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2047 utext_close(result); 2048 result = matcher->group(0, &destText, status); 2049 REGEX_CHECK_STATUS; 2050 REGEX_ASSERT(result == &destText); 2051 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2052 2053 result = matcher->group(1, NULL, status); 2054 REGEX_CHECK_STATUS; 2055 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ 2056 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2057 utext_close(result); 2058 result = matcher->group(1, &destText, status); 2059 REGEX_CHECK_STATUS; 2060 
REGEX_ASSERT(result == &destText); 2061 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2062 2063 result = matcher->group(2, NULL, status); 2064 REGEX_CHECK_STATUS; 2065 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ 2066 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2067 utext_close(result); 2068 result = matcher->group(2, &destText, status); 2069 REGEX_CHECK_STATUS; 2070 REGEX_ASSERT(result == &destText); 2071 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2072 2073 result = matcher->group(3, NULL, status); 2074 REGEX_CHECK_STATUS; 2075 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ 2076 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2077 utext_close(result); 2078 result = matcher->group(3, &destText, status); 2079 REGEX_CHECK_STATUS; 2080 REGEX_ASSERT(result == &destText); 2081 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2082 2083 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2084 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2085 matcher->reset(); 2086 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2087 2088 delete matcher; 2089 delete pat; 2090 2091 utext_close(&destText); 2092 utext_close(&input); 2093 utext_close(&re); 2094 } 2095 2096 // 2097 // find 2098 // 2099 { 2100 int32_t flags=0; 2101 UParseError pe; 2102 UErrorCode status=U_ZERO_ERROR; 2103 UText re=UTEXT_INITIALIZER; 2104 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2105 utext_openUTF8(&re, str_abc, -1, &status); 2106 2107 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2108 REGEX_CHECK_STATUS; 2109 UText input = UTEXT_INITIALIZER; 2110 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. 
*/ 2111 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2112 // 012345678901234567 2113 2114 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2115 REGEX_CHECK_STATUS; 2116 REGEX_ASSERT(matcher->find()); 2117 REGEX_ASSERT(matcher->start(status) == 1); 2118 REGEX_ASSERT(matcher->find()); 2119 REGEX_ASSERT(matcher->start(status) == 6); 2120 REGEX_ASSERT(matcher->find()); 2121 REGEX_ASSERT(matcher->start(status) == 12); 2122 REGEX_ASSERT(matcher->find() == FALSE); 2123 REGEX_ASSERT(matcher->find() == FALSE); 2124 2125 matcher->reset(); 2126 REGEX_ASSERT(matcher->find()); 2127 REGEX_ASSERT(matcher->start(status) == 1); 2128 2129 REGEX_ASSERT(matcher->find(0, status)); 2130 REGEX_ASSERT(matcher->start(status) == 1); 2131 REGEX_ASSERT(matcher->find(1, status)); 2132 REGEX_ASSERT(matcher->start(status) == 1); 2133 REGEX_ASSERT(matcher->find(2, status)); 2134 REGEX_ASSERT(matcher->start(status) == 6); 2135 REGEX_ASSERT(matcher->find(12, status)); 2136 REGEX_ASSERT(matcher->start(status) == 12); 2137 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2138 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2139 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2140 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2141 2142 status = U_ZERO_ERROR; 2143 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2144 status = U_ZERO_ERROR; 2145 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2146 2147 REGEX_ASSERT(matcher->groupCount() == 0); 2148 2149 delete matcher; 2150 delete pat; 2151 2152 utext_close(&input); 2153 utext_close(&re); 2154 } 2155 2156 2157 // 2158 // find, with \G in pattern (true if at the end of a previous match). 
2159 // 2160 { 2161 int32_t flags=0; 2162 UParseError pe; 2163 UErrorCode status=U_ZERO_ERROR; 2164 UText re=UTEXT_INITIALIZER; 2165 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2166 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2167 2168 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2169 2170 REGEX_CHECK_STATUS; 2171 UText input = UTEXT_INITIALIZER; 2172 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2173 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2174 // 012345678901234567 2175 2176 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2177 REGEX_CHECK_STATUS; 2178 REGEX_ASSERT(matcher->find()); 2179 REGEX_ASSERT(matcher->start(status) == 0); 2180 REGEX_ASSERT(matcher->start(1, status) == -1); 2181 REGEX_ASSERT(matcher->start(2, status) == 1); 2182 2183 REGEX_ASSERT(matcher->find()); 2184 REGEX_ASSERT(matcher->start(status) == 4); 2185 REGEX_ASSERT(matcher->start(1, status) == 4); 2186 REGEX_ASSERT(matcher->start(2, status) == -1); 2187 REGEX_CHECK_STATUS; 2188 2189 delete matcher; 2190 delete pat; 2191 2192 utext_close(&input); 2193 utext_close(&re); 2194 } 2195 2196 // 2197 // find with zero length matches, match position should bump ahead 2198 // to prevent loops. 2199 // 2200 { 2201 int32_t i; 2202 UErrorCode status=U_ZERO_ERROR; 2203 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2204 // using an always-true look-ahead. 
2205 REGEX_CHECK_STATUS; 2206 UText s = UTEXT_INITIALIZER; 2207 utext_openUTF8(&s, " ", -1, &status); 2208 m.reset(&s); 2209 for (i=0; ; i++) { 2210 if (m.find() == FALSE) { 2211 break; 2212 } 2213 REGEX_ASSERT(m.start(status) == i); 2214 REGEX_ASSERT(m.end(status) == i); 2215 } 2216 REGEX_ASSERT(i==5); 2217 2218 // Check that the bump goes over characters outside the BMP OK 2219 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2220 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2221 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2222 m.reset(&s); 2223 for (i=0; ; i+=4) { 2224 if (m.find() == FALSE) { 2225 break; 2226 } 2227 REGEX_ASSERT(m.start(status) == i); 2228 REGEX_ASSERT(m.end(status) == i); 2229 } 2230 REGEX_ASSERT(i==20); 2231 2232 utext_close(&s); 2233 } 2234 { 2235 // find() loop breaking test. 2236 // with pattern of /.?/, should see a series of one char matches, then a single 2237 // match of zero length at the end of the input string. 2238 int32_t i; 2239 UErrorCode status=U_ZERO_ERROR; 2240 RegexMatcher m(".?", 0, status); 2241 REGEX_CHECK_STATUS; 2242 UText s = UTEXT_INITIALIZER; 2243 utext_openUTF8(&s, " ", -1, &status); 2244 m.reset(&s); 2245 for (i=0; ; i++) { 2246 if (m.find() == FALSE) { 2247 break; 2248 } 2249 REGEX_ASSERT(m.start(status) == i); 2250 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2251 } 2252 REGEX_ASSERT(i==5); 2253 2254 utext_close(&s); 2255 } 2256 2257 2258 // 2259 // Matchers with no input string behave as if they had an empty input string. 
2260 // 2261 2262 { 2263 UErrorCode status = U_ZERO_ERROR; 2264 RegexMatcher m(".?", 0, status); 2265 REGEX_CHECK_STATUS; 2266 REGEX_ASSERT(m.find()); 2267 REGEX_ASSERT(m.start(status) == 0); 2268 REGEX_ASSERT(m.input() == ""); 2269 } 2270 { 2271 UErrorCode status = U_ZERO_ERROR; 2272 RegexPattern *p = RegexPattern::compile(".", 0, status); 2273 RegexMatcher *m = p->matcher(status); 2274 REGEX_CHECK_STATUS; 2275 2276 REGEX_ASSERT(m->find() == FALSE); 2277 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2278 delete m; 2279 delete p; 2280 } 2281 2282 // 2283 // Regions 2284 // 2285 { 2286 UErrorCode status = U_ZERO_ERROR; 2287 UText testPattern = UTEXT_INITIALIZER; 2288 UText testText = UTEXT_INITIALIZER; 2289 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2290 REGEX_VERBOSE_TEXT(&testPattern); 2291 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2292 REGEX_VERBOSE_TEXT(&testText); 2293 2294 RegexMatcher m(&testPattern, &testText, 0, status); 2295 REGEX_CHECK_STATUS; 2296 REGEX_ASSERT(m.regionStart() == 0); 2297 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2298 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2299 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2300 2301 m.region(2,4, status); 2302 REGEX_CHECK_STATUS; 2303 REGEX_ASSERT(m.matches(status)); 2304 REGEX_ASSERT(m.start(status)==2); 2305 REGEX_ASSERT(m.end(status)==4); 2306 REGEX_CHECK_STATUS; 2307 2308 m.reset(); 2309 REGEX_ASSERT(m.regionStart() == 0); 2310 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2311 2312 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2313 REGEX_VERBOSE_TEXT(&testText); 2314 m.reset(&testText); 2315 REGEX_ASSERT(m.regionStart() == 0); 2316 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2317 2318 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2319 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2320 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2321 
REGEX_ASSERT(&m == &m.reset()); 2322 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2323 2324 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2325 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2326 REGEX_ASSERT(&m == &m.reset()); 2327 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2328 2329 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2330 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2331 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2332 REGEX_ASSERT(&m == &m.reset()); 2333 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2334 2335 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2336 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2337 REGEX_ASSERT(&m == &m.reset()); 2338 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2339 2340 utext_close(&testText); 2341 utext_close(&testPattern); 2342 } 2343 2344 // 2345 // hitEnd() and requireEnd() 2346 // 2347 { 2348 UErrorCode status = U_ZERO_ERROR; 2349 UText testPattern = UTEXT_INITIALIZER; 2350 UText testText = UTEXT_INITIALIZER; 2351 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2352 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2353 utext_openUTF8(&testPattern, str_, -1, &status); 2354 utext_openUTF8(&testText, str_aabb, -1, &status); 2355 2356 RegexMatcher m1(&testPattern, &testText, 0, status); 2357 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2358 REGEX_ASSERT(m1.hitEnd() == TRUE); 2359 REGEX_ASSERT(m1.requireEnd() == FALSE); 2360 REGEX_CHECK_STATUS; 2361 2362 status = U_ZERO_ERROR; 2363 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2364 utext_openUTF8(&testPattern, str_a, -1, &status); 2365 RegexMatcher m2(&testPattern, &testText, 0, status); 2366 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2367 REGEX_ASSERT(m2.hitEnd() == FALSE); 2368 REGEX_ASSERT(m2.requireEnd() == FALSE); 2369 REGEX_CHECK_STATUS; 2370 2371 status = U_ZERO_ERROR; 2372 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2373 utext_openUTF8(&testPattern, str_dotstardollar, -1, 
&status); 2374 RegexMatcher m3(&testPattern, &testText, 0, status); 2375 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2376 REGEX_ASSERT(m3.hitEnd() == TRUE); 2377 REGEX_ASSERT(m3.requireEnd() == TRUE); 2378 REGEX_CHECK_STATUS; 2379 2380 utext_close(&testText); 2381 utext_close(&testPattern); 2382 } 2383 } 2384 2385 2386 //--------------------------------------------------------------------------- 2387 // 2388 // API_Replace_UTF8 API test for class RegexMatcher, testing the 2389 // Replace family of functions. 2390 // 2391 //--------------------------------------------------------------------------- 2392 void RegexTest::API_Replace_UTF8() { 2393 // 2394 // Replace 2395 // 2396 int32_t flags=0; 2397 UParseError pe; 2398 UErrorCode status=U_ZERO_ERROR; 2399 2400 UText re=UTEXT_INITIALIZER; 2401 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 2402 REGEX_VERBOSE_TEXT(&re); 2403 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2404 REGEX_CHECK_STATUS; 2405 2406 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2407 // 012345678901234567 2408 UText dataText = UTEXT_INITIALIZER; 2409 utext_openUTF8(&dataText, data, -1, &status); 2410 REGEX_CHECK_STATUS; 2411 REGEX_VERBOSE_TEXT(&dataText); 2412 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText); 2413 2414 // 2415 // Plain vanilla matches. 
2416 // 2417 UnicodeString dest; 2418 UText destText = UTEXT_INITIALIZER; 2419 utext_openUnicodeString(&destText, &dest, &status); 2420 UText *result; 2421 2422 UText replText = UTEXT_INITIALIZER; 2423 2424 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ 2425 utext_openUTF8(&replText, str_yz, -1, &status); 2426 REGEX_VERBOSE_TEXT(&replText); 2427 result = matcher->replaceFirst(&replText, NULL, status); 2428 REGEX_CHECK_STATUS; 2429 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ 2430 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2431 utext_close(result); 2432 result = matcher->replaceFirst(&replText, &destText, status); 2433 REGEX_CHECK_STATUS; 2434 REGEX_ASSERT(result == &destText); 2435 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2436 2437 result = matcher->replaceAll(&replText, NULL, status); 2438 REGEX_CHECK_STATUS; 2439 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ 2440 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2441 utext_close(result); 2442 2443 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2444 result = matcher->replaceAll(&replText, &destText, status); 2445 REGEX_CHECK_STATUS; 2446 REGEX_ASSERT(result == &destText); 2447 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2448 2449 // 2450 // Plain vanilla non-matches. 2451 // 2452 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. 
*/ 2453 utext_openUTF8(&dataText, str_abxabxabx, -1, &status); 2454 matcher->reset(&dataText); 2455 2456 result = matcher->replaceFirst(&replText, NULL, status); 2457 REGEX_CHECK_STATUS; 2458 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2459 utext_close(result); 2460 result = matcher->replaceFirst(&replText, &destText, status); 2461 REGEX_CHECK_STATUS; 2462 REGEX_ASSERT(result == &destText); 2463 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2464 2465 result = matcher->replaceAll(&replText, NULL, status); 2466 REGEX_CHECK_STATUS; 2467 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2468 utext_close(result); 2469 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2470 result = matcher->replaceAll(&replText, &destText, status); 2471 REGEX_CHECK_STATUS; 2472 REGEX_ASSERT(result == &destText); 2473 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2474 2475 // 2476 // Empty source string 2477 // 2478 utext_openUTF8(&dataText, NULL, 0, &status); 2479 matcher->reset(&dataText); 2480 2481 result = matcher->replaceFirst(&replText, NULL, status); 2482 REGEX_CHECK_STATUS; 2483 REGEX_ASSERT_UTEXT_UTF8("", result); 2484 utext_close(result); 2485 result = matcher->replaceFirst(&replText, &destText, status); 2486 REGEX_CHECK_STATUS; 2487 REGEX_ASSERT(result == &destText); 2488 REGEX_ASSERT_UTEXT_UTF8("", result); 2489 2490 result = matcher->replaceAll(&replText, NULL, status); 2491 REGEX_CHECK_STATUS; 2492 REGEX_ASSERT_UTEXT_UTF8("", result); 2493 utext_close(result); 2494 result = matcher->replaceAll(&replText, &destText, status); 2495 REGEX_CHECK_STATUS; 2496 REGEX_ASSERT(result == &destText); 2497 REGEX_ASSERT_UTEXT_UTF8("", result); 2498 2499 // 2500 // Empty substitution string 2501 // 2502 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 
2503 matcher->reset(&dataText); 2504 2505 utext_openUTF8(&replText, NULL, 0, &status); 2506 result = matcher->replaceFirst(&replText, NULL, status); 2507 REGEX_CHECK_STATUS; 2508 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ 2509 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2510 utext_close(result); 2511 result = matcher->replaceFirst(&replText, &destText, status); 2512 REGEX_CHECK_STATUS; 2513 REGEX_ASSERT(result == &destText); 2514 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2515 2516 result = matcher->replaceAll(&replText, NULL, status); 2517 REGEX_CHECK_STATUS; 2518 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ 2519 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2520 utext_close(result); 2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2522 result = matcher->replaceAll(&replText, &destText, status); 2523 REGEX_CHECK_STATUS; 2524 REGEX_ASSERT(result == &destText); 2525 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2526 2527 // 2528 // match whole string 2529 // 2530 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2531 utext_openUTF8(&dataText, str_abc, -1, &status); 2532 matcher->reset(&dataText); 2533 2534 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ 2535 utext_openUTF8(&replText, str_xyz, -1, &status); 2536 result = matcher->replaceFirst(&replText, NULL, status); 2537 REGEX_CHECK_STATUS; 2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2539 utext_close(result); 2540 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2541 result = matcher->replaceFirst(&replText, &destText, status); 2542 REGEX_CHECK_STATUS; 2543 REGEX_ASSERT(result == &destText); 2544 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2545 2546 result = matcher->replaceAll(&replText, NULL, status); 2547 REGEX_CHECK_STATUS; 2548 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2549 
utext_close(result); 2550 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2551 result = matcher->replaceAll(&replText, &destText, status); 2552 REGEX_CHECK_STATUS; 2553 REGEX_ASSERT(result == &destText); 2554 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2555 2556 // 2557 // Capture Group, simple case 2558 // 2559 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ 2560 utext_openUTF8(&re, str_add, -1, &status); 2561 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); 2562 REGEX_CHECK_STATUS; 2563 2564 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ 2565 utext_openUTF8(&dataText, str_abcdefg, -1, &status); 2566 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); 2567 REGEX_CHECK_STATUS; 2568 2569 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ 2570 utext_openUTF8(&replText, str_11, -1, &status); 2571 result = matcher2->replaceFirst(&replText, NULL, status); 2572 REGEX_CHECK_STATUS; 2573 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ 2574 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2575 utext_close(result); 2576 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2577 result = matcher2->replaceFirst(&replText, &destText, status); 2578 REGEX_CHECK_STATUS; 2579 REGEX_ASSERT(result == &destText); 2580 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2581 2582 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. 
*/ 2583 utext_openUTF8(&replText, str_v, -1, &status); 2584 REGEX_VERBOSE_TEXT(&replText); 2585 result = matcher2->replaceFirst(&replText, NULL, status); 2586 REGEX_CHECK_STATUS; 2587 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ 2588 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2589 utext_close(result); 2590 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2591 result = matcher2->replaceFirst(&replText, &destText, status); 2592 REGEX_CHECK_STATUS; 2593 REGEX_ASSERT(result == &destText); 2594 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2595 2596 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */ 2597 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2598 result = matcher2->replaceFirst(&replText, NULL, status); 2599 REGEX_CHECK_STATUS; 2600 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2601 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2602 utext_close(result); 2603 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2604 result = matcher2->replaceFirst(&replText, &destText, status); 2605 REGEX_CHECK_STATUS; 2606 REGEX_ASSERT(result == &destText); 2607 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2608 2609 unsigned char supplDigitChars[] = { 0x53, 0x75, 
0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */ 2610 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2611 // 012345678901234567890123456 2612 supplDigitChars[22] = 0xF0; 2613 supplDigitChars[23] = 0x9D; 2614 supplDigitChars[24] = 0x9F; 2615 supplDigitChars[25] = 0x8F; 2616 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2617 2618 result = matcher2->replaceFirst(&replText, NULL, status); 2619 REGEX_CHECK_STATUS; 2620 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ 2621 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2622 utext_close(result); 2623 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2624 result = matcher2->replaceFirst(&replText, &destText, status); 2625 REGEX_CHECK_STATUS; 2626 REGEX_ASSERT(result == &destText); 2627 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2628 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." 
*/ 2629 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); 2630 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2632 utext_close(result); 2633 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2634 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2635 REGEX_ASSERT(result == &destText); 2636 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2637 2638 // 2639 // Replacement String with \u hex escapes 2640 // 2641 { 2642 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ 2643 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ 2644 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); 2645 utext_openUTF8(&replText, str_u0043, -1, &status); 2646 matcher->reset(&dataText); 2647 2648 result = matcher->replaceAll(&replText, NULL, status); 2649 REGEX_CHECK_STATUS; 2650 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ 2651 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2652 utext_close(result); 2653 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2654 result = matcher->replaceAll(&replText, &destText, status); 2655 REGEX_CHECK_STATUS; 2656 REGEX_ASSERT(result == &destText); 2657 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2658 } 2659 { 2660 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! 
*/ 2661 utext_openUTF8(&dataText, str_abc, -1, &status); 2662 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */ 2663 utext_openUTF8(&replText, str_U00010000, -1, &status); 2664 matcher->reset(&dataText); 2665 2666 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" 2667 // 0123456789 2668 expected[2] = 0xF0; 2669 expected[3] = 0x90; 2670 expected[4] = 0x80; 2671 expected[5] = 0x80; 2672 2673 result = matcher->replaceAll(&replText, NULL, status); 2674 REGEX_CHECK_STATUS; 2675 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2676 utext_close(result); 2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2678 result = matcher->replaceAll(&replText, &destText, status); 2679 REGEX_CHECK_STATUS; 2680 REGEX_ASSERT(result == &destText); 2681 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2682 } 2683 // TODO: need more through testing of capture substitutions. 
2684 2685 // Bug 4057 2686 // 2687 { 2688 status = U_ZERO_ERROR; 2689 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */ 2690 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */ 2691 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ 2692 utext_openUTF8(&re, str_ssee, -1, &status); 2693 utext_openUTF8(&dataText, str_blah, -1, &status); 2694 utext_openUTF8(&replText, str_ooh, -1, &status); 2695 2696 RegexMatcher m(&re, 0, status); 2697 REGEX_CHECK_STATUS; 2698 2699 UnicodeString result; 2700 UText resultText = UTEXT_INITIALIZER; 2701 utext_openUnicodeString(&resultText, &result, &status); 2702 2703 // Multiple finds do NOT bump up the previous appendReplacement postion. 2704 m.reset(&dataText); 2705 m.find(); 2706 m.find(); 2707 m.appendReplacement(&resultText, &replText, status); 2708 REGEX_CHECK_STATUS; 2709 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2710 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText); 2711 2712 // After a reset into the interior of a string, appendReplacement still starts at beginning. 
2713 status = U_ZERO_ERROR; 2714 result.truncate(0); 2715 utext_openUnicodeString(&resultText, &result, &status); 2716 m.reset(10, status); 2717 m.find(); 2718 m.find(); 2719 m.appendReplacement(&resultText, &replText, status); 2720 REGEX_CHECK_STATUS; 2721 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2722 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText); 2723 2724 // find() at interior of string, appendReplacement still starts at beginning. 2725 status = U_ZERO_ERROR; 2726 result.truncate(0); 2727 utext_openUnicodeString(&resultText, &result, &status); 2728 m.reset(); 2729 m.find(10, status); 2730 m.find(); 2731 m.appendReplacement(&resultText, &replText, status); 2732 REGEX_CHECK_STATUS; 2733 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2734 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText); 2735 2736 m.appendTail(&resultText, status); 2737 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ 2738 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); 2739 2740 utext_close(&resultText); 2741 } 2742 2743 delete matcher2; 2744 delete 
pat2; 2745 delete matcher; 2746 delete pat; 2747 2748 utext_close(&dataText); 2749 utext_close(&replText); 2750 utext_close(&destText); 2751 utext_close(&re); 2752 } 2753 2754 2755 //--------------------------------------------------------------------------- 2756 // 2757 // API_Pattern_UTF8 Test that the API for class RegexPattern is 2758 // present and nominally working. 2759 // 2760 //--------------------------------------------------------------------------- 2761 void RegexTest::API_Pattern_UTF8() { 2762 RegexPattern pata; // Test default constructor to not crash. 2763 RegexPattern patb; 2764 2765 REGEX_ASSERT(pata == patb); 2766 REGEX_ASSERT(pata == pata); 2767 2768 UText re1 = UTEXT_INITIALIZER; 2769 UText re2 = UTEXT_INITIALIZER; 2770 UErrorCode status = U_ZERO_ERROR; 2771 UParseError pe; 2772 2773 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ 2774 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ 2775 utext_openUTF8(&re1, str_abcalmz, -1, &status); 2776 utext_openUTF8(&re2, str_def, -1, &status); 2777 2778 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2779 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2780 REGEX_CHECK_STATUS; 2781 REGEX_ASSERT(*pat1 == *pat1); 2782 REGEX_ASSERT(*pat1 != pata); 2783 2784 // Assign 2785 patb = *pat1; 2786 REGEX_ASSERT(patb == *pat1); 2787 2788 // Copy Construct 2789 RegexPattern patc(*pat1); 2790 REGEX_ASSERT(patc == *pat1); 2791 REGEX_ASSERT(patb == patc); 2792 REGEX_ASSERT(pat1 != pat2); 2793 patb = *pat2; 2794 REGEX_ASSERT(patb != patc); 2795 REGEX_ASSERT(patb == *pat2); 2796 2797 // Compile with no flags. 
2798 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2799 REGEX_ASSERT(*pat1a == *pat1); 2800 2801 REGEX_ASSERT(pat1a->flags() == 0); 2802 2803 // Compile with different flags should be not equal 2804 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2805 REGEX_CHECK_STATUS; 2806 2807 REGEX_ASSERT(*pat1b != *pat1a); 2808 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2809 REGEX_ASSERT(pat1a->flags() == 0); 2810 delete pat1b; 2811 2812 // clone 2813 RegexPattern *pat1c = pat1->clone(); 2814 REGEX_ASSERT(*pat1c == *pat1); 2815 REGEX_ASSERT(*pat1c != *pat2); 2816 2817 delete pat1c; 2818 delete pat1a; 2819 delete pat1; 2820 delete pat2; 2821 2822 utext_close(&re1); 2823 utext_close(&re2); 2824 2825 2826 // 2827 // Verify that a matcher created from a cloned pattern works. 2828 // (Jitterbug 3423) 2829 // 2830 { 2831 UErrorCode status = U_ZERO_ERROR; 2832 UText pattern = UTEXT_INITIALIZER; 2833 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2834 utext_openUTF8(&pattern, str_pL, -1, &status); 2835 2836 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2837 RegexPattern *pClone = pSource->clone(); 2838 delete pSource; 2839 RegexMatcher *mFromClone = pClone->matcher(status); 2840 REGEX_CHECK_STATUS; 2841 2842 UText input = UTEXT_INITIALIZER; 2843 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2844 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2845 mFromClone->reset(&input); 2846 REGEX_ASSERT(mFromClone->find() == TRUE); 2847 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2848 REGEX_ASSERT(mFromClone->find() == TRUE); 2849 REGEX_ASSERT(mFromClone->group(status) == "World"); 2850 REGEX_ASSERT(mFromClone->find() == FALSE); 2851 delete mFromClone; 2852 delete pClone; 2853 2854 utext_close(&input); 2855 utext_close(&pattern); 2856 } 2857 2858 // 2859 // matches convenience API 
2860 // 2861 { 2862 UErrorCode status = U_ZERO_ERROR; 2863 UText pattern = UTEXT_INITIALIZER; 2864 UText input = UTEXT_INITIALIZER; 2865 2866 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2867 utext_openUTF8(&input, str_randominput, -1, &status); 2868 2869 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2870 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2871 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2872 REGEX_CHECK_STATUS; 2873 2874 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2875 utext_openUTF8(&pattern, str_abc, -1, &status); 2876 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2877 REGEX_CHECK_STATUS; 2878 2879 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2880 utext_openUTF8(&pattern, str_nput, -1, &status); 2881 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2882 REGEX_CHECK_STATUS; 2883 2884 utext_openUTF8(&pattern, str_randominput, -1, &status); 2885 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2886 REGEX_CHECK_STATUS; 2887 2888 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2889 utext_openUTF8(&pattern, str_u, -1, &status); 2890 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2891 REGEX_CHECK_STATUS; 2892 2893 utext_openUTF8(&input, str_abc, -1, &status); 2894 utext_openUTF8(&pattern, str_abc, -1, &status); 2895 status = U_INDEX_OUTOFBOUNDS_ERROR; 2896 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2897 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2898 2899 utext_close(&input); 2900 utext_close(&pattern); 2901 } 2902 2903 2904 // 2905 // Split() 2906 // 2907 status = U_ZERO_ERROR; 2908 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2909 utext_openUTF8(&re1, str_spaceplus, 
-1, &status); 2910 pat1 = RegexPattern::compile(&re1, pe, status); 2911 REGEX_CHECK_STATUS; 2912 UnicodeString fields[10]; 2913 2914 int32_t n; 2915 n = pat1->split("Now is the time", fields, 10, status); 2916 REGEX_CHECK_STATUS; 2917 REGEX_ASSERT(n==4); 2918 REGEX_ASSERT(fields[0]=="Now"); 2919 REGEX_ASSERT(fields[1]=="is"); 2920 REGEX_ASSERT(fields[2]=="the"); 2921 REGEX_ASSERT(fields[3]=="time"); 2922 REGEX_ASSERT(fields[4]==""); 2923 2924 n = pat1->split("Now is the time", fields, 2, status); 2925 REGEX_CHECK_STATUS; 2926 REGEX_ASSERT(n==2); 2927 REGEX_ASSERT(fields[0]=="Now"); 2928 REGEX_ASSERT(fields[1]=="is the time"); 2929 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2930 2931 fields[1] = "*"; 2932 status = U_ZERO_ERROR; 2933 n = pat1->split("Now is the time", fields, 1, status); 2934 REGEX_CHECK_STATUS; 2935 REGEX_ASSERT(n==1); 2936 REGEX_ASSERT(fields[0]=="Now is the time"); 2937 REGEX_ASSERT(fields[1]=="*"); 2938 status = U_ZERO_ERROR; 2939 2940 n = pat1->split(" Now is the time ", fields, 10, status); 2941 REGEX_CHECK_STATUS; 2942 REGEX_ASSERT(n==6); 2943 REGEX_ASSERT(fields[0]==""); 2944 REGEX_ASSERT(fields[1]=="Now"); 2945 REGEX_ASSERT(fields[2]=="is"); 2946 REGEX_ASSERT(fields[3]=="the"); 2947 REGEX_ASSERT(fields[4]=="time"); 2948 REGEX_ASSERT(fields[5]==""); 2949 REGEX_ASSERT(fields[6]==""); 2950 2951 fields[2] = "*"; 2952 n = pat1->split(" ", fields, 10, status); 2953 REGEX_CHECK_STATUS; 2954 REGEX_ASSERT(n==2); 2955 REGEX_ASSERT(fields[0]==""); 2956 REGEX_ASSERT(fields[1]==""); 2957 REGEX_ASSERT(fields[2]=="*"); 2958 2959 fields[0] = "foo"; 2960 n = pat1->split("", fields, 10, status); 2961 REGEX_CHECK_STATUS; 2962 REGEX_ASSERT(n==0); 2963 REGEX_ASSERT(fields[0]=="foo"); 2964 2965 delete pat1; 2966 2967 // split, with a pattern with (capture) 2968 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 2969 pat1 = RegexPattern::compile(&re1, pe, status); 2970 REGEX_CHECK_STATUS; 2971 2972 status = U_ZERO_ERROR; 2973 
fields[6] = fields[7] = "*"; 2974 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 2975 REGEX_CHECK_STATUS; 2976 REGEX_ASSERT(n==7); 2977 REGEX_ASSERT(fields[0]==""); 2978 REGEX_ASSERT(fields[1]=="a"); 2979 REGEX_ASSERT(fields[2]=="Now is "); 2980 REGEX_ASSERT(fields[3]=="b"); 2981 REGEX_ASSERT(fields[4]=="the time"); 2982 REGEX_ASSERT(fields[5]=="c"); 2983 REGEX_ASSERT(fields[6]==""); 2984 REGEX_ASSERT(fields[7]=="*"); 2985 REGEX_ASSERT(status==U_ZERO_ERROR); 2986 2987 fields[6] = fields[7] = "*"; 2988 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 2989 REGEX_CHECK_STATUS; 2990 REGEX_ASSERT(n==7); 2991 REGEX_ASSERT(fields[0]==" "); 2992 REGEX_ASSERT(fields[1]=="a"); 2993 REGEX_ASSERT(fields[2]=="Now is "); 2994 REGEX_ASSERT(fields[3]=="b"); 2995 REGEX_ASSERT(fields[4]=="the time"); 2996 REGEX_ASSERT(fields[5]=="c"); 2997 REGEX_ASSERT(fields[6]==""); 2998 REGEX_ASSERT(fields[7]=="*"); 2999 3000 status = U_ZERO_ERROR; 3001 fields[6] = "foo"; 3002 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 3003 REGEX_CHECK_STATUS; 3004 REGEX_ASSERT(n==6); 3005 REGEX_ASSERT(fields[0]==" "); 3006 REGEX_ASSERT(fields[1]=="a"); 3007 REGEX_ASSERT(fields[2]=="Now is "); 3008 REGEX_ASSERT(fields[3]=="b"); 3009 REGEX_ASSERT(fields[4]=="the time"); 3010 REGEX_ASSERT(fields[5]==" "); 3011 REGEX_ASSERT(fields[6]=="foo"); 3012 3013 status = U_ZERO_ERROR; 3014 fields[5] = "foo"; 3015 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3016 REGEX_CHECK_STATUS; 3017 REGEX_ASSERT(n==5); 3018 REGEX_ASSERT(fields[0]==" "); 3019 REGEX_ASSERT(fields[1]=="a"); 3020 REGEX_ASSERT(fields[2]=="Now is "); 3021 REGEX_ASSERT(fields[3]=="b"); 3022 REGEX_ASSERT(fields[4]=="the time<c>"); 3023 REGEX_ASSERT(fields[5]=="foo"); 3024 3025 status = U_ZERO_ERROR; 3026 fields[5] = "foo"; 3027 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3028 REGEX_CHECK_STATUS; 3029 REGEX_ASSERT(n==5); 3030 REGEX_ASSERT(fields[0]==" "); 3031 
REGEX_ASSERT(fields[1]=="a"); 3032 REGEX_ASSERT(fields[2]=="Now is "); 3033 REGEX_ASSERT(fields[3]=="b"); 3034 REGEX_ASSERT(fields[4]=="the time"); 3035 REGEX_ASSERT(fields[5]=="foo"); 3036 3037 status = U_ZERO_ERROR; 3038 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3039 REGEX_CHECK_STATUS; 3040 REGEX_ASSERT(n==4); 3041 REGEX_ASSERT(fields[0]==" "); 3042 REGEX_ASSERT(fields[1]=="a"); 3043 REGEX_ASSERT(fields[2]=="Now is "); 3044 REGEX_ASSERT(fields[3]=="the time<c>"); 3045 status = U_ZERO_ERROR; 3046 delete pat1; 3047 3048 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3049 pat1 = RegexPattern::compile(&re1, pe, status); 3050 REGEX_CHECK_STATUS; 3051 n = pat1->split("1-10,20", fields, 10, status); 3052 REGEX_CHECK_STATUS; 3053 REGEX_ASSERT(n==5); 3054 REGEX_ASSERT(fields[0]=="1"); 3055 REGEX_ASSERT(fields[1]=="-"); 3056 REGEX_ASSERT(fields[2]=="10"); 3057 REGEX_ASSERT(fields[3]==","); 3058 REGEX_ASSERT(fields[4]=="20"); 3059 delete pat1; 3060 3061 3062 // 3063 // RegexPattern::pattern() and patternText() 3064 // 3065 pat1 = new RegexPattern(); 3066 REGEX_ASSERT(pat1->pattern() == ""); 3067 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3068 delete pat1; 3069 const char *helloWorldInvariant = "(Hello, world)*"; 3070 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3071 pat1 = RegexPattern::compile(&re1, pe, status); 3072 REGEX_CHECK_STATUS; 3073 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*"); 3074 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3075 delete pat1; 3076 3077 utext_close(&re1); 3078 } 3079 3080 3081 //--------------------------------------------------------------------------- 3082 // 3083 // Extended A more thorough check for features of regex patterns 3084 // The test cases are in a separate data file, 3085 // source/tests/testdata/regextst.txt 3086 // A description of the test data format is included in that file. 
3087 // 3088 //--------------------------------------------------------------------------- 3089 3090 const char * 3091 RegexTest::getPath(char buffer[2048], const char *filename) { 3092 UErrorCode status=U_ZERO_ERROR; 3093 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3094 if (U_FAILURE(status)) { 3095 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3096 return NULL; 3097 } 3098 3099 strcpy(buffer, testDataDirectory); 3100 strcat(buffer, filename); 3101 return buffer; 3102 } 3103 3104 void RegexTest::Extended() { 3105 char tdd[2048]; 3106 const char *srcPath; 3107 UErrorCode status = U_ZERO_ERROR; 3108 int32_t lineNum = 0; 3109 3110 // 3111 // Open and read the test data file. 3112 // 3113 srcPath=getPath(tdd, "regextst.txt"); 3114 if(srcPath==NULL) { 3115 return; /* something went wrong, error already output */ 3116 } 3117 3118 int32_t len; 3119 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3120 if (U_FAILURE(status)) { 3121 return; /* something went wrong, error already output */ 3122 } 3123 3124 // 3125 // Put the test data into a UnicodeString 3126 // 3127 UnicodeString testString(FALSE, testData, len); 3128 3129 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3130 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3131 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3132 3133 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3134 UnicodeString testPattern; // The pattern for test from the test file. 3135 UnicodeString testFlags; // the flags for a test. 
3136 UnicodeString matchString; // The marked up string to be used as input 3137 3138 if (U_FAILURE(status)){ 3139 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); 3140 delete [] testData; 3141 return; 3142 } 3143 3144 // 3145 // Loop over the test data file, once per line. 3146 // 3147 while (lineMat.find()) { 3148 lineNum++; 3149 if (U_FAILURE(status)) { 3150 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3151 } 3152 3153 status = U_ZERO_ERROR; 3154 UnicodeString testLine = lineMat.group(1, status); 3155 if (testLine.length() == 0) { 3156 continue; 3157 } 3158 3159 // 3160 // Parse the test line. Skip blank and comment only lines. 3161 // Separate out the three main fields - pattern, flags, target. 3162 // 3163 3164 commentMat.reset(testLine); 3165 if (commentMat.lookingAt(status)) { 3166 // This line is a comment, or blank. 3167 continue; 3168 } 3169 3170 // 3171 // Pull out the pattern field, remove it from the test file line. 3172 // 3173 quotedStuffMat.reset(testLine); 3174 if (quotedStuffMat.lookingAt(status)) { 3175 testPattern = quotedStuffMat.group(2, status); 3176 testLine.remove(0, quotedStuffMat.end(0, status)); 3177 } else { 3178 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3179 continue; 3180 } 3181 3182 3183 // 3184 // Pull out the flags from the test file line. 3185 // 3186 flagsMat.reset(testLine); 3187 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3188 testFlags = flagsMat.group(1, status); 3189 if (flagsMat.group(2, status).length() > 0) { 3190 errln("Bad Match flag at line %d. Scanning %c\n", 3191 lineNum, flagsMat.group(2, status).charAt(0)); 3192 continue; 3193 } 3194 testLine.remove(0, flagsMat.end(0, status)); 3195 3196 // 3197 // Pull out the match string, as a whole. 3198 // We'll process the <tags> later. 
3199 // 3200 quotedStuffMat.reset(testLine); 3201 if (quotedStuffMat.lookingAt(status)) { 3202 matchString = quotedStuffMat.group(2, status); 3203 testLine.remove(0, quotedStuffMat.end(0, status)); 3204 } else { 3205 errln("Bad match string at test file line %d", lineNum); 3206 continue; 3207 } 3208 3209 // 3210 // The only thing left from the input line should be an optional trailing comment. 3211 // 3212 commentMat.reset(testLine); 3213 if (commentMat.lookingAt(status) == FALSE) { 3214 errln("Line %d: unexpected characters at end of test line.", lineNum); 3215 continue; 3216 } 3217 3218 // 3219 // Run the test 3220 // 3221 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3222 } 3223 3224 delete [] testData; 3225 3226 } 3227 3228 3229 3230 //--------------------------------------------------------------------------- 3231 // 3232 // regex_find(pattern, flags, inputString, lineNumber) 3233 // 3234 // Function to run a single test from the Extended (data driven) tests. 3235 // See file test/testdata/regextst.txt for a description of the 3236 // pattern and inputString fields, and the allowed flags. 3237 // lineNumber is the source line in regextst.txt of the test. 3238 // 3239 //--------------------------------------------------------------------------- 3240 3241 3242 // Set a value into a UVector at position specified by a decimal number in 3243 // a UnicodeString. This is a utility function needed by the actual test function, 3244 // which follows. 
3245 static void set(UVector &vec, int32_t val, UnicodeString index) { 3246 UErrorCode status=U_ZERO_ERROR; 3247 int32_t idx = 0; 3248 for (int32_t i=0; i<index.length(); i++) { 3249 int32_t d=u_charDigitValue(index.charAt(i)); 3250 if (d<0) {return;} 3251 idx = idx*10 + d; 3252 } 3253 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3254 vec.setElementAt(val, idx); 3255 } 3256 3257 static void setInt(UVector &vec, int32_t val, int32_t idx) { 3258 UErrorCode status=U_ZERO_ERROR; 3259 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3260 vec.setElementAt(val, idx); 3261 } 3262 3263 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3264 { 3265 UBool couldFind = TRUE; 3266 UTEXT_SETNATIVEINDEX(utext, 0); 3267 int32_t i = 0; 3268 while (i < unistrOffset) { 3269 UChar32 c = UTEXT_NEXT32(utext); 3270 if (c != U_SENTINEL) { 3271 i += U16_LENGTH(c); 3272 } else { 3273 couldFind = FALSE; 3274 break; 3275 } 3276 } 3277 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3278 return couldFind; 3279 } 3280 3281 3282 void RegexTest::regex_find(const UnicodeString &pattern, 3283 const UnicodeString &flags, 3284 const UnicodeString &inputString, 3285 const char *srcPath, 3286 int32_t line) { 3287 UnicodeString unEscapedInput; 3288 UnicodeString deTaggedInput; 3289 3290 int32_t patternUTF8Length, inputUTF8Length; 3291 char *patternChars = NULL, *inputChars = NULL; 3292 UText patternText = UTEXT_INITIALIZER; 3293 UText inputText = UTEXT_INITIALIZER; 3294 UConverter *UTF8Converter = NULL; 3295 3296 UErrorCode status = U_ZERO_ERROR; 3297 UParseError pe; 3298 RegexPattern *parsePat = NULL; 3299 RegexMatcher *parseMatcher = NULL; 3300 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3301 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3302 UVector groupStarts(status); 3303 UVector groupEnds(status); 3304 UVector groupStartsUTF8(status); 3305 UVector groupEndsUTF8(status); 3306 UBool isMatch = FALSE, isUTF8Match = FALSE; 
3307 UBool failed = FALSE; 3308 int32_t numFinds; 3309 int32_t i; 3310 UBool useMatchesFunc = FALSE; 3311 UBool useLookingAtFunc = FALSE; 3312 int32_t regionStart = -1; 3313 int32_t regionEnd = -1; 3314 int32_t regionStartUTF8 = -1; 3315 int32_t regionEndUTF8 = -1; 3316 3317 3318 // 3319 // Compile the caller's pattern 3320 // 3321 uint32_t bflags = 0; 3322 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3323 bflags |= UREGEX_CASE_INSENSITIVE; 3324 } 3325 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3326 bflags |= UREGEX_COMMENTS; 3327 } 3328 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3329 bflags |= UREGEX_DOTALL; 3330 } 3331 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3332 bflags |= UREGEX_MULTILINE; 3333 } 3334 3335 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3336 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3337 } 3338 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3339 bflags |= UREGEX_UNIX_LINES; 3340 } 3341 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag 3342 bflags |= UREGEX_LITERAL; 3343 } 3344 3345 3346 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3347 if (status != U_ZERO_ERROR) { 3348 #if UCONFIG_NO_BREAK_ITERATION==1 3349 // 'v' test flag means that the test pattern should not compile if ICU was configured 3350 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3351 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3352 goto cleanupAndReturn; 3353 } 3354 #endif 3355 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3356 // Expected pattern compilation error. 3357 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3358 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3359 } 3360 goto cleanupAndReturn; 3361 } else { 3362 // Unexpected pattern compilation error. 
3363 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3364 goto cleanupAndReturn; 3365 } 3366 } 3367 3368 UTF8Converter = ucnv_open("UTF8", &status); 3369 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3370 3371 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3372 status = U_ZERO_ERROR; // buffer overflow 3373 patternChars = new char[patternUTF8Length+1]; 3374 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3375 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3376 3377 if (status == U_ZERO_ERROR) { 3378 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3379 3380 if (status != U_ZERO_ERROR) { 3381 #if UCONFIG_NO_BREAK_ITERATION==1 3382 // 'v' test flag means that the test pattern should not compile if ICU was configured 3383 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3384 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3385 goto cleanupAndReturn; 3386 } 3387 #endif 3388 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3389 // Expected pattern compilation error. 3390 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3391 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3392 } 3393 goto cleanupAndReturn; 3394 } else { 3395 // Unexpected pattern compilation error. 3396 errln("Line %d: error %s compiling pattern. 
(UTF8)", line, u_errorName(status)); 3397 goto cleanupAndReturn; 3398 } 3399 } 3400 } 3401 3402 if (UTF8Pattern == NULL) { 3403 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3404 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3405 status = U_ZERO_ERROR; 3406 } 3407 3408 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3409 callerPattern->dumpPattern(); 3410 } 3411 3412 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3413 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3414 goto cleanupAndReturn; 3415 } 3416 3417 3418 // 3419 // Number of times find() should be called on the test string, default to 1 3420 // 3421 numFinds = 1; 3422 for (i=2; i<=9; i++) { 3423 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3424 if (numFinds != 1) { 3425 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3426 goto cleanupAndReturn; 3427 } 3428 numFinds = i; 3429 } 3430 } 3431 3432 // 'M' flag. Use matches() instead of find() 3433 if (flags.indexOf((UChar)0x4d) >= 0) { 3434 useMatchesFunc = TRUE; 3435 } 3436 if (flags.indexOf((UChar)0x4c) >= 0) { 3437 useLookingAtFunc = TRUE; 3438 } 3439 3440 // 3441 // Find the tags in the input data, remove them, and record the group boundary 3442 // positions. 
3443 // 3444 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3445 REGEX_CHECK_STATUS_L(line); 3446 3447 unEscapedInput = inputString.unescape(); 3448 parseMatcher = parsePat->matcher(unEscapedInput, status); 3449 REGEX_CHECK_STATUS_L(line); 3450 while(parseMatcher->find()) { 3451 parseMatcher->appendReplacement(deTaggedInput, "", status); 3452 REGEX_CHECK_STATUS; 3453 UnicodeString groupNum = parseMatcher->group(2, status); 3454 if (groupNum == "r") { 3455 // <r> or </r>, a region specification within the string 3456 if (parseMatcher->group(1, status) == "/") { 3457 regionEnd = deTaggedInput.length(); 3458 } else { 3459 regionStart = deTaggedInput.length(); 3460 } 3461 } else { 3462 // <digits> or </digits>, a group match boundary tag. 3463 if (parseMatcher->group(1, status) == "/") { 3464 set(groupEnds, deTaggedInput.length(), groupNum); 3465 } else { 3466 set(groupStarts, deTaggedInput.length(), groupNum); 3467 } 3468 } 3469 } 3470 parseMatcher->appendTail(deTaggedInput); 3471 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3472 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3473 errln("mismatched <r> tags"); 3474 failed = TRUE; 3475 goto cleanupAndReturn; 3476 } 3477 3478 // 3479 // Configure the matcher according to the flags specified with this test. 
3480 // 3481 matcher = callerPattern->matcher(deTaggedInput, status); 3482 REGEX_CHECK_STATUS_L(line); 3483 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3484 matcher->setTrace(TRUE); 3485 } 3486 3487 if (UTF8Pattern != NULL) { 3488 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3489 status = U_ZERO_ERROR; // buffer overflow 3490 inputChars = new char[inputUTF8Length+1]; 3491 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3492 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3493 3494 if (status == U_ZERO_ERROR) { 3495 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3496 REGEX_CHECK_STATUS_L(line); 3497 } 3498 3499 if (UTF8Matcher == NULL) { 3500 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3501 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3502 status = U_ZERO_ERROR; 3503 } 3504 } 3505 3506 // 3507 // Generate native indices for UTF8 versions of region and capture group info 3508 // 3509 if (UTF8Matcher != NULL) { 3510 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3511 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3512 3513 // Fill out the native index UVector info. 3514 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3515 for (i=0; i<groupStarts.size(); i++) { 3516 int32_t start = groupStarts.elementAti(i); 3517 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3518 if (start >= 0) { 3519 int32_t startUTF8; 3520 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3521 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start); 3522 failed = TRUE; 3523 goto cleanupAndReturn; // Good chance of subsequent bogus errors. 
Stop now. 3524 } 3525 setInt(groupStartsUTF8, startUTF8, i); 3526 } 3527 3528 int32_t end = groupEnds.elementAti(i); 3529 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3530 if (end >= 0) { 3531 int32_t endUTF8; 3532 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3533 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3534 failed = TRUE; 3535 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3536 } 3537 setInt(groupEndsUTF8, endUTF8, i); 3538 } 3539 } 3540 } 3541 3542 if (regionStart>=0) { 3543 matcher->region(regionStart, regionEnd, status); 3544 REGEX_CHECK_STATUS_L(line); 3545 if (UTF8Matcher != NULL) { 3546 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3547 REGEX_CHECK_STATUS_L(line); 3548 } 3549 } 3550 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3551 matcher->useAnchoringBounds(FALSE); 3552 if (UTF8Matcher != NULL) { 3553 UTF8Matcher->useAnchoringBounds(FALSE); 3554 } 3555 } 3556 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3557 matcher->useTransparentBounds(TRUE); 3558 if (UTF8Matcher != NULL) { 3559 UTF8Matcher->useTransparentBounds(TRUE); 3560 } 3561 } 3562 3563 3564 3565 // 3566 // Do a find on the de-tagged input using the caller's pattern 3567 // TODO: error on count>1 and not find(). 3568 // error on both matches() and lookingAt(). 
3569 // 3570 for (i=0; i<numFinds; i++) { 3571 if (useMatchesFunc) { 3572 isMatch = matcher->matches(status); 3573 if (UTF8Matcher != NULL) { 3574 isUTF8Match = UTF8Matcher->matches(status); 3575 } 3576 } else if (useLookingAtFunc) { 3577 isMatch = matcher->lookingAt(status); 3578 if (UTF8Matcher != NULL) { 3579 isUTF8Match = UTF8Matcher->lookingAt(status); 3580 } 3581 } else { 3582 isMatch = matcher->find(); 3583 if (UTF8Matcher != NULL) { 3584 isUTF8Match = UTF8Matcher->find(); 3585 } 3586 } 3587 } 3588 matcher->setTrace(FALSE); 3589 if (U_FAILURE(status)) { 3590 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); 3591 } 3592 3593 // 3594 // Match up the groups from the find() with the groups from the tags 3595 // 3596 3597 // number of tags should match number of groups from find operation. 3598 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3599 // G option in test means that capture group data is not available in the 3600 // expected results, so the check needs to be suppressed. 3601 if (isMatch == FALSE && groupStarts.size() != 0) { 3602 dataerrln("Error at line %d: Match expected, but none found.", line); 3603 failed = TRUE; 3604 goto cleanupAndReturn; 3605 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3606 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3607 failed = TRUE; 3608 goto cleanupAndReturn; 3609 } 3610 3611 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3612 // Only check for match / no match. Don't check capture groups. 3613 if (isMatch && groupStarts.size() == 0) { 3614 errln("Error at line %d: No match expected, but one found.", line); 3615 failed = TRUE; 3616 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3617 errln("Error at line %d: No match expected, but one found. 
(UTF8)", line); 3618 failed = TRUE; 3619 } 3620 goto cleanupAndReturn; 3621 } 3622 3623 REGEX_CHECK_STATUS_L(line); 3624 for (i=0; i<=matcher->groupCount(); i++) { 3625 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 3626 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3627 if (matcher->start(i, status) != expectedStart) { 3628 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3629 line, i, expectedStart, matcher->start(i, status)); 3630 failed = TRUE; 3631 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3632 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3633 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3634 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3635 failed = TRUE; 3636 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3637 } 3638 3639 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3640 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3641 if (matcher->end(i, status) != expectedEnd) { 3642 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3643 line, i, expectedEnd, matcher->end(i, status)); 3644 failed = TRUE; 3645 // Error on end position; keep going; real error is probably yet to come as group 3646 // end positions work from end of the input data towards the front. 3647 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3648 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3649 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3650 failed = TRUE; 3651 // Error on end position; keep going; real error is probably yet to come as group 3652 // end positions work from end of the input data towards the front. 
3653 } 3654 } 3655 if ( matcher->groupCount()+1 < groupStarts.size()) { 3656 errln("Error at line %d: Expected %d capture groups, found %d.", 3657 line, groupStarts.size()-1, matcher->groupCount()); 3658 failed = TRUE; 3659 } 3660 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3661 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3662 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3663 failed = TRUE; 3664 } 3665 3666 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3667 matcher->requireEnd() == TRUE) { 3668 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3669 failed = TRUE; 3670 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3671 UTF8Matcher->requireEnd() == TRUE) { 3672 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3673 failed = TRUE; 3674 } 3675 3676 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3677 matcher->requireEnd() == FALSE) { 3678 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3679 failed = TRUE; 3680 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3681 UTF8Matcher->requireEnd() == FALSE) { 3682 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3683 failed = TRUE; 3684 } 3685 3686 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3687 matcher->hitEnd() == TRUE) { 3688 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3689 failed = TRUE; 3690 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3691 UTF8Matcher->hitEnd() == TRUE) { 3692 errln("Error at line %d: hitEnd() returned TRUE. 
Expected FALSE (UTF8)", line); 3693 failed = TRUE; 3694 } 3695 3696 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3697 matcher->hitEnd() == FALSE) { 3698 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3699 failed = TRUE; 3700 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3701 UTF8Matcher->hitEnd() == FALSE) { 3702 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3703 failed = TRUE; 3704 } 3705 3706 3707 cleanupAndReturn: 3708 if (failed) { 3709 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3710 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3711 // callerPattern->dump(); 3712 } 3713 delete parseMatcher; 3714 delete parsePat; 3715 delete UTF8Matcher; 3716 delete UTF8Pattern; 3717 delete matcher; 3718 delete callerPattern; 3719 3720 utext_close(&inputText); 3721 delete[] inputChars; 3722 utext_close(&patternText); 3723 delete[] patternChars; 3724 ucnv_close(UTF8Converter); 3725 } 3726 3727 3728 3729 3730 //--------------------------------------------------------------------------- 3731 // 3732 // Errors Check for error handling in patterns. 3733 // 3734 //--------------------------------------------------------------------------- 3735 void RegexTest::Errors() { 3736 // \escape sequences that aren't implemented yet. 
3737 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3738 3739 // Missing close parentheses 3740 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3741 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3742 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3743 3744 // Extra close paren 3745 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3746 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3747 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3748 3749 // Look-ahead, Look-behind 3750 // TODO: add tests for unbounded length look-behinds. 3751 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3752 3753 // Attempt to use non-default flags 3754 { 3755 UParseError pe; 3756 UErrorCode status = U_ZERO_ERROR; 3757 int32_t flags = UREGEX_CANON_EQ | 3758 UREGEX_COMMENTS | UREGEX_DOTALL | 3759 UREGEX_MULTILINE; 3760 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3761 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3762 delete pat1; 3763 } 3764 3765 3766 // Quantifiers are allowed only after something that can be quantified. 
3767 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3768 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3769 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3770 3771 // Mal-formed {min,max} quantifiers 3772 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3773 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3774 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3775 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3776 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3777 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3778 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3779 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3780 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3781 3782 // Ticket 5389 3783 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3784 3785 // Invalid Back Reference \0 3786 // For ICU 3.8 and earlier 3787 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3788 // 3789 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3790 3791 } 3792 3793 3794 //------------------------------------------------------------------------------- 3795 // 3796 // Read a text data file, convert it to UChars, and return the data 3797 // in one big UChar * buffer, which the caller must delete. 3798 // 3799 //-------------------------------------------------------------------------------- 3800 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3801 const char *defEncoding, UErrorCode &status) { 3802 UChar *retPtr = NULL; 3803 char *fileBuf = NULL; 3804 UConverter* conv = NULL; 3805 FILE *f = NULL; 3806 3807 ulen = 0; 3808 if (U_FAILURE(status)) { 3809 return retPtr; 3810 } 3811 3812 // 3813 // Open the file. 
3814 // 3815 f = fopen(fileName, "rb"); 3816 if (f == 0) { 3817 dataerrln("Error opening test data file %s\n", fileName); 3818 status = U_FILE_ACCESS_ERROR; 3819 return NULL; 3820 } 3821 // 3822 // Read it in 3823 // 3824 int32_t fileSize; 3825 int32_t amt_read; 3826 3827 fseek( f, 0, SEEK_END); 3828 fileSize = ftell(f); 3829 fileBuf = new char[fileSize]; 3830 fseek(f, 0, SEEK_SET); 3831 amt_read = fread(fileBuf, 1, fileSize, f); 3832 if (amt_read != fileSize || fileSize <= 0) { 3833 errln("Error reading test data file."); 3834 goto cleanUpAndReturn; 3835 } 3836 3837 // 3838 // Look for a Unicode Signature (BOM) on the data just read 3839 // 3840 int32_t signatureLength; 3841 const char * fileBufC; 3842 const char* encoding; 3843 3844 fileBufC = fileBuf; 3845 encoding = ucnv_detectUnicodeSignature( 3846 fileBuf, fileSize, &signatureLength, &status); 3847 if(encoding!=NULL ){ 3848 fileBufC += signatureLength; 3849 fileSize -= signatureLength; 3850 } else { 3851 encoding = defEncoding; 3852 if (strcmp(encoding, "utf-8") == 0) { 3853 errln("file %s is missing its BOM", fileName); 3854 } 3855 } 3856 3857 // 3858 // Open a converter to take the rule file to UTF-16 3859 // 3860 conv = ucnv_open(encoding, &status); 3861 if (U_FAILURE(status)) { 3862 goto cleanUpAndReturn; 3863 } 3864 3865 // 3866 // Convert the rules to UChar. 3867 // Preflight first to determine required buffer size. 3868 // 3869 ulen = ucnv_toUChars(conv, 3870 NULL, // dest, 3871 0, // destCapacity, 3872 fileBufC, 3873 fileSize, 3874 &status); 3875 if (status == U_BUFFER_OVERFLOW_ERROR) { 3876 // Buffer Overflow is expected from the preflight operation. 
3877 status = U_ZERO_ERROR; 3878 3879 retPtr = new UChar[ulen+1]; 3880 ucnv_toUChars(conv, 3881 retPtr, // dest, 3882 ulen+1, 3883 fileBufC, 3884 fileSize, 3885 &status); 3886 } 3887 3888 cleanUpAndReturn: 3889 fclose(f); 3890 delete[] fileBuf; 3891 ucnv_close(conv); 3892 if (U_FAILURE(status)) { 3893 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3894 delete []retPtr; 3895 retPtr = 0; 3896 ulen = 0; 3897 }; 3898 return retPtr; 3899 } 3900 3901 3902 //------------------------------------------------------------------------------- 3903 // 3904 // PerlTests - Run Perl's regular expression tests 3905 // The input file for this test is re_tests, the standard regular 3906 // expression test data distributed with the Perl source code. 3907 // 3908 // Here is Perl's description of the test data file: 3909 // 3910 // # The tests are in a separate file 't/op/re_tests'. 3911 // # Each line in that file is a separate test. 3912 // # There are five columns, separated by tabs. 3913 // # 3914 // # Column 1 contains the pattern, optionally enclosed in C<''>. 3915 // # Modifiers can be put after the closing C<'>. 3916 // # 3917 // # Column 2 contains the string to be matched. 3918 // # 3919 // # Column 3 contains the expected result: 3920 // # y expect a match 3921 // # n expect no match 3922 // # c expect an error 3923 // # B test exposes a known bug in Perl, should be skipped 3924 // # b test exposes a known bug in Perl, should be skipped if noamp 3925 // # 3926 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 3927 // # 3928 // # Column 4 contains a string, usually C<$&>. 3929 // # 3930 // # Column 5 contains the expected result of double-quote 3931 // # interpolating that string after the match, or start of error message. 3932 // # 3933 // # Column 6, if present, contains a reason why the test is skipped. 3934 // # This is printed with "skipped", for harness to pick up. 
3935 // # 3936 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 3937 // # 3938 // # If you want to add a regular expression test that can't be expressed 3939 // # in this format, don't add it here: put it in op/pat.t instead. 3940 // 3941 // For ICU, if field 3 contains an 'i', the test will be skipped. 3942 // The test exposes is some known incompatibility between ICU and Perl regexps. 3943 // (The i is in addition to whatever was there before.) 3944 // 3945 //------------------------------------------------------------------------------- 3946 void RegexTest::PerlTests() { 3947 char tdd[2048]; 3948 const char *srcPath; 3949 UErrorCode status = U_ZERO_ERROR; 3950 UParseError pe; 3951 3952 // 3953 // Open and read the test data file. 3954 // 3955 srcPath=getPath(tdd, "re_tests.txt"); 3956 if(srcPath==NULL) { 3957 return; /* something went wrong, error already output */ 3958 } 3959 3960 int32_t len; 3961 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3962 if (U_FAILURE(status)) { 3963 return; /* something went wrong, error already output */ 3964 } 3965 3966 // 3967 // Put the test data into a UnicodeString 3968 // 3969 UnicodeString testDataString(FALSE, testData, len); 3970 3971 // 3972 // Regex to break the input file into lines, and strip the new lines. 3973 // One line per match, capture group one is the desired data. 3974 // 3975 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 3976 if (U_FAILURE(status)) { 3977 dataerrln("RegexPattern::compile() error"); 3978 return; 3979 } 3980 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 3981 3982 // 3983 // Regex to split a test file line into fields. 3984 // There are six fields, separated by tabs. 3985 // 3986 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 3987 3988 // 3989 // Regex to identify test patterns with flag settings, and to separate them. 
3990 // Test patterns with flags look like 'pattern'i 3991 // Test patterns without flags are not quoted: pattern 3992 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 3993 // 3994 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 3995 RegexMatcher* flagMat = flagPat->matcher(status); 3996 3997 // 3998 // The Perl tests reference several perl-isms, which are evaluated/substituted 3999 // in the test data. Not being perl, this must be done explicitly. Here 4000 // are string constants and REs for these constructs. 4001 // 4002 UnicodeString nulnulSrc("${nulnul}"); 4003 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4004 nulnul = nulnul.unescape(); 4005 4006 UnicodeString ffffSrc("${ffff}"); 4007 UnicodeString ffff("\\uffff", -1, US_INV); 4008 ffff = ffff.unescape(); 4009 4010 // regexp for $-[0], $+[2], etc. 4011 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4012 RegexMatcher *groupsMat = groupsPat->matcher(status); 4013 4014 // regexp for $0, $1, $2, etc. 4015 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4016 RegexMatcher *cgMat = cgPat->matcher(status); 4017 4018 4019 // 4020 // Main Loop for the Perl Tests, runs once per line from the 4021 // test data file. 4022 // 4023 int32_t lineNum = 0; 4024 int32_t skippedUnimplementedCount = 0; 4025 while (lineMat->find()) { 4026 lineNum++; 4027 4028 // 4029 // Get a line, break it into its fields, do the Perl 4030 // variable substitutions. 
4031 // 4032 UnicodeString line = lineMat->group(1, status); 4033 UnicodeString fields[7]; 4034 fieldPat->split(line, fields, 7, status); 4035 4036 flagMat->reset(fields[0]); 4037 flagMat->matches(status); 4038 UnicodeString pattern = flagMat->group(2, status); 4039 pattern.findAndReplace("${bang}", "!"); 4040 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4041 pattern.findAndReplace(ffffSrc, ffff); 4042 4043 // 4044 // Identify patterns that include match flag settings, 4045 // split off the flags, remove the extra quotes. 4046 // 4047 UnicodeString flagStr = flagMat->group(3, status); 4048 if (U_FAILURE(status)) { 4049 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4050 return; 4051 } 4052 int32_t flags = 0; 4053 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4054 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4055 const UChar UChar_m = 0x6d; 4056 const UChar UChar_x = 0x78; 4057 const UChar UChar_y = 0x79; 4058 if (flagStr.indexOf(UChar_i) != -1) { 4059 flags |= UREGEX_CASE_INSENSITIVE; 4060 } 4061 if (flagStr.indexOf(UChar_m) != -1) { 4062 flags |= UREGEX_MULTILINE; 4063 } 4064 if (flagStr.indexOf(UChar_x) != -1) { 4065 flags |= UREGEX_COMMENTS; 4066 } 4067 4068 // 4069 // Compile the test pattern. 4070 // 4071 status = U_ZERO_ERROR; 4072 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4073 if (status == U_REGEX_UNIMPLEMENTED) { 4074 // 4075 // Test of a feature that is planned for ICU, but not yet implemented. 4076 // skip the test. 4077 skippedUnimplementedCount++; 4078 delete testPat; 4079 status = U_ZERO_ERROR; 4080 continue; 4081 } 4082 4083 if (U_FAILURE(status)) { 4084 // Some tests are supposed to generate errors. 4085 // Only report an error for tests that are supposed to succeed. 
4086 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4087 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4088 { 4089 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4090 } 4091 status = U_ZERO_ERROR; 4092 delete testPat; 4093 continue; 4094 } 4095 4096 if (fields[2].indexOf(UChar_i) >= 0) { 4097 // ICU should skip this test. 4098 delete testPat; 4099 continue; 4100 } 4101 4102 if (fields[2].indexOf(UChar_c) >= 0) { 4103 // This pattern should have caused a compilation error, but didn't/ 4104 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4105 delete testPat; 4106 continue; 4107 } 4108 4109 // 4110 // replace the Perl variables that appear in some of the 4111 // match data strings. 4112 // 4113 UnicodeString matchString = fields[1]; 4114 matchString.findAndReplace(nulnulSrc, nulnul); 4115 matchString.findAndReplace(ffffSrc, ffff); 4116 4117 // Replace any \n in the match string with an actual new-line char. 4118 // Don't do full unescape, as this unescapes more than Perl does, which 4119 // causes other spurious failures in the tests. 4120 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4121 4122 4123 4124 // 4125 // Run the test, check for expected match/don't match result. 4126 // 4127 RegexMatcher *testMat = testPat->matcher(matchString, status); 4128 UBool found = testMat->find(); 4129 UBool expected = FALSE; 4130 if (fields[2].indexOf(UChar_y) >=0) { 4131 expected = TRUE; 4132 } 4133 if (expected != found) { 4134 errln("line %d: Expected %smatch, got %smatch", 4135 lineNum, expected?"":"no ", found?"":"no " ); 4136 continue; 4137 } 4138 4139 // Don't try to check expected results if there is no match. 
4140 // (Some have stuff in the expected fields) 4141 if (!found) { 4142 delete testMat; 4143 delete testPat; 4144 continue; 4145 } 4146 4147 // 4148 // Interpret the Perl expression from the fourth field of the data file, 4149 // building up an ICU string from the results of the ICU match. 4150 // The Perl expression will contain references to the results of 4151 // a regex match, including the matched string, capture group strings, 4152 // group starting and ending indicies, etc. 4153 // 4154 UnicodeString resultString; 4155 UnicodeString perlExpr = fields[3]; 4156 #if SUPPORT_MUTATING_INPUT_STRING 4157 groupsMat->reset(perlExpr); 4158 cgMat->reset(perlExpr); 4159 #endif 4160 4161 while (perlExpr.length() > 0) { 4162 #if !SUPPORT_MUTATING_INPUT_STRING 4163 // Perferred usage. Reset after any modification to input string. 4164 groupsMat->reset(perlExpr); 4165 cgMat->reset(perlExpr); 4166 #endif 4167 4168 if (perlExpr.startsWith("$&")) { 4169 resultString.append(testMat->group(status)); 4170 perlExpr.remove(0, 2); 4171 } 4172 4173 else if (groupsMat->lookingAt(status)) { 4174 // $-[0] $+[2] etc. 4175 UnicodeString digitString = groupsMat->group(2, status); 4176 int32_t t = 0; 4177 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4178 UnicodeString plusOrMinus = groupsMat->group(1, status); 4179 int32_t matchPosition; 4180 if (plusOrMinus.compare("+") == 0) { 4181 matchPosition = testMat->end(groupNum, status); 4182 } else { 4183 matchPosition = testMat->start(groupNum, status); 4184 } 4185 if (matchPosition != -1) { 4186 ICU_Utility::appendNumber(resultString, matchPosition); 4187 } 4188 perlExpr.remove(0, groupsMat->end(status)); 4189 } 4190 4191 else if (cgMat->lookingAt(status)) { 4192 // $1, $2, $3, etc. 
4193 UnicodeString digitString = cgMat->group(1, status); 4194 int32_t t = 0; 4195 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4196 if (U_SUCCESS(status)) { 4197 resultString.append(testMat->group(groupNum, status)); 4198 status = U_ZERO_ERROR; 4199 } 4200 perlExpr.remove(0, cgMat->end(status)); 4201 } 4202 4203 else if (perlExpr.startsWith("@-")) { 4204 int32_t i; 4205 for (i=0; i<=testMat->groupCount(); i++) { 4206 if (i>0) { 4207 resultString.append(" "); 4208 } 4209 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4210 } 4211 perlExpr.remove(0, 2); 4212 } 4213 4214 else if (perlExpr.startsWith("@+")) { 4215 int32_t i; 4216 for (i=0; i<=testMat->groupCount(); i++) { 4217 if (i>0) { 4218 resultString.append(" "); 4219 } 4220 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4221 } 4222 perlExpr.remove(0, 2); 4223 } 4224 4225 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4226 // or as an escaped sequence (e.g. \n) 4227 if (perlExpr.length() > 1) { 4228 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4229 } 4230 UChar c = perlExpr.charAt(0); 4231 switch (c) { 4232 case 'n': c = '\n'; break; 4233 // add any other escape sequences that show up in the test expected results. 4234 } 4235 resultString.append(c); 4236 perlExpr.remove(0, 1); 4237 } 4238 4239 else { 4240 // Any characters from the perl expression that we don't explicitly 4241 // recognize before here are assumed to be literals and copied 4242 // as-is to the expected results. 
4243 resultString.append(perlExpr.charAt(0)); 4244 perlExpr.remove(0, 1); 4245 } 4246 4247 if (U_FAILURE(status)) { 4248 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4249 break; 4250 } 4251 } 4252 4253 // 4254 // Expected Results Compare 4255 // 4256 UnicodeString expectedS(fields[4]); 4257 expectedS.findAndReplace(nulnulSrc, nulnul); 4258 expectedS.findAndReplace(ffffSrc, ffff); 4259 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4260 4261 4262 if (expectedS.compare(resultString) != 0) { 4263 err("Line %d: Incorrect perl expression results.", lineNum); 4264 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4265 } 4266 4267 delete testMat; 4268 delete testPat; 4269 } 4270 4271 // 4272 // All done. Clean up allocated stuff. 4273 // 4274 delete cgMat; 4275 delete cgPat; 4276 4277 delete groupsMat; 4278 delete groupsPat; 4279 4280 delete flagMat; 4281 delete flagPat; 4282 4283 delete lineMat; 4284 delete linePat; 4285 4286 delete fieldPat; 4287 delete [] testData; 4288 4289 4290 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4291 4292 } 4293 4294 4295 //------------------------------------------------------------------------------- 4296 // 4297 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4298 // (instead of using UnicodeStrings) to test the alternate engine. 4299 // The input file for this test is re_tests, the standard regular 4300 // expression test data distributed with the Perl source code. 4301 // See PerlTests() for more information. 
4302 // 4303 //------------------------------------------------------------------------------- 4304 void RegexTest::PerlTestsUTF8() { 4305 char tdd[2048]; 4306 const char *srcPath; 4307 UErrorCode status = U_ZERO_ERROR; 4308 UParseError pe; 4309 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4310 UText patternText = UTEXT_INITIALIZER; 4311 char *patternChars = NULL; 4312 int32_t patternLength; 4313 int32_t patternCapacity = 0; 4314 UText inputText = UTEXT_INITIALIZER; 4315 char *inputChars = NULL; 4316 int32_t inputLength; 4317 int32_t inputCapacity = 0; 4318 4319 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4320 4321 // 4322 // Open and read the test data file. 4323 // 4324 srcPath=getPath(tdd, "re_tests.txt"); 4325 if(srcPath==NULL) { 4326 return; /* something went wrong, error already output */ 4327 } 4328 4329 int32_t len; 4330 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4331 if (U_FAILURE(status)) { 4332 return; /* something went wrong, error already output */ 4333 } 4334 4335 // 4336 // Put the test data into a UnicodeString 4337 // 4338 UnicodeString testDataString(FALSE, testData, len); 4339 4340 // 4341 // Regex to break the input file into lines, and strip the new lines. 4342 // One line per match, capture group one is the desired data. 4343 // 4344 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4345 if (U_FAILURE(status)) { 4346 dataerrln("RegexPattern::compile() error"); 4347 return; 4348 } 4349 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4350 4351 // 4352 // Regex to split a test file line into fields. 4353 // There are six fields, separated by tabs. 4354 // 4355 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4356 4357 // 4358 // Regex to identify test patterns with flag settings, and to separate them. 
4359 // Test patterns with flags look like 'pattern'i 4360 // Test patterns without flags are not quoted: pattern 4361 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4362 // 4363 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4364 RegexMatcher* flagMat = flagPat->matcher(status); 4365 4366 // 4367 // The Perl tests reference several perl-isms, which are evaluated/substituted 4368 // in the test data. Not being perl, this must be done explicitly. Here 4369 // are string constants and REs for these constructs. 4370 // 4371 UnicodeString nulnulSrc("${nulnul}"); 4372 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4373 nulnul = nulnul.unescape(); 4374 4375 UnicodeString ffffSrc("${ffff}"); 4376 UnicodeString ffff("\\uffff", -1, US_INV); 4377 ffff = ffff.unescape(); 4378 4379 // regexp for $-[0], $+[2], etc. 4380 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4381 RegexMatcher *groupsMat = groupsPat->matcher(status); 4382 4383 // regexp for $0, $1, $2, etc. 4384 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4385 RegexMatcher *cgMat = cgPat->matcher(status); 4386 4387 4388 // 4389 // Main Loop for the Perl Tests, runs once per line from the 4390 // test data file. 4391 // 4392 int32_t lineNum = 0; 4393 int32_t skippedUnimplementedCount = 0; 4394 while (lineMat->find()) { 4395 lineNum++; 4396 4397 // 4398 // Get a line, break it into its fields, do the Perl 4399 // variable substitutions. 
4400 // 4401 UnicodeString line = lineMat->group(1, status); 4402 UnicodeString fields[7]; 4403 fieldPat->split(line, fields, 7, status); 4404 4405 flagMat->reset(fields[0]); 4406 flagMat->matches(status); 4407 UnicodeString pattern = flagMat->group(2, status); 4408 pattern.findAndReplace("${bang}", "!"); 4409 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4410 pattern.findAndReplace(ffffSrc, ffff); 4411 4412 // 4413 // Identify patterns that include match flag settings, 4414 // split off the flags, remove the extra quotes. 4415 // 4416 UnicodeString flagStr = flagMat->group(3, status); 4417 if (U_FAILURE(status)) { 4418 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4419 return; 4420 } 4421 int32_t flags = 0; 4422 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4423 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4424 const UChar UChar_m = 0x6d; 4425 const UChar UChar_x = 0x78; 4426 const UChar UChar_y = 0x79; 4427 if (flagStr.indexOf(UChar_i) != -1) { 4428 flags |= UREGEX_CASE_INSENSITIVE; 4429 } 4430 if (flagStr.indexOf(UChar_m) != -1) { 4431 flags |= UREGEX_MULTILINE; 4432 } 4433 if (flagStr.indexOf(UChar_x) != -1) { 4434 flags |= UREGEX_COMMENTS; 4435 } 4436 4437 // 4438 // Put the pattern in a UTF-8 UText 4439 // 4440 status = U_ZERO_ERROR; 4441 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4442 if (status == U_BUFFER_OVERFLOW_ERROR) { 4443 status = U_ZERO_ERROR; 4444 delete[] patternChars; 4445 patternCapacity = patternLength + 1; 4446 patternChars = new char[patternCapacity]; 4447 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4448 } 4449 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4450 4451 // 4452 // Compile the test pattern. 
4453 // 4454 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4455 if (status == U_REGEX_UNIMPLEMENTED) { 4456 // 4457 // Test of a feature that is planned for ICU, but not yet implemented. 4458 // skip the test. 4459 skippedUnimplementedCount++; 4460 delete testPat; 4461 status = U_ZERO_ERROR; 4462 continue; 4463 } 4464 4465 if (U_FAILURE(status)) { 4466 // Some tests are supposed to generate errors. 4467 // Only report an error for tests that are supposed to succeed. 4468 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4469 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4470 { 4471 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4472 } 4473 status = U_ZERO_ERROR; 4474 delete testPat; 4475 continue; 4476 } 4477 4478 if (fields[2].indexOf(UChar_i) >= 0) { 4479 // ICU should skip this test. 4480 delete testPat; 4481 continue; 4482 } 4483 4484 if (fields[2].indexOf(UChar_c) >= 0) { 4485 // This pattern should have caused a compilation error, but didn't/ 4486 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4487 delete testPat; 4488 continue; 4489 } 4490 4491 4492 // 4493 // replace the Perl variables that appear in some of the 4494 // match data strings. 4495 // 4496 UnicodeString matchString = fields[1]; 4497 matchString.findAndReplace(nulnulSrc, nulnul); 4498 matchString.findAndReplace(ffffSrc, ffff); 4499 4500 // Replace any \n in the match string with an actual new-line char. 4501 // Don't do full unescape, as this unescapes more than Perl does, which 4502 // causes other spurious failures in the tests. 
4503 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4504 4505 // 4506 // Put the input in a UTF-8 UText 4507 // 4508 status = U_ZERO_ERROR; 4509 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4510 if (status == U_BUFFER_OVERFLOW_ERROR) { 4511 status = U_ZERO_ERROR; 4512 delete[] inputChars; 4513 inputCapacity = inputLength + 1; 4514 inputChars = new char[inputCapacity]; 4515 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4516 } 4517 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4518 4519 // 4520 // Run the test, check for expected match/don't match result. 4521 // 4522 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4523 UBool found = testMat->find(); 4524 UBool expected = FALSE; 4525 if (fields[2].indexOf(UChar_y) >=0) { 4526 expected = TRUE; 4527 } 4528 if (expected != found) { 4529 errln("line %d: Expected %smatch, got %smatch", 4530 lineNum, expected?"":"no ", found?"":"no " ); 4531 continue; 4532 } 4533 4534 // Don't try to check expected results if there is no match. 4535 // (Some have stuff in the expected fields) 4536 if (!found) { 4537 delete testMat; 4538 delete testPat; 4539 continue; 4540 } 4541 4542 // 4543 // Interpret the Perl expression from the fourth field of the data file, 4544 // building up an ICU string from the results of the ICU match. 4545 // The Perl expression will contain references to the results of 4546 // a regex match, including the matched string, capture group strings, 4547 // group starting and ending indicies, etc. 
4548 // 4549 UnicodeString resultString; 4550 UnicodeString perlExpr = fields[3]; 4551 4552 while (perlExpr.length() > 0) { 4553 groupsMat->reset(perlExpr); 4554 cgMat->reset(perlExpr); 4555 4556 if (perlExpr.startsWith("$&")) { 4557 resultString.append(testMat->group(status)); 4558 perlExpr.remove(0, 2); 4559 } 4560 4561 else if (groupsMat->lookingAt(status)) { 4562 // $-[0] $+[2] etc. 4563 UnicodeString digitString = groupsMat->group(2, status); 4564 int32_t t = 0; 4565 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4566 UnicodeString plusOrMinus = groupsMat->group(1, status); 4567 int32_t matchPosition; 4568 if (plusOrMinus.compare("+") == 0) { 4569 matchPosition = testMat->end(groupNum, status); 4570 } else { 4571 matchPosition = testMat->start(groupNum, status); 4572 } 4573 if (matchPosition != -1) { 4574 ICU_Utility::appendNumber(resultString, matchPosition); 4575 } 4576 perlExpr.remove(0, groupsMat->end(status)); 4577 } 4578 4579 else if (cgMat->lookingAt(status)) { 4580 // $1, $2, $3, etc. 4581 UnicodeString digitString = cgMat->group(1, status); 4582 int32_t t = 0; 4583 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4584 if (U_SUCCESS(status)) { 4585 resultString.append(testMat->group(groupNum, status)); 4586 status = U_ZERO_ERROR; 4587 } 4588 perlExpr.remove(0, cgMat->end(status)); 4589 } 4590 4591 else if (perlExpr.startsWith("@-")) { 4592 int32_t i; 4593 for (i=0; i<=testMat->groupCount(); i++) { 4594 if (i>0) { 4595 resultString.append(" "); 4596 } 4597 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4598 } 4599 perlExpr.remove(0, 2); 4600 } 4601 4602 else if (perlExpr.startsWith("@+")) { 4603 int32_t i; 4604 for (i=0; i<=testMat->groupCount(); i++) { 4605 if (i>0) { 4606 resultString.append(" "); 4607 } 4608 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4609 } 4610 perlExpr.remove(0, 2); 4611 } 4612 4613 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4614 // or as an escaped sequence (e.g. \n) 4615 if (perlExpr.length() > 1) { 4616 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4617 } 4618 UChar c = perlExpr.charAt(0); 4619 switch (c) { 4620 case 'n': c = '\n'; break; 4621 // add any other escape sequences that show up in the test expected results. 4622 } 4623 resultString.append(c); 4624 perlExpr.remove(0, 1); 4625 } 4626 4627 else { 4628 // Any characters from the perl expression that we don't explicitly 4629 // recognize before here are assumed to be literals and copied 4630 // as-is to the expected results. 4631 resultString.append(perlExpr.charAt(0)); 4632 perlExpr.remove(0, 1); 4633 } 4634 4635 if (U_FAILURE(status)) { 4636 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4637 break; 4638 } 4639 } 4640 4641 // 4642 // Expected Results Compare 4643 // 4644 UnicodeString expectedS(fields[4]); 4645 expectedS.findAndReplace(nulnulSrc, nulnul); 4646 expectedS.findAndReplace(ffffSrc, ffff); 4647 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4648 4649 4650 if (expectedS.compare(resultString) != 0) { 4651 err("Line %d: Incorrect perl expression results.", lineNum); 4652 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4653 } 4654 4655 delete testMat; 4656 delete testPat; 4657 } 4658 4659 // 4660 // All done. Clean up allocated stuff. 
4661 // 4662 delete cgMat; 4663 delete cgPat; 4664 4665 delete groupsMat; 4666 delete groupsPat; 4667 4668 delete flagMat; 4669 delete flagPat; 4670 4671 delete lineMat; 4672 delete linePat; 4673 4674 delete fieldPat; 4675 delete [] testData; 4676 4677 utext_close(&patternText); 4678 utext_close(&inputText); 4679 4680 delete [] patternChars; 4681 delete [] inputChars; 4682 4683 4684 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4685 4686 } 4687 4688 4689 //-------------------------------------------------------------- 4690 // 4691 // Bug6149 Verify limits to heap expansion for backtrack stack. 4692 // Use this pattern, 4693 // "(a?){1,8000000}" 4694 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled. 4695 // This test is likely to be fragile, as further optimizations stop 4696 // more cases of pointless looping in the match engine. 4697 // 4698 //--------------------------------------------------------------- 4699 void RegexTest::Bug6149() { 4700 UnicodeString pattern("(a?){1,8000000}"); 4701 UnicodeString s("xyz"); 4702 uint32_t flags = 0; 4703 UErrorCode status = U_ZERO_ERROR; 4704 4705 RegexMatcher matcher(pattern, s, flags, status); 4706 UBool result = false; 4707 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4708 REGEX_ASSERT(result == FALSE); 4709 } 4710 4711 4712 // 4713 // Callbacks() Test the callback function. 4714 // When set, callbacks occur periodically during matching operations, 4715 // giving the application code the ability to abort the operation 4716 // before it's normal completion. 
4717 // 4718 4719 struct callBackContext { 4720 RegexTest *test; 4721 int32_t maxCalls; 4722 int32_t numCalls; 4723 int32_t lastSteps; 4724 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4725 }; 4726 4727 U_CDECL_BEGIN 4728 static UBool U_CALLCONV 4729 testCallBackFn(const void *context, int32_t steps) { 4730 callBackContext *info = (callBackContext *)context; 4731 if (info->lastSteps+1 != steps) { 4732 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 4733 } 4734 info->lastSteps = steps; 4735 info->numCalls++; 4736 return (info->numCalls < info->maxCalls); 4737 } 4738 U_CDECL_END 4739 4740 void RegexTest::Callbacks() { 4741 { 4742 // Getter returns NULLs if no callback has been set 4743 4744 // The variables that the getter will fill in. 4745 // Init to non-null values so that the action of the getter can be seen. 4746 const void *returnedContext = &returnedContext; 4747 URegexMatchCallback *returnedFn = &testCallBackFn; 4748 4749 UErrorCode status = U_ZERO_ERROR; 4750 RegexMatcher matcher("x", 0, status); 4751 REGEX_CHECK_STATUS; 4752 matcher.getMatchCallback(returnedFn, returnedContext, status); 4753 REGEX_CHECK_STATUS; 4754 REGEX_ASSERT(returnedFn == NULL); 4755 REGEX_ASSERT(returnedContext == NULL); 4756 } 4757 4758 { 4759 // Set and Get work 4760 callBackContext cbInfo = {this, 0, 0, 0}; 4761 const void *returnedContext; 4762 URegexMatchCallback *returnedFn; 4763 UErrorCode status = U_ZERO_ERROR; 4764 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 
4765 REGEX_CHECK_STATUS; 4766 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4767 REGEX_CHECK_STATUS; 4768 matcher.getMatchCallback(returnedFn, returnedContext, status); 4769 REGEX_CHECK_STATUS; 4770 REGEX_ASSERT(returnedFn == testCallBackFn); 4771 REGEX_ASSERT(returnedContext == &cbInfo); 4772 4773 // A short-running match shouldn't invoke the callback 4774 status = U_ZERO_ERROR; 4775 cbInfo.reset(1); 4776 UnicodeString s = "xxx"; 4777 matcher.reset(s); 4778 REGEX_ASSERT(matcher.matches(status)); 4779 REGEX_CHECK_STATUS; 4780 REGEX_ASSERT(cbInfo.numCalls == 0); 4781 4782 // A medium-length match that runs long enough to invoke the 4783 // callback, but not so long that the callback aborts it. 4784 status = U_ZERO_ERROR; 4785 cbInfo.reset(4); 4786 s = "aaaaaaaaaaaaaaaaaaab"; 4787 matcher.reset(s); 4788 REGEX_ASSERT(matcher.matches(status)==FALSE); 4789 REGEX_CHECK_STATUS; 4790 REGEX_ASSERT(cbInfo.numCalls > 0); 4791 4792 // A longer running match that the callback function will abort. 4793 status = U_ZERO_ERROR; 4794 cbInfo.reset(4); 4795 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4796 matcher.reset(s); 4797 REGEX_ASSERT(matcher.matches(status)==FALSE); 4798 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4799 REGEX_ASSERT(cbInfo.numCalls == 4); 4800 } 4801 4802 4803 } 4804 4805 4806 // 4807 // FindProgressCallbacks() Test the find "progress" callback function. 4808 // When set, the find progress callback will be invoked during a find operations 4809 // after each return from a match attempt, giving the application the opportunity 4810 // to terminate a long-running find operation before it's normal completion. 
4811 // 4812 4813 struct progressCallBackContext { 4814 RegexTest *test; 4815 int64_t lastIndex; 4816 int32_t maxCalls; 4817 int32_t numCalls; 4818 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; 4819 }; 4820 4821 U_CDECL_BEGIN 4822 static UBool U_CALLCONV 4823 testProgressCallBackFn(const void *context, int64_t matchIndex) { 4824 progressCallBackContext *info = (progressCallBackContext *)context; 4825 info->numCalls++; 4826 info->lastIndex = matchIndex; 4827 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls); 4828 return (info->numCalls < info->maxCalls); 4829 } 4830 U_CDECL_END 4831 4832 void RegexTest::FindProgressCallbacks() { 4833 { 4834 // Getter returns NULLs if no callback has been set 4835 4836 // The variables that the getter will fill in. 4837 // Init to non-null values so that the action of the getter can be seen. 4838 const void *returnedContext = &returnedContext; 4839 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; 4840 4841 UErrorCode status = U_ZERO_ERROR; 4842 RegexMatcher matcher("x", 0, status); 4843 REGEX_CHECK_STATUS; 4844 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4845 REGEX_CHECK_STATUS; 4846 REGEX_ASSERT(returnedFn == NULL); 4847 REGEX_ASSERT(returnedContext == NULL); 4848 } 4849 4850 { 4851 // Set and Get work 4852 progressCallBackContext cbInfo = {this, 0, 0, 0}; 4853 const void *returnedContext; 4854 URegexFindProgressCallback *returnedFn; 4855 UErrorCode status = U_ZERO_ERROR; 4856 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 
4857 REGEX_CHECK_STATUS; 4858 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); 4859 REGEX_CHECK_STATUS; 4860 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4861 REGEX_CHECK_STATUS; 4862 REGEX_ASSERT(returnedFn == testProgressCallBackFn); 4863 REGEX_ASSERT(returnedContext == &cbInfo); 4864 4865 // A short-running match should NOT invoke the callback. 4866 status = U_ZERO_ERROR; 4867 cbInfo.reset(100); 4868 UnicodeString s = "abxxx"; 4869 matcher.reset(s); 4870 #if 0 4871 matcher.setTrace(TRUE); 4872 #endif 4873 REGEX_ASSERT(matcher.find(0, status)); 4874 REGEX_CHECK_STATUS; 4875 REGEX_ASSERT(cbInfo.numCalls == 0); 4876 4877 // A medium running match that causes matcher.find() to invoke our callback for each index. 4878 status = U_ZERO_ERROR; 4879 s = "aaaaaaaaaaaaaaaaaaab"; 4880 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string 4881 matcher.reset(s); 4882 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4883 REGEX_CHECK_STATUS; 4884 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); 4885 4886 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. 
4887 status = U_ZERO_ERROR; 4888 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; 4889 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string 4890 matcher.reset(s1); 4891 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4892 REGEX_CHECK_STATUS; 4893 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); 4894 4895 #if 0 4896 // Now a match that will succeed, but after an interruption 4897 status = U_ZERO_ERROR; 4898 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; 4899 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string 4900 matcher.reset(s2); 4901 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4902 REGEX_CHECK_STATUS; 4903 // Now retry the match from where left off 4904 cbInfo.maxCalls = 100; // No callback limit 4905 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); 4906 REGEX_CHECK_STATUS; 4907 #endif 4908 } 4909 4910 4911 } 4912 4913 4914 //--------------------------------------------------------------------------- 4915 // 4916 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 4917 // UTexts. The pure-C implementation of UText 4918 // has no mutable backing stores, but we can 4919 // use UnicodeString here to test the functionality. 
4920 // 4921 //--------------------------------------------------------------------------- 4922 void RegexTest::PreAllocatedUTextCAPI () { 4923 UErrorCode status = U_ZERO_ERROR; 4924 URegularExpression *re; 4925 UText patternText = UTEXT_INITIALIZER; 4926 UnicodeString buffer; 4927 UText bufferText = UTEXT_INITIALIZER; 4928 4929 utext_openUnicodeString(&bufferText, &buffer, &status); 4930 4931 /* 4932 * getText() and getUText() 4933 */ 4934 { 4935 UText text1 = UTEXT_INITIALIZER; 4936 UText text2 = UTEXT_INITIALIZER; 4937 UChar text2Chars[20]; 4938 UText *resultText; 4939 4940 status = U_ZERO_ERROR; 4941 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 4942 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 4943 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 4944 utext_openUChars(&text2, text2Chars, -1, &status); 4945 4946 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 4947 re = uregex_openUText(&patternText, 0, NULL, &status); 4948 4949 /* First set a UText */ 4950 uregex_setUText(re, &text1, &status); 4951 resultText = uregex_getUText(re, &bufferText, &status); 4952 REGEX_CHECK_STATUS; 4953 REGEX_ASSERT(resultText == &bufferText); 4954 utext_setNativeIndex(resultText, 0); 4955 utext_setNativeIndex(&text1, 0); 4956 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 4957 4958 resultText = uregex_getUText(re, &bufferText, &status); 4959 REGEX_CHECK_STATUS; 4960 REGEX_ASSERT(resultText == &bufferText); 4961 utext_setNativeIndex(resultText, 0); 4962 utext_setNativeIndex(&text1, 0); 4963 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 4964 4965 /* Then set a UChar * */ 4966 uregex_setText(re, text2Chars, 7, &status); 4967 resultText = uregex_getUText(re, &bufferText, &status); 4968 REGEX_CHECK_STATUS; 4969 REGEX_ASSERT(resultText == &bufferText); 4970 utext_setNativeIndex(resultText, 0); 4971 utext_setNativeIndex(&text2, 0); 4972 REGEX_ASSERT(testUTextEqual(resultText, &text2)); 4973 4974 uregex_close(re); 4975 
utext_close(&text1); 4976 utext_close(&text2); 4977 } 4978 4979 /* 4980 * group() 4981 */ 4982 { 4983 UChar text1[80]; 4984 UText *actual; 4985 UBool result; 4986 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); 4987 4988 status = U_ZERO_ERROR; 4989 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 4990 REGEX_CHECK_STATUS; 4991 4992 uregex_setText(re, text1, -1, &status); 4993 result = uregex_find(re, 0, &status); 4994 REGEX_ASSERT(result==TRUE); 4995 4996 /* Capture Group 0, the full match. Should succeed. */ 4997 status = U_ZERO_ERROR; 4998 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); 4999 REGEX_CHECK_STATUS; 5000 REGEX_ASSERT(actual == &bufferText); 5001 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); 5002 5003 /* Capture group #1. Should succeed. */ 5004 status = U_ZERO_ERROR; 5005 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); 5006 REGEX_CHECK_STATUS; 5007 REGEX_ASSERT(actual == &bufferText); 5008 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); 5009 5010 /* Capture group out of range. Error. 
*/ 5011 status = U_ZERO_ERROR; 5012 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); 5013 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5014 REGEX_ASSERT(actual == &bufferText); 5015 5016 uregex_close(re); 5017 5018 } 5019 5020 /* 5021 * replaceFirst() 5022 */ 5023 { 5024 UChar text1[80]; 5025 UChar text2[80]; 5026 UText replText = UTEXT_INITIALIZER; 5027 UText *result; 5028 5029 status = U_ZERO_ERROR; 5030 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5031 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5032 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5033 5034 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5035 REGEX_CHECK_STATUS; 5036 5037 /* Normal case, with match */ 5038 uregex_setText(re, text1, -1, &status); 5039 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5040 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5041 REGEX_CHECK_STATUS; 5042 REGEX_ASSERT(result == &bufferText); 5043 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5044 5045 /* No match. Text should copy to output with no changes. 
*/ 5046 uregex_setText(re, text2, -1, &status); 5047 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5048 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5049 REGEX_CHECK_STATUS; 5050 REGEX_ASSERT(result == &bufferText); 5051 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5052 5053 /* Unicode escapes */ 5054 uregex_setText(re, text1, -1, &status); 5055 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); 5056 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5057 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5058 REGEX_CHECK_STATUS; 5059 REGEX_ASSERT(result == &bufferText); 5060 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5061 5062 uregex_close(re); 5063 utext_close(&replText); 5064 } 5065 5066 5067 /* 5068 * replaceAll() 5069 */ 5070 { 5071 UChar text1[80]; 5072 UChar text2[80]; 5073 UText replText = UTEXT_INITIALIZER; 5074 UText *result; 5075 5076 status = U_ZERO_ERROR; 5077 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5078 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5079 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5080 5081 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5082 REGEX_CHECK_STATUS; 5083 5084 /* Normal case, with match */ 5085 uregex_setText(re, text1, -1, &status); 5086 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5087 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5088 REGEX_CHECK_STATUS; 5089 REGEX_ASSERT(result == &bufferText); 5090 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5091 5092 /* No match. Text should copy to output with no changes. 
*/ 5093 uregex_setText(re, text2, -1, &status); 5094 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5095 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5096 REGEX_CHECK_STATUS; 5097 REGEX_ASSERT(result == &bufferText); 5098 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5099 5100 uregex_close(re); 5101 utext_close(&replText); 5102 } 5103 5104 5105 /* 5106 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5107 * so we don't need to test it here. 5108 */ 5109 5110 utext_close(&bufferText); 5111 utext_close(&patternText); 5112 } 5113 5114 //-------------------------------------------------------------- 5115 // 5116 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5117 // 5118 //--------------------------------------------------------------- 5119 void RegexTest::Bug7651() { 5120 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5121 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5122 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 
5123 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5124 UnicodeString s("#ff @abcd This is test"); 5125 RegexPattern *REPattern = NULL; 5126 RegexMatcher *REMatcher = NULL; 5127 UErrorCode status = U_ZERO_ERROR; 5128 UParseError pe; 5129 5130 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5131 REGEX_CHECK_STATUS; 5132 REMatcher = REPattern->matcher(s, status); 5133 REGEX_CHECK_STATUS; 5134 REGEX_ASSERT(REMatcher->find()); 5135 REGEX_ASSERT(REMatcher->start(status) == 0); 5136 delete REPattern; 5137 delete REMatcher; 5138 status = U_ZERO_ERROR; 5139 5140 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5141 REGEX_CHECK_STATUS; 5142 REMatcher = REPattern->matcher(s, status); 5143 REGEX_CHECK_STATUS; 5144 REGEX_ASSERT(REMatcher->find()); 5145 REGEX_ASSERT(REMatcher->start(status) == 0); 5146 delete REPattern; 5147 delete REMatcher; 5148 status = U_ZERO_ERROR; 5149 } 5150 5151 void RegexTest::Bug7740() { 5152 UErrorCode status = U_ZERO_ERROR; 5153 UnicodeString pattern = "(a)"; 5154 UnicodeString text = "abcdef"; 5155 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5156 REGEX_CHECK_STATUS; 5157 REGEX_ASSERT(m->lookingAt(status)); 5158 REGEX_CHECK_STATUS; 5159 status = U_ILLEGAL_ARGUMENT_ERROR; 5160 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5161 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5162 REGEX_ASSERT(s == ""); 5163 delete m; 5164 } 5165 5166 // Bug 8479: was crashing whith a Bogus UnicodeString as input. 
5167 5168 void RegexTest::Bug8479() { 5169 UErrorCode status = U_ZERO_ERROR; 5170 5171 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5172 REGEX_CHECK_STATUS; 5173 if (U_SUCCESS(status)) 5174 { 5175 UnicodeString str; 5176 str.setToBogus(); 5177 pMatcher->reset(str); 5178 status = U_ZERO_ERROR; 5179 pMatcher->matches(status); 5180 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5181 delete pMatcher; 5182 } 5183 } 5184 5185 5186 // Bug 7029 5187 void RegexTest::Bug7029() { 5188 UErrorCode status = U_ZERO_ERROR; 5189 5190 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5191 UnicodeString text = "abc.def"; 5192 UnicodeString splits[10]; 5193 REGEX_CHECK_STATUS; 5194 int32_t numFields = pMatcher->split(text, splits, 10, status); 5195 REGEX_CHECK_STATUS; 5196 REGEX_ASSERT(numFields == 8); 5197 delete pMatcher; 5198 } 5199 5200 // Bug 9283 5201 // This test is checking for the existance of any supplemental characters that case-fold 5202 // to a bmp character. 5203 // 5204 // At the time of this writing there are none. If any should appear in a subsequent release 5205 // of Unicode, the code in regular expressions compilation that determines the longest 5206 // posssible match for a literal string will need to be enhanced. 5207 // 5208 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() 5209 // for details on what to do in case of a failure of this test. 
5210 // 5211 void RegexTest::Bug9283() { 5212 #if !UCONFIG_NO_NORMALIZATION 5213 UErrorCode status = U_ZERO_ERROR; 5214 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); 5215 REGEX_CHECK_STATUS; 5216 int32_t index; 5217 UChar32 c; 5218 for (index=0; ; index++) { 5219 c = supplementalsWithCaseFolding.charAt(index); 5220 if (c == -1) { 5221 break; 5222 } 5223 UnicodeString cf = UnicodeString(c).foldCase(); 5224 REGEX_ASSERT(cf.length() >= 2); 5225 } 5226 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 5227 } 5228 5229 5230 void RegexTest::CheckInvBufSize() { 5231 if(inv_next>=INV_BUFSIZ) { 5232 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5233 __FILE__, INV_BUFSIZ, inv_next); 5234 } else { 5235 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5236 } 5237 } 5238 5239 5240 void RegexTest::Bug10459() { 5241 UErrorCode status = U_ZERO_ERROR; 5242 UnicodeString patternString("(txt)"); 5243 UnicodeString txtString("txt"); 5244 5245 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); 5246 REGEX_CHECK_STATUS; 5247 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); 5248 REGEX_CHECK_STATUS; 5249 5250 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); 5251 REGEX_CHECK_STATUS; 5252 5253 uregex_setUText(icu_re, utext_txt, &status); 5254 REGEX_CHECK_STATUS; 5255 5256 // The bug was that calling uregex_group() before doing a matching operation 5257 // was causing a segfault. Only for Regular Expressions created from UText. 5258 // It should set an U_REGEX_INVALID_STATE. 5259 5260 UChar buf[100]; 5261 int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status); 5262 REGEX_ASSERT(status == U_REGEX_INVALID_STATE); 5263 REGEX_ASSERT(len == 0); 5264 5265 uregex_close(icu_re); 5266 utext_close(utext_pat); 5267 utext_close(utext_txt); 5268 } 5269 5270 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5271 5272