1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 // 8 // regextst.cpp 9 // 10 // ICU Regular Expressions test, part of intltest. 11 // 12 13 #include "intltest.h" 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16 #include "unicode/regex.h" 17 #include "unicode/uchar.h" 18 #include "unicode/ucnv.h" 19 #include "unicode/ustring.h" 20 #include "regextst.h" 21 #include "uvector.h" 22 #include "util.h" 23 #include <stdlib.h> 24 #include <string.h> 25 #include <stdio.h> 26 #include "cstring.h" 27 #include "uinvchar.h" 28 29 #define SUPPORT_MUTATING_INPUT_STRING 0 30 31 //--------------------------------------------------------------------------- 32 // 33 // Test class boilerplate 34 // 35 //--------------------------------------------------------------------------- 36 RegexTest::RegexTest() 37 { 38 } 39 40 41 RegexTest::~RegexTest() 42 { 43 } 44 45 46 47 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 48 { 49 if (exec) logln("TestSuite RegexTest: "); 50 switch (index) { 51 52 case 0: name = "Basic"; 53 if (exec) Basic(); 54 break; 55 case 1: name = "API_Match"; 56 if (exec) API_Match(); 57 break; 58 case 2: name = "API_Replace"; 59 if (exec) API_Replace(); 60 break; 61 case 3: name = "API_Pattern"; 62 if (exec) API_Pattern(); 63 break; 64 case 4: 65 #if !UCONFIG_NO_FILE_IO 66 name = "Extended"; 67 if (exec) Extended(); 68 #else 69 name = "skip"; 70 #endif 71 break; 72 case 5: name = "Errors"; 73 if (exec) Errors(); 74 break; 75 case 6: name = "PerlTests"; 76 if (exec) PerlTests(); 77 break; 78 case 7: name = "Callbacks"; 79 if (exec) Callbacks(); 80 break; 81 case 8: name = "FindProgressCallbacks"; 82 if (exec) FindProgressCallbacks(); 83 break; 84 case 9: name = "Bug 6149"; 85 if (exec) Bug6149(); 86 break; 87 
case 10: name = "UTextBasic"; 88 if (exec) UTextBasic(); 89 break; 90 case 11: name = "API_Match_UTF8"; 91 if (exec) API_Match_UTF8(); 92 break; 93 case 12: name = "API_Replace_UTF8"; 94 if (exec) API_Replace_UTF8(); 95 break; 96 case 13: name = "API_Pattern_UTF8"; 97 if (exec) API_Pattern_UTF8(); 98 break; 99 case 14: name = "PerlTestsUTF8"; 100 if (exec) PerlTestsUTF8(); 101 break; 102 case 15: name = "PreAllocatedUTextCAPI"; 103 if (exec) PreAllocatedUTextCAPI(); 104 break; 105 case 16: name = "Bug 7651"; 106 if (exec) Bug7651(); 107 break; 108 case 17: name = "Bug 7740"; 109 if (exec) Bug7740(); 110 break; 111 112 default: name = ""; 113 break; //needed to end loop 114 } 115 } 116 117 118 /** 119 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 120 * into ASCII. 121 * @see utext_openUTF8 122 */ 123 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 124 125 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 126 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 127 return utext_openUTF8(ut, inv, length, status); 128 #else 129 char buf[1024]; 130 131 uprv_aestrncpy((uint8_t*)buf, (const uint8_t*)inv, length); 132 133 return utext_openUTF8(ut, buf, length, status); 134 #endif 135 } 136 137 //--------------------------------------------------------------------------- 138 // 139 // Error Checking / Reporting macros used in all of the tests. 
140 // 141 //--------------------------------------------------------------------------- 142 143 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 144 int64_t oldIndex = utext_getNativeIndex(text); 145 utext_setNativeIndex(text, 0); 146 char *bufPtr = buf; 147 UChar32 c = utext_next32From(text, 0); 148 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 149 if (0x000020<=c && c<0x00007e) { 150 *bufPtr = c; 151 } else { 152 #if 0 153 sprintf(bufPtr,"U+%04X", c); 154 bufPtr+= strlen(bufPtr)-1; 155 #else 156 *bufPtr = '%'; 157 #endif 158 } 159 bufPtr++; 160 c = UTEXT_NEXT32(text); 161 } 162 *bufPtr = 0; 163 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 164 char *ebuf = (char*)malloc(bufLen); 165 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 166 uprv_strncpy(buf, ebuf, bufLen); 167 free((void*)ebuf); 168 #endif 169 utext_setNativeIndex(text, oldIndex); 170 } 171 172 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 173 174 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ 175 __FILE__, __LINE__, u_errorName(status)); return;}} 176 177 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 178 179 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 180 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 181 __LINE__, u_errorName(errcode), u_errorName(status));};} 182 183 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 184 "RegexTest failure at line %d, from %d. 
status=%d\n",__LINE__, (line), status); }} 185 186 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 187 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 188 189 /** 190 * @param expected expected text in UTF-8 (not platform) codepage 191 */ 192 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 193 UErrorCode status = U_ZERO_ERROR; 194 UText expectedText = UTEXT_INITIALIZER; 195 utext_openUTF8(&expectedText, expected, -1, &status); 196 if(U_FAILURE(status)) { 197 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 198 return; 199 } 200 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 201 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 202 return; 203 } 204 utext_setNativeIndex(actual, 0); 205 if (utext_compare(&expectedText, -1, actual, -1) != 0) { 206 char buf[201 /*21*/]; 207 char expectedBuf[201]; 208 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 209 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 210 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 211 } 212 utext_close(&expectedText); 213 } 214 /** 215 * @param expected invariant (platform local text) input 216 */ 217 218 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 219 UErrorCode status = U_ZERO_ERROR; 220 UText expectedText = UTEXT_INITIALIZER; 221 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 222 if(U_FAILURE(status)) { 223 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), 
strlen(expected)); 224 return; 225 } 226 utext_setNativeIndex(actual, 0); 227 if (utext_compare(&expectedText, -1, actual, -1) != 0) { 228 char buf[201 /*21*/]; 229 char expectedBuf[201]; 230 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 231 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 232 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 233 } 234 utext_close(&expectedText); 235 } 236 237 /** 238 * Assumes utf-8 input 239 */ 240 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 241 /** 242 * Assumes Invariant input 243 */ 244 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 245 246 247 //--------------------------------------------------------------------------- 248 // 249 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests 250 // for the LookingAt() and Match() functions. 251 // 252 // usage: 253 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 254 // 255 // The expected results are UBool - TRUE or FALSE. 256 // The input text is unescaped. The pattern is not. 
257 // 258 // 259 //--------------------------------------------------------------------------- 260 261 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 262 263 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 264 const UnicodeString pattern(pat, -1, US_INV); 265 const UnicodeString inputText(text, -1, US_INV); 266 UErrorCode status = U_ZERO_ERROR; 267 UParseError pe; 268 RegexPattern *REPattern = NULL; 269 RegexMatcher *REMatcher = NULL; 270 UBool retVal = TRUE; 271 272 UnicodeString patString(pat, -1, US_INV); 273 REPattern = RegexPattern::compile(patString, 0, pe, status); 274 if (U_FAILURE(status)) { 275 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 276 line, u_errorName(status)); 277 return FALSE; 278 } 279 if (line==376) { RegexPatternDump(REPattern);} 280 281 UnicodeString inputString(inputText); 282 UnicodeString unEscapedInput = inputString.unescape(); 283 REMatcher = REPattern->matcher(unEscapedInput, status); 284 if (U_FAILURE(status)) { 285 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 286 line, u_errorName(status)); 287 return FALSE; 288 } 289 290 UBool actualmatch; 291 actualmatch = REMatcher->lookingAt(status); 292 if (U_FAILURE(status)) { 293 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 294 line, u_errorName(status)); 295 retVal = FALSE; 296 } 297 if (actualmatch != looking) { 298 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 299 retVal = FALSE; 300 } 301 302 status = U_ZERO_ERROR; 303 actualmatch = REMatcher->matches(status); 304 if (U_FAILURE(status)) { 305 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 306 line, u_errorName(status)); 307 retVal = FALSE; 308 } 309 if (actualmatch != match) { 310 errln("RegexTest: wrong return from matches() at line %d.\n", line); 311 retVal = FALSE; 312 } 313 314 if (retVal == FALSE) { 315 RegexPatternDump(REPattern); 316 } 317 318 delete REPattern; 319 delete REMatcher; 320 return retVal; 321 } 322 323 324 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 325 UText pattern = UTEXT_INITIALIZER; 326 int32_t inputUTF8Length; 327 char *textChars = NULL; 328 UText inputText = UTEXT_INITIALIZER; 329 UErrorCode status = U_ZERO_ERROR; 330 UParseError pe; 331 RegexPattern *REPattern = NULL; 332 RegexMatcher *REMatcher = NULL; 333 UBool retVal = TRUE; 334 335 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 336 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 337 if (U_FAILURE(status)) { 338 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 339 line, u_errorName(status)); 340 return FALSE; 341 } 342 343 UnicodeString inputString(text, -1, US_INV); 344 UnicodeString unEscapedInput = inputString.unescape(); 345 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 346 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 347 348 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 349 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 350 // UTF-8 does not allow unpaired surrogates, so this could actually happen 351 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 352 return TRUE; // not a failure of the Regex engine 353 } 354 status = U_ZERO_ERROR; // buffer overflow 355 textChars = new char[inputUTF8Length+1]; 356 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 357 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 358 359 REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 360 if (U_FAILURE(status)) { 361 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 362 line, u_errorName(status)); 363 return FALSE; 364 } 365 366 UBool actualmatch; 367 actualmatch = REMatcher->lookingAt(status); 368 if (U_FAILURE(status)) { 369 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 370 line, u_errorName(status)); 371 retVal = FALSE; 372 } 373 if (actualmatch != looking) { 374 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 375 retVal = FALSE; 376 } 377 378 status = U_ZERO_ERROR; 379 actualmatch = REMatcher->matches(status); 380 if (U_FAILURE(status)) { 381 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 382 line, u_errorName(status)); 383 retVal = FALSE; 384 } 385 if (actualmatch != match) { 386 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 387 retVal = FALSE; 388 } 389 390 if (retVal == FALSE) { 391 RegexPatternDump(REPattern); 392 } 393 394 delete REPattern; 395 delete REMatcher; 396 utext_close(&inputText); 397 utext_close(&pattern); 398 delete[] textChars; 399 return retVal; 400 } 401 402 403 404 //--------------------------------------------------------------------------- 405 // 406 // REGEX_ERR Macro + invocation function to simplify writing tests 407 // regex tests for incorrect patterns 408 // 409 // usage: 410 // REGEX_ERR("pattern", expected error line, column, expected status); 411 // 412 //--------------------------------------------------------------------------- 413 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 414 415 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 416 UErrorCode expectedStatus, int32_t line) { 417 UnicodeString pattern(pat); 418 419 UErrorCode status = U_ZERO_ERROR; 420 UParseError pe; 421 RegexPattern *callerPattern = NULL; 422 423 // 424 // Compile the caller's pattern 425 // 426 UnicodeString patString(pat); 427 callerPattern = RegexPattern::compile(patString, 0, pe, status); 428 if (status != expectedStatus) { 429 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 430 } else { 431 if (status != U_ZERO_ERROR) { 432 if (pe.line != errLine || pe.offset != errCol) { 433 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 434 line, errLine, errCol, pe.line, pe.offset); 435 } 436 } 437 } 438 439 delete callerPattern; 440 441 // 442 // Compile again, using a UTF-8-based UText 443 // 444 UText patternText = UTEXT_INITIALIZER; 445 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 446 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 447 if (status != expectedStatus) { 448 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 449 } else { 450 if (status != U_ZERO_ERROR) { 451 if (pe.line != errLine || pe.offset != errCol) { 452 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 453 line, errLine, errCol, pe.line, pe.offset); 454 } 455 } 456 } 457 458 delete callerPattern; 459 utext_close(&patternText); 460 } 461 462 463 464 //--------------------------------------------------------------------------- 465 // 466 // Basic Check for basic functionality of regex pattern matching. 467 // Avoid the use of REGEX_FIND test macro, which has 468 // substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------
// Each REGEX_TESTLM(pattern, input, lookingAt expected, matches expected) line
// below runs the pattern against the (unescaped) input twice, once through
// doRegexLMTest() and once through doRegexLMTestUTF8().
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): identical to the case on the previous line; possibly a
    // different input was intended here - confirm.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);   // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//    UTextBasic   Check for quirks that are specific to the UText
//                 implementation.
618 // 619 //--------------------------------------------------------------------------- 620 void RegexTest::UTextBasic() { 621 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 622 UErrorCode status = U_ZERO_ERROR; 623 UText pattern = UTEXT_INITIALIZER; 624 utext_openUTF8(&pattern, str_abc, -1, &status); 625 RegexMatcher matcher(&pattern, 0, status); 626 REGEX_CHECK_STATUS; 627 628 UText input = UTEXT_INITIALIZER; 629 utext_openUTF8(&input, str_abc, -1, &status); 630 REGEX_CHECK_STATUS; 631 matcher.reset(&input); 632 REGEX_CHECK_STATUS; 633 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 634 635 matcher.reset(matcher.inputText()); 636 REGEX_CHECK_STATUS; 637 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); 638 639 utext_close(&pattern); 640 utext_close(&input); 641 } 642 643 644 //--------------------------------------------------------------------------- 645 // 646 // API_Match Test that the API for class RegexMatcher 647 // is present and nominally working, but excluding functions 648 // implementing replace operations. 649 // 650 //--------------------------------------------------------------------------- 651 void RegexTest::API_Match() { 652 UParseError pe; 653 UErrorCode status=U_ZERO_ERROR; 654 int32_t flags = 0; 655 656 // 657 // Debug - slide failing test cases early 658 // 659 #if 0 660 { 661 } 662 return; 663 #endif 664 665 // 666 // Simple pattern compilation 667 // 668 { 669 UnicodeString re("abc"); 670 RegexPattern *pat2; 671 pat2 = RegexPattern::compile(re, flags, pe, status); 672 REGEX_CHECK_STATUS; 673 674 UnicodeString inStr1 = "abcdef this is a test"; 675 UnicodeString instr2 = "not abc"; 676 UnicodeString empty = ""; 677 678 679 // 680 // Matcher creation and reset. 
681 // 682 RegexMatcher *m1 = pat2->matcher(inStr1, status); 683 REGEX_CHECK_STATUS; 684 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 685 REGEX_ASSERT(m1->input() == inStr1); 686 m1->reset(instr2); 687 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 688 REGEX_ASSERT(m1->input() == instr2); 689 m1->reset(inStr1); 690 REGEX_ASSERT(m1->input() == inStr1); 691 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 692 m1->reset(empty); 693 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 694 REGEX_ASSERT(m1->input() == empty); 695 REGEX_ASSERT(&m1->pattern() == pat2); 696 697 // 698 // reset(pos, status) 699 // 700 m1->reset(inStr1); 701 m1->reset(4, status); 702 REGEX_CHECK_STATUS; 703 REGEX_ASSERT(m1->input() == inStr1); 704 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 705 706 m1->reset(-1, status); 707 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 708 status = U_ZERO_ERROR; 709 710 m1->reset(0, status); 711 REGEX_CHECK_STATUS; 712 status = U_ZERO_ERROR; 713 714 int32_t len = m1->input().length(); 715 m1->reset(len-1, status); 716 REGEX_CHECK_STATUS; 717 status = U_ZERO_ERROR; 718 719 m1->reset(len, status); 720 REGEX_CHECK_STATUS; 721 status = U_ZERO_ERROR; 722 723 m1->reset(len+1, status); 724 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 725 status = U_ZERO_ERROR; 726 727 // 728 // match(pos, status) 729 // 730 m1->reset(instr2); 731 REGEX_ASSERT(m1->matches(4, status) == TRUE); 732 m1->reset(); 733 REGEX_ASSERT(m1->matches(3, status) == FALSE); 734 m1->reset(); 735 REGEX_ASSERT(m1->matches(5, status) == FALSE); 736 REGEX_ASSERT(m1->matches(4, status) == TRUE); 737 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 738 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 739 740 // Match() at end of string should fail, but should not 741 // be an error. 742 status = U_ZERO_ERROR; 743 len = m1->input().length(); 744 REGEX_ASSERT(m1->matches(len, status) == FALSE); 745 REGEX_CHECK_STATUS; 746 747 // Match beyond end of string should fail with an error. 
748 status = U_ZERO_ERROR; 749 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 750 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 751 752 // Successful match at end of string. 753 { 754 status = U_ZERO_ERROR; 755 RegexMatcher m("A?", 0, status); // will match zero length string. 756 REGEX_CHECK_STATUS; 757 m.reset(inStr1); 758 len = inStr1.length(); 759 REGEX_ASSERT(m.matches(len, status) == TRUE); 760 REGEX_CHECK_STATUS; 761 m.reset(empty); 762 REGEX_ASSERT(m.matches(0, status) == TRUE); 763 REGEX_CHECK_STATUS; 764 } 765 766 767 // 768 // lookingAt(pos, status) 769 // 770 status = U_ZERO_ERROR; 771 m1->reset(instr2); // "not abc" 772 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 773 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 774 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 775 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 776 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 778 status = U_ZERO_ERROR; 779 len = m1->input().length(); 780 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 781 REGEX_CHECK_STATUS; 782 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 783 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 784 785 delete m1; 786 delete pat2; 787 } 788 789 790 // 791 // Capture Group. 
792 // RegexMatcher::start(); 793 // RegexMatcher::end(); 794 // RegexMatcher::groupCount(); 795 // 796 { 797 int32_t flags=0; 798 UParseError pe; 799 UErrorCode status=U_ZERO_ERROR; 800 801 UnicodeString re("01(23(45)67)(.*)"); 802 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 803 REGEX_CHECK_STATUS; 804 UnicodeString data = "0123456789"; 805 806 RegexMatcher *matcher = pat->matcher(data, status); 807 REGEX_CHECK_STATUS; 808 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 809 static const int32_t matchStarts[] = {0, 2, 4, 8}; 810 static const int32_t matchEnds[] = {10, 8, 6, 10}; 811 int32_t i; 812 for (i=0; i<4; i++) { 813 int32_t actualStart = matcher->start(i, status); 814 REGEX_CHECK_STATUS; 815 if (actualStart != matchStarts[i]) { 816 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 817 __LINE__, i, matchStarts[i], actualStart); 818 } 819 int32_t actualEnd = matcher->end(i, status); 820 REGEX_CHECK_STATUS; 821 if (actualEnd != matchEnds[i]) { 822 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 823 __LINE__, i, matchEnds[i], actualEnd); 824 } 825 } 826 827 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 828 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 829 830 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 831 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 832 matcher->reset(); 833 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 834 835 matcher->lookingAt(status); 836 REGEX_ASSERT(matcher->group(status) == "0123456789"); 837 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 838 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 839 REGEX_ASSERT(matcher->group(2, status) == "45" ); 840 REGEX_ASSERT(matcher->group(3, status) == "89" ); 841 REGEX_CHECK_STATUS; 842 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 843 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 844 matcher->reset(); 845 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 846 847 delete matcher; 848 delete pat; 849 850 } 851 852 // 853 // find 854 // 855 { 856 int32_t flags=0; 857 UParseError pe; 858 UErrorCode status=U_ZERO_ERROR; 859 860 UnicodeString re("abc"); 861 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 862 REGEX_CHECK_STATUS; 863 UnicodeString data = ".abc..abc...abc.."; 864 // 012345678901234567 865 866 RegexMatcher *matcher = pat->matcher(data, status); 867 REGEX_CHECK_STATUS; 868 REGEX_ASSERT(matcher->find()); 869 REGEX_ASSERT(matcher->start(status) == 1); 870 REGEX_ASSERT(matcher->find()); 871 REGEX_ASSERT(matcher->start(status) == 6); 872 REGEX_ASSERT(matcher->find()); 873 REGEX_ASSERT(matcher->start(status) == 12); 874 REGEX_ASSERT(matcher->find() == FALSE); 875 REGEX_ASSERT(matcher->find() == FALSE); 876 877 matcher->reset(); 878 REGEX_ASSERT(matcher->find()); 879 REGEX_ASSERT(matcher->start(status) == 1); 880 881 
REGEX_ASSERT(matcher->find(0, status)); 882 REGEX_ASSERT(matcher->start(status) == 1); 883 REGEX_ASSERT(matcher->find(1, status)); 884 REGEX_ASSERT(matcher->start(status) == 1); 885 REGEX_ASSERT(matcher->find(2, status)); 886 REGEX_ASSERT(matcher->start(status) == 6); 887 REGEX_ASSERT(matcher->find(12, status)); 888 REGEX_ASSERT(matcher->start(status) == 12); 889 REGEX_ASSERT(matcher->find(13, status) == FALSE); 890 REGEX_ASSERT(matcher->find(16, status) == FALSE); 891 REGEX_ASSERT(matcher->find(17, status) == FALSE); 892 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 893 894 status = U_ZERO_ERROR; 895 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 896 status = U_ZERO_ERROR; 897 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 898 899 REGEX_ASSERT(matcher->groupCount() == 0); 900 901 delete matcher; 902 delete pat; 903 } 904 905 906 // 907 // find, with \G in pattern (true if at the end of a previous match). 908 // 909 { 910 int32_t flags=0; 911 UParseError pe; 912 UErrorCode status=U_ZERO_ERROR; 913 914 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 915 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 916 REGEX_CHECK_STATUS; 917 UnicodeString data = ".abcabc.abc.."; 918 // 012345678901234567 919 920 RegexMatcher *matcher = pat->matcher(data, status); 921 REGEX_CHECK_STATUS; 922 REGEX_ASSERT(matcher->find()); 923 REGEX_ASSERT(matcher->start(status) == 0); 924 REGEX_ASSERT(matcher->start(1, status) == -1); 925 REGEX_ASSERT(matcher->start(2, status) == 1); 926 927 REGEX_ASSERT(matcher->find()); 928 REGEX_ASSERT(matcher->start(status) == 4); 929 REGEX_ASSERT(matcher->start(1, status) == 4); 930 REGEX_ASSERT(matcher->start(2, status) == -1); 931 REGEX_CHECK_STATUS; 932 933 delete matcher; 934 delete pat; 935 } 936 937 // 938 // find with zero length matches, match position should bump ahead 939 // to prevent loops. 
940 // 941 { 942 int32_t i; 943 UErrorCode status=U_ZERO_ERROR; 944 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 945 // using an always-true look-ahead. 946 REGEX_CHECK_STATUS; 947 UnicodeString s(" "); 948 m.reset(s); 949 for (i=0; ; i++) { 950 if (m.find() == FALSE) { 951 break; 952 } 953 REGEX_ASSERT(m.start(status) == i); 954 REGEX_ASSERT(m.end(status) == i); 955 } 956 REGEX_ASSERT(i==5); 957 958 // Check that the bump goes over surrogate pairs OK 959 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 960 s = s.unescape(); 961 m.reset(s); 962 for (i=0; ; i+=2) { 963 if (m.find() == FALSE) { 964 break; 965 } 966 REGEX_ASSERT(m.start(status) == i); 967 REGEX_ASSERT(m.end(status) == i); 968 } 969 REGEX_ASSERT(i==10); 970 } 971 { 972 // find() loop breaking test. 973 // with pattern of /.?/, should see a series of one char matches, then a single 974 // match of zero length at the end of the input string. 975 int32_t i; 976 UErrorCode status=U_ZERO_ERROR; 977 RegexMatcher m(".?", 0, status); 978 REGEX_CHECK_STATUS; 979 UnicodeString s(" "); 980 m.reset(s); 981 for (i=0; ; i++) { 982 if (m.find() == FALSE) { 983 break; 984 } 985 REGEX_ASSERT(m.start(status) == i); 986 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 987 } 988 REGEX_ASSERT(i==5); 989 } 990 991 992 // 993 // Matchers with no input string behave as if they had an empty input string. 
994 // 995 996 { 997 UErrorCode status = U_ZERO_ERROR; 998 RegexMatcher m(".?", 0, status); 999 REGEX_CHECK_STATUS; 1000 REGEX_ASSERT(m.find()); 1001 REGEX_ASSERT(m.start(status) == 0); 1002 REGEX_ASSERT(m.input() == ""); 1003 } 1004 { 1005 UErrorCode status = U_ZERO_ERROR; 1006 RegexPattern *p = RegexPattern::compile(".", 0, status); 1007 RegexMatcher *m = p->matcher(status); 1008 REGEX_CHECK_STATUS; 1009 1010 REGEX_ASSERT(m->find() == FALSE); 1011 REGEX_ASSERT(m->input() == ""); 1012 delete m; 1013 delete p; 1014 } 1015 1016 // 1017 // Regions 1018 // 1019 { 1020 UErrorCode status = U_ZERO_ERROR; 1021 UnicodeString testString("This is test data"); 1022 RegexMatcher m(".*", testString, 0, status); 1023 REGEX_CHECK_STATUS; 1024 REGEX_ASSERT(m.regionStart() == 0); 1025 REGEX_ASSERT(m.regionEnd() == testString.length()); 1026 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1027 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1028 1029 m.region(2,4, status); 1030 REGEX_CHECK_STATUS; 1031 REGEX_ASSERT(m.matches(status)); 1032 REGEX_ASSERT(m.start(status)==2); 1033 REGEX_ASSERT(m.end(status)==4); 1034 REGEX_CHECK_STATUS; 1035 1036 m.reset(); 1037 REGEX_ASSERT(m.regionStart() == 0); 1038 REGEX_ASSERT(m.regionEnd() == testString.length()); 1039 1040 UnicodeString shorterString("short"); 1041 m.reset(shorterString); 1042 REGEX_ASSERT(m.regionStart() == 0); 1043 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1044 1045 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1046 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1047 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1048 REGEX_ASSERT(&m == &m.reset()); 1049 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1050 1051 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1052 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1053 REGEX_ASSERT(&m == &m.reset()); 1054 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1055 1056 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1057 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 1058 
REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1059 REGEX_ASSERT(&m == &m.reset()); 1060 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1061 1062 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1063 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1064 REGEX_ASSERT(&m == &m.reset()); 1065 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1066 1067 } 1068 1069 // 1070 // hitEnd() and requireEnd() 1071 // 1072 { 1073 UErrorCode status = U_ZERO_ERROR; 1074 UnicodeString testString("aabb"); 1075 RegexMatcher m1(".*", testString, 0, status); 1076 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1077 REGEX_ASSERT(m1.hitEnd() == TRUE); 1078 REGEX_ASSERT(m1.requireEnd() == FALSE); 1079 REGEX_CHECK_STATUS; 1080 1081 status = U_ZERO_ERROR; 1082 RegexMatcher m2("a*", testString, 0, status); 1083 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1084 REGEX_ASSERT(m2.hitEnd() == FALSE); 1085 REGEX_ASSERT(m2.requireEnd() == FALSE); 1086 REGEX_CHECK_STATUS; 1087 1088 status = U_ZERO_ERROR; 1089 RegexMatcher m3(".*$", testString, 0, status); 1090 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1091 REGEX_ASSERT(m3.hitEnd() == TRUE); 1092 REGEX_ASSERT(m3.requireEnd() == TRUE); 1093 REGEX_CHECK_STATUS; 1094 } 1095 1096 1097 // 1098 // Compilation error on reset with UChar * 1099 // These were a hazard that people were stumbling over with runtime errors. 1100 // Changed them to compiler errors by adding private methods that more closely 1101 // matched the incorrect use of the functions. 1102 // 1103 #if 0 1104 { 1105 UErrorCode status = U_ZERO_ERROR; 1106 UChar ucharString[20]; 1107 RegexMatcher m(".", 0, status); 1108 m.reset(ucharString); // should not compile. 1109 1110 RegexPattern *p = RegexPattern::compile(".", 0, status); 1111 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1112 1113 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1114 } 1115 #endif 1116 1117 // 1118 // Time Outs. 
1119 // Note: These tests will need to be changed when the regexp engine is 1120 // able to detect and cut short the exponential time behavior on 1121 // this type of match. 1122 // 1123 { 1124 UErrorCode status = U_ZERO_ERROR; 1125 // Enough 'a's in the string to cause the match to time out. 1126 // (Each on additonal 'a' doubles the time) 1127 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1128 RegexMatcher matcher("(a+)+b", testString, 0, status); 1129 REGEX_CHECK_STATUS; 1130 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1131 matcher.setTimeLimit(100, status); 1132 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1133 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1134 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1135 } 1136 { 1137 UErrorCode status = U_ZERO_ERROR; 1138 // Few enough 'a's to slip in under the time limit. 1139 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1140 RegexMatcher matcher("(a+)+b", testString, 0, status); 1141 REGEX_CHECK_STATUS; 1142 matcher.setTimeLimit(100, status); 1143 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1144 REGEX_CHECK_STATUS; 1145 } 1146 1147 // 1148 // Stack Limits 1149 // 1150 { 1151 UErrorCode status = U_ZERO_ERROR; 1152 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1153 1154 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1155 // of the '+', and makes the stack frames larger. 
1156 RegexMatcher matcher("(A)+A$", testString, 0, status); 1157 1158 // With the default stack, this match should fail to run 1159 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1160 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1161 1162 // With unlimited stack, it should run 1163 status = U_ZERO_ERROR; 1164 matcher.setStackLimit(0, status); 1165 REGEX_CHECK_STATUS; 1166 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1167 REGEX_CHECK_STATUS; 1168 REGEX_ASSERT(matcher.getStackLimit() == 0); 1169 1170 // With a limited stack, it the match should fail 1171 status = U_ZERO_ERROR; 1172 matcher.setStackLimit(10000, status); 1173 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1174 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1175 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1176 } 1177 1178 // A pattern that doesn't save state should work with 1179 // a minimal sized stack 1180 { 1181 UErrorCode status = U_ZERO_ERROR; 1182 UnicodeString testString = "abc"; 1183 RegexMatcher matcher("abc", testString, 0, status); 1184 REGEX_CHECK_STATUS; 1185 matcher.setStackLimit(30, status); 1186 REGEX_CHECK_STATUS; 1187 REGEX_ASSERT(matcher.matches(status) == TRUE); 1188 REGEX_CHECK_STATUS; 1189 REGEX_ASSERT(matcher.getStackLimit() == 30); 1190 1191 // Negative stack sizes should fail 1192 status = U_ZERO_ERROR; 1193 matcher.setStackLimit(1000, status); 1194 REGEX_CHECK_STATUS; 1195 matcher.setStackLimit(-1, status); 1196 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1197 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1198 } 1199 1200 1201 } 1202 1203 1204 1205 1206 1207 1208 //--------------------------------------------------------------------------- 1209 // 1210 // API_Replace API test for class RegexMatcher, testing the 1211 // Replace family of functions. 
1212 // 1213 //--------------------------------------------------------------------------- 1214 void RegexTest::API_Replace() { 1215 // 1216 // Replace 1217 // 1218 int32_t flags=0; 1219 UParseError pe; 1220 UErrorCode status=U_ZERO_ERROR; 1221 1222 UnicodeString re("abc"); 1223 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1224 REGEX_CHECK_STATUS; 1225 UnicodeString data = ".abc..abc...abc.."; 1226 // 012345678901234567 1227 RegexMatcher *matcher = pat->matcher(data, status); 1228 1229 // 1230 // Plain vanilla matches. 1231 // 1232 UnicodeString dest; 1233 dest = matcher->replaceFirst("yz", status); 1234 REGEX_CHECK_STATUS; 1235 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1236 1237 dest = matcher->replaceAll("yz", status); 1238 REGEX_CHECK_STATUS; 1239 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1240 1241 // 1242 // Plain vanilla non-matches. 1243 // 1244 UnicodeString d2 = ".abx..abx...abx.."; 1245 matcher->reset(d2); 1246 dest = matcher->replaceFirst("yz", status); 1247 REGEX_CHECK_STATUS; 1248 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1249 1250 dest = matcher->replaceAll("yz", status); 1251 REGEX_CHECK_STATUS; 1252 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1253 1254 // 1255 // Empty source string 1256 // 1257 UnicodeString d3 = ""; 1258 matcher->reset(d3); 1259 dest = matcher->replaceFirst("yz", status); 1260 REGEX_CHECK_STATUS; 1261 REGEX_ASSERT(dest == ""); 1262 1263 dest = matcher->replaceAll("yz", status); 1264 REGEX_CHECK_STATUS; 1265 REGEX_ASSERT(dest == ""); 1266 1267 // 1268 // Empty substitution string 1269 // 1270 matcher->reset(data); // ".abc..abc...abc.." 
1271 dest = matcher->replaceFirst("", status); 1272 REGEX_CHECK_STATUS; 1273 REGEX_ASSERT(dest == "...abc...abc.."); 1274 1275 dest = matcher->replaceAll("", status); 1276 REGEX_CHECK_STATUS; 1277 REGEX_ASSERT(dest == "........"); 1278 1279 // 1280 // match whole string 1281 // 1282 UnicodeString d4 = "abc"; 1283 matcher->reset(d4); 1284 dest = matcher->replaceFirst("xyz", status); 1285 REGEX_CHECK_STATUS; 1286 REGEX_ASSERT(dest == "xyz"); 1287 1288 dest = matcher->replaceAll("xyz", status); 1289 REGEX_CHECK_STATUS; 1290 REGEX_ASSERT(dest == "xyz"); 1291 1292 // 1293 // Capture Group, simple case 1294 // 1295 UnicodeString re2("a(..)"); 1296 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1297 REGEX_CHECK_STATUS; 1298 UnicodeString d5 = "abcdefg"; 1299 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1300 REGEX_CHECK_STATUS; 1301 dest = matcher2->replaceFirst("$1$1", status); 1302 REGEX_CHECK_STATUS; 1303 REGEX_ASSERT(dest == "bcbcdefg"); 1304 1305 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1306 REGEX_CHECK_STATUS; 1307 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1308 1309 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1310 REGEX_CHECK_STATUS; 1311 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1312 1313 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1314 replacement = replacement.unescape(); 1315 dest = matcher2->replaceFirst(replacement, status); 1316 REGEX_CHECK_STATUS; 1317 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1318 1319 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1320 1321 1322 // 1323 // Replacement String with \u hex escapes 1324 // 1325 { 1326 UnicodeString src = "abc 1 abc 2 abc 3"; 1327 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1328 matcher->reset(src); 1329 UnicodeString result = 
matcher->replaceAll(substitute, status); 1330 REGEX_CHECK_STATUS; 1331 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1332 } 1333 { 1334 UnicodeString src = "abc !"; 1335 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1336 matcher->reset(src); 1337 UnicodeString result = matcher->replaceAll(substitute, status); 1338 REGEX_CHECK_STATUS; 1339 UnicodeString expected = UnicodeString("--"); 1340 expected.append((UChar32)0x10000); 1341 expected.append("-- !"); 1342 REGEX_ASSERT(result == expected); 1343 } 1344 // TODO: need more through testing of capture substitutions. 1345 1346 // Bug 4057 1347 // 1348 { 1349 status = U_ZERO_ERROR; 1350 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1351 RegexMatcher m("ss(.*?)ee", 0, status); 1352 REGEX_CHECK_STATUS; 1353 UnicodeString result; 1354 1355 // Multiple finds do NOT bump up the previous appendReplacement postion. 1356 m.reset(s); 1357 m.find(); 1358 m.find(); 1359 m.appendReplacement(result, "ooh", status); 1360 REGEX_CHECK_STATUS; 1361 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1362 1363 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1364 status = U_ZERO_ERROR; 1365 result.truncate(0); 1366 m.reset(10, status); 1367 m.find(); 1368 m.find(); 1369 m.appendReplacement(result, "ooh", status); 1370 REGEX_CHECK_STATUS; 1371 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1372 1373 // find() at interior of string, appendReplacemnt still starts at beginning. 
1374 status = U_ZERO_ERROR; 1375 result.truncate(0); 1376 m.reset(); 1377 m.find(10, status); 1378 m.find(); 1379 m.appendReplacement(result, "ooh", status); 1380 REGEX_CHECK_STATUS; 1381 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1382 1383 m.appendTail(result); 1384 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1385 1386 } 1387 1388 delete matcher2; 1389 delete pat2; 1390 delete matcher; 1391 delete pat; 1392 } 1393 1394 1395 //--------------------------------------------------------------------------- 1396 // 1397 // API_Pattern Test that the API for class RegexPattern is 1398 // present and nominally working. 1399 // 1400 //--------------------------------------------------------------------------- 1401 void RegexTest::API_Pattern() { 1402 RegexPattern pata; // Test default constructor to not crash. 1403 RegexPattern patb; 1404 1405 REGEX_ASSERT(pata == patb); 1406 REGEX_ASSERT(pata == pata); 1407 1408 UnicodeString re1("abc[a-l][m-z]"); 1409 UnicodeString re2("def"); 1410 UErrorCode status = U_ZERO_ERROR; 1411 UParseError pe; 1412 1413 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1414 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1415 REGEX_CHECK_STATUS; 1416 REGEX_ASSERT(*pat1 == *pat1); 1417 REGEX_ASSERT(*pat1 != pata); 1418 1419 // Assign 1420 patb = *pat1; 1421 REGEX_ASSERT(patb == *pat1); 1422 1423 // Copy Construct 1424 RegexPattern patc(*pat1); 1425 REGEX_ASSERT(patc == *pat1); 1426 REGEX_ASSERT(patb == patc); 1427 REGEX_ASSERT(pat1 != pat2); 1428 patb = *pat2; 1429 REGEX_ASSERT(patb != patc); 1430 REGEX_ASSERT(patb == *pat2); 1431 1432 // Compile with no flags. 
1433 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1434 REGEX_ASSERT(*pat1a == *pat1); 1435 1436 REGEX_ASSERT(pat1a->flags() == 0); 1437 1438 // Compile with different flags should be not equal 1439 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1440 REGEX_CHECK_STATUS; 1441 1442 REGEX_ASSERT(*pat1b != *pat1a); 1443 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1444 REGEX_ASSERT(pat1a->flags() == 0); 1445 delete pat1b; 1446 1447 // clone 1448 RegexPattern *pat1c = pat1->clone(); 1449 REGEX_ASSERT(*pat1c == *pat1); 1450 REGEX_ASSERT(*pat1c != *pat2); 1451 1452 delete pat1c; 1453 delete pat1a; 1454 delete pat1; 1455 delete pat2; 1456 1457 1458 // 1459 // Verify that a matcher created from a cloned pattern works. 1460 // (Jitterbug 3423) 1461 // 1462 { 1463 UErrorCode status = U_ZERO_ERROR; 1464 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1465 RegexPattern *pClone = pSource->clone(); 1466 delete pSource; 1467 RegexMatcher *mFromClone = pClone->matcher(status); 1468 REGEX_CHECK_STATUS; 1469 UnicodeString s = "Hello World"; 1470 mFromClone->reset(s); 1471 REGEX_ASSERT(mFromClone->find() == TRUE); 1472 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1473 REGEX_ASSERT(mFromClone->find() == TRUE); 1474 REGEX_ASSERT(mFromClone->group(status) == "World"); 1475 REGEX_ASSERT(mFromClone->find() == FALSE); 1476 delete mFromClone; 1477 delete pClone; 1478 } 1479 1480 // 1481 // matches convenience API 1482 // 1483 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1484 REGEX_CHECK_STATUS; 1485 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1486 REGEX_CHECK_STATUS; 1487 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1488 REGEX_CHECK_STATUS; 1489 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1490 REGEX_CHECK_STATUS; 1491 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1492 REGEX_CHECK_STATUS; 1493 status = U_INDEX_OUTOFBOUNDS_ERROR; 1494 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1495 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1496 1497 1498 // 1499 // Split() 1500 // 1501 status = U_ZERO_ERROR; 1502 pat1 = RegexPattern::compile(" +", pe, status); 1503 REGEX_CHECK_STATUS; 1504 UnicodeString fields[10]; 1505 1506 int32_t n; 1507 n = pat1->split("Now is the time", fields, 10, status); 1508 REGEX_CHECK_STATUS; 1509 REGEX_ASSERT(n==4); 1510 REGEX_ASSERT(fields[0]=="Now"); 1511 REGEX_ASSERT(fields[1]=="is"); 1512 REGEX_ASSERT(fields[2]=="the"); 1513 REGEX_ASSERT(fields[3]=="time"); 1514 REGEX_ASSERT(fields[4]==""); 1515 1516 n = pat1->split("Now is the time", fields, 2, status); 1517 REGEX_CHECK_STATUS; 1518 REGEX_ASSERT(n==2); 1519 REGEX_ASSERT(fields[0]=="Now"); 1520 REGEX_ASSERT(fields[1]=="is the time"); 1521 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1522 1523 fields[1] = "*"; 1524 status = U_ZERO_ERROR; 1525 n = pat1->split("Now is the time", fields, 1, status); 1526 REGEX_CHECK_STATUS; 1527 REGEX_ASSERT(n==1); 1528 REGEX_ASSERT(fields[0]=="Now is the time"); 1529 REGEX_ASSERT(fields[1]=="*"); 1530 status = U_ZERO_ERROR; 1531 1532 n = pat1->split(" Now is the time ", fields, 10, status); 1533 REGEX_CHECK_STATUS; 1534 REGEX_ASSERT(n==5); 1535 REGEX_ASSERT(fields[0]==""); 1536 REGEX_ASSERT(fields[1]=="Now"); 1537 REGEX_ASSERT(fields[2]=="is"); 1538 REGEX_ASSERT(fields[3]=="the"); 1539 REGEX_ASSERT(fields[4]=="time"); 1540 REGEX_ASSERT(fields[5]==""); 1541 1542 n = pat1->split(" ", fields, 10, status); 1543 REGEX_CHECK_STATUS; 1544 REGEX_ASSERT(n==1); 1545 REGEX_ASSERT(fields[0]==""); 1546 1547 fields[0] = "foo"; 1548 n = pat1->split("", fields, 10, status); 1549 REGEX_CHECK_STATUS; 1550 REGEX_ASSERT(n==0); 1551 REGEX_ASSERT(fields[0]=="foo"); 1552 1553 delete pat1; 1554 1555 // split, with a 
pattern with (capture) 1556 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1557 REGEX_CHECK_STATUS; 1558 1559 status = U_ZERO_ERROR; 1560 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1561 REGEX_CHECK_STATUS; 1562 REGEX_ASSERT(n==6); 1563 REGEX_ASSERT(fields[0]==""); 1564 REGEX_ASSERT(fields[1]=="a"); 1565 REGEX_ASSERT(fields[2]=="Now is "); 1566 REGEX_ASSERT(fields[3]=="b"); 1567 REGEX_ASSERT(fields[4]=="the time"); 1568 REGEX_ASSERT(fields[5]=="c"); 1569 REGEX_ASSERT(fields[6]==""); 1570 REGEX_ASSERT(status==U_ZERO_ERROR); 1571 1572 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1573 REGEX_CHECK_STATUS; 1574 REGEX_ASSERT(n==6); 1575 REGEX_ASSERT(fields[0]==" "); 1576 REGEX_ASSERT(fields[1]=="a"); 1577 REGEX_ASSERT(fields[2]=="Now is "); 1578 REGEX_ASSERT(fields[3]=="b"); 1579 REGEX_ASSERT(fields[4]=="the time"); 1580 REGEX_ASSERT(fields[5]=="c"); 1581 REGEX_ASSERT(fields[6]==""); 1582 1583 status = U_ZERO_ERROR; 1584 fields[6] = "foo"; 1585 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1586 REGEX_CHECK_STATUS; 1587 REGEX_ASSERT(n==6); 1588 REGEX_ASSERT(fields[0]==" "); 1589 REGEX_ASSERT(fields[1]=="a"); 1590 REGEX_ASSERT(fields[2]=="Now is "); 1591 REGEX_ASSERT(fields[3]=="b"); 1592 REGEX_ASSERT(fields[4]=="the time"); 1593 REGEX_ASSERT(fields[5]=="c"); 1594 REGEX_ASSERT(fields[6]=="foo"); 1595 1596 status = U_ZERO_ERROR; 1597 fields[5] = "foo"; 1598 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1599 REGEX_CHECK_STATUS; 1600 REGEX_ASSERT(n==5); 1601 REGEX_ASSERT(fields[0]==" "); 1602 REGEX_ASSERT(fields[1]=="a"); 1603 REGEX_ASSERT(fields[2]=="Now is "); 1604 REGEX_ASSERT(fields[3]=="b"); 1605 REGEX_ASSERT(fields[4]=="the time<c>"); 1606 REGEX_ASSERT(fields[5]=="foo"); 1607 1608 status = U_ZERO_ERROR; 1609 fields[5] = "foo"; 1610 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1611 REGEX_CHECK_STATUS; 1612 REGEX_ASSERT(n==5); 1613 
REGEX_ASSERT(fields[0]==" "); 1614 REGEX_ASSERT(fields[1]=="a"); 1615 REGEX_ASSERT(fields[2]=="Now is "); 1616 REGEX_ASSERT(fields[3]=="b"); 1617 REGEX_ASSERT(fields[4]=="the time"); 1618 REGEX_ASSERT(fields[5]=="foo"); 1619 1620 status = U_ZERO_ERROR; 1621 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1622 REGEX_CHECK_STATUS; 1623 REGEX_ASSERT(n==4); 1624 REGEX_ASSERT(fields[0]==" "); 1625 REGEX_ASSERT(fields[1]=="a"); 1626 REGEX_ASSERT(fields[2]=="Now is "); 1627 REGEX_ASSERT(fields[3]=="the time<c>"); 1628 status = U_ZERO_ERROR; 1629 delete pat1; 1630 1631 pat1 = RegexPattern::compile("([-,])", pe, status); 1632 REGEX_CHECK_STATUS; 1633 n = pat1->split("1-10,20", fields, 10, status); 1634 REGEX_CHECK_STATUS; 1635 REGEX_ASSERT(n==5); 1636 REGEX_ASSERT(fields[0]=="1"); 1637 REGEX_ASSERT(fields[1]=="-"); 1638 REGEX_ASSERT(fields[2]=="10"); 1639 REGEX_ASSERT(fields[3]==","); 1640 REGEX_ASSERT(fields[4]=="20"); 1641 delete pat1; 1642 1643 1644 // 1645 // RegexPattern::pattern() 1646 // 1647 pat1 = new RegexPattern(); 1648 REGEX_ASSERT(pat1->pattern() == ""); 1649 delete pat1; 1650 1651 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1652 REGEX_CHECK_STATUS; 1653 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1654 delete pat1; 1655 1656 1657 // 1658 // classID functions 1659 // 1660 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1661 REGEX_CHECK_STATUS; 1662 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1663 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1664 UnicodeString Hello("Hello, world."); 1665 RegexMatcher *m = pat1->matcher(Hello, status); 1666 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1667 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1668 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1669 delete m; 1670 delete pat1; 1671 1672 } 1673 1674 //--------------------------------------------------------------------------- 1675 // 
1676 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1677 // is present and working, but excluding functions 1678 // implementing replace operations. 1679 // 1680 //--------------------------------------------------------------------------- 1681 void RegexTest::API_Match_UTF8() { 1682 UParseError pe; 1683 UErrorCode status=U_ZERO_ERROR; 1684 int32_t flags = 0; 1685 1686 // 1687 // Debug - slide failing test cases early 1688 // 1689 #if 0 1690 { 1691 } 1692 return; 1693 #endif 1694 1695 // 1696 // Simple pattern compilation 1697 // 1698 { 1699 UText re = UTEXT_INITIALIZER; 1700 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1701 RegexPattern *pat2; 1702 pat2 = RegexPattern::compile(&re, flags, pe, status); 1703 REGEX_CHECK_STATUS; 1704 1705 UText input1 = UTEXT_INITIALIZER; 1706 UText input2 = UTEXT_INITIALIZER; 1707 UText empty = UTEXT_INITIALIZER; 1708 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1709 REGEX_VERBOSE_TEXT(&input1); 1710 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1711 REGEX_VERBOSE_TEXT(&input2); 1712 utext_openUChars(&empty, NULL, 0, &status); 1713 1714 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1715 int32_t input2Len = strlen("not abc"); 1716 1717 1718 // 1719 // Matcher creation and reset. 
1720 // 1721 RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status); 1722 REGEX_CHECK_STATUS; 1723 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1724 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1725 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1726 m1->reset(&input2); 1727 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1728 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1729 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1730 m1->reset(&input1); 1731 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1732 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1733 m1->reset(&empty); 1734 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1735 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1736 1737 // 1738 // reset(pos, status) 1739 // 1740 m1->reset(&input1); 1741 m1->reset(4, status); 1742 REGEX_CHECK_STATUS; 1743 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1744 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1745 1746 m1->reset(-1, status); 1747 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1748 status = U_ZERO_ERROR; 1749 1750 m1->reset(0, status); 1751 REGEX_CHECK_STATUS; 1752 status = U_ZERO_ERROR; 1753 1754 m1->reset(input1Len-1, status); 1755 REGEX_CHECK_STATUS; 1756 status = U_ZERO_ERROR; 1757 1758 m1->reset(input1Len, status); 1759 REGEX_CHECK_STATUS; 1760 status = U_ZERO_ERROR; 1761 1762 m1->reset(input1Len+1, status); 1763 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1764 status = U_ZERO_ERROR; 1765 1766 // 1767 // match(pos, status) 1768 // 1769 m1->reset(&input2); 1770 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1771 m1->reset(); 1772 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1773 m1->reset(); 1774 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1775 
REGEX_ASSERT(m1->matches(4, status) == TRUE); 1776 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1778 1779 // Match() at end of string should fail, but should not 1780 // be an error. 1781 status = U_ZERO_ERROR; 1782 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1783 REGEX_CHECK_STATUS; 1784 1785 // Match beyond end of string should fail with an error. 1786 status = U_ZERO_ERROR; 1787 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1788 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1789 1790 // Successful match at end of string. 1791 { 1792 status = U_ZERO_ERROR; 1793 RegexMatcher m("A?", 0, status); // will match zero length string. 1794 REGEX_CHECK_STATUS; 1795 m.reset(&input1); 1796 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1797 REGEX_CHECK_STATUS; 1798 m.reset(&empty); 1799 REGEX_ASSERT(m.matches(0, status) == TRUE); 1800 REGEX_CHECK_STATUS; 1801 } 1802 1803 1804 // 1805 // lookingAt(pos, status) 1806 // 1807 status = U_ZERO_ERROR; 1808 m1->reset(&input2); // "not abc" 1809 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1810 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1811 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1812 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1813 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1815 status = U_ZERO_ERROR; 1816 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1817 REGEX_CHECK_STATUS; 1818 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1819 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1820 1821 delete m1; 1822 delete pat2; 1823 1824 utext_close(&re); 1825 utext_close(&input1); 1826 utext_close(&input2); 1827 utext_close(&empty); 1828 } 1829 1830 1831 // 1832 // Capture Group. 
1833 // RegexMatcher::start(); 1834 // RegexMatcher::end(); 1835 // RegexMatcher::groupCount(); 1836 // 1837 { 1838 int32_t flags=0; 1839 UParseError pe; 1840 UErrorCode status=U_ZERO_ERROR; 1841 UText re=UTEXT_INITIALIZER; 1842 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 1843 utext_openUTF8(&re, str_01234567_pat, -1, &status); 1844 1845 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1846 REGEX_CHECK_STATUS; 1847 1848 UText input = UTEXT_INITIALIZER; 1849 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1850 utext_openUTF8(&input, str_0123456789, -1, &status); 1851 1852 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 1853 REGEX_CHECK_STATUS; 1854 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 1855 static const int32_t matchStarts[] = {0, 2, 4, 8}; 1856 static const int32_t matchEnds[] = {10, 8, 6, 10}; 1857 int32_t i; 1858 for (i=0; i<4; i++) { 1859 int32_t actualStart = matcher->start(i, status); 1860 REGEX_CHECK_STATUS; 1861 if (actualStart != matchStarts[i]) { 1862 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 1863 __FILE__, __LINE__, i, matchStarts[i], actualStart); 1864 } 1865 int32_t actualEnd = matcher->end(i, status); 1866 REGEX_CHECK_STATUS; 1867 if (actualEnd != matchEnds[i]) { 1868 errln("RegexTest failure at %s:%d index %d. 
Expected %d, got %d\n", 1869 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 1870 } 1871 } 1872 1873 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 1874 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 1875 1876 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1877 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 1878 matcher->reset(); 1879 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 1880 1881 matcher->lookingAt(status); 1882 1883 UnicodeString dest; 1884 UText destText = UTEXT_INITIALIZER; 1885 utext_openUnicodeString(&destText, &dest, &status); 1886 UText *result; 1887 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1888 // Test shallow-clone API 1889 int64_t group_len; 1890 result = matcher->group((UText *)NULL, group_len, status); 1891 REGEX_CHECK_STATUS; 1892 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 1893 utext_close(result); 1894 result = matcher->group(0, &destText, group_len, status); 1895 REGEX_CHECK_STATUS; 1896 REGEX_ASSERT(result == &destText); 1897 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 1898 // destText is now immutable, reopen it 1899 utext_close(&destText); 1900 utext_openUnicodeString(&destText, &dest, &status); 1901 1902 result = matcher->group(0, NULL, status); 1903 REGEX_CHECK_STATUS; 1904 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 1905 utext_close(result); 1906 result = matcher->group(0, &destText, status); 1907 REGEX_CHECK_STATUS; 1908 REGEX_ASSERT(result == &destText); 1909 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 1910 1911 result = matcher->group(1, NULL, status); 1912 REGEX_CHECK_STATUS; 1913 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ 1914 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 1915 utext_close(result); 1916 result = matcher->group(1, &destText, status); 1917 REGEX_CHECK_STATUS; 1918 
REGEX_ASSERT(result == &destText); 1919 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 1920 1921 result = matcher->group(2, NULL, status); 1922 REGEX_CHECK_STATUS; 1923 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ 1924 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 1925 utext_close(result); 1926 result = matcher->group(2, &destText, status); 1927 REGEX_CHECK_STATUS; 1928 REGEX_ASSERT(result == &destText); 1929 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 1930 1931 result = matcher->group(3, NULL, status); 1932 REGEX_CHECK_STATUS; 1933 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ 1934 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 1935 utext_close(result); 1936 result = matcher->group(3, &destText, status); 1937 REGEX_CHECK_STATUS; 1938 REGEX_ASSERT(result == &destText); 1939 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 1940 1941 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1942 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 1943 matcher->reset(); 1944 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 1945 1946 delete matcher; 1947 delete pat; 1948 1949 utext_close(&destText); 1950 utext_close(&input); 1951 utext_close(&re); 1952 } 1953 1954 // 1955 // find 1956 // 1957 { 1958 int32_t flags=0; 1959 UParseError pe; 1960 UErrorCode status=U_ZERO_ERROR; 1961 UText re=UTEXT_INITIALIZER; 1962 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 1963 utext_openUTF8(&re, str_abc, -1, &status); 1964 1965 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1966 REGEX_CHECK_STATUS; 1967 UText input = UTEXT_INITIALIZER; 1968 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. 
*/ 1969 utext_openUTF8(&input, str_abcabcabc, -1, &status); 1970 // 012345678901234567 1971 1972 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 1973 REGEX_CHECK_STATUS; 1974 REGEX_ASSERT(matcher->find()); 1975 REGEX_ASSERT(matcher->start(status) == 1); 1976 REGEX_ASSERT(matcher->find()); 1977 REGEX_ASSERT(matcher->start(status) == 6); 1978 REGEX_ASSERT(matcher->find()); 1979 REGEX_ASSERT(matcher->start(status) == 12); 1980 REGEX_ASSERT(matcher->find() == FALSE); 1981 REGEX_ASSERT(matcher->find() == FALSE); 1982 1983 matcher->reset(); 1984 REGEX_ASSERT(matcher->find()); 1985 REGEX_ASSERT(matcher->start(status) == 1); 1986 1987 REGEX_ASSERT(matcher->find(0, status)); 1988 REGEX_ASSERT(matcher->start(status) == 1); 1989 REGEX_ASSERT(matcher->find(1, status)); 1990 REGEX_ASSERT(matcher->start(status) == 1); 1991 REGEX_ASSERT(matcher->find(2, status)); 1992 REGEX_ASSERT(matcher->start(status) == 6); 1993 REGEX_ASSERT(matcher->find(12, status)); 1994 REGEX_ASSERT(matcher->start(status) == 12); 1995 REGEX_ASSERT(matcher->find(13, status) == FALSE); 1996 REGEX_ASSERT(matcher->find(16, status) == FALSE); 1997 REGEX_ASSERT(matcher->find(17, status) == FALSE); 1998 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1999 2000 status = U_ZERO_ERROR; 2001 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2002 status = U_ZERO_ERROR; 2003 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2004 2005 REGEX_ASSERT(matcher->groupCount() == 0); 2006 2007 delete matcher; 2008 delete pat; 2009 2010 utext_close(&input); 2011 utext_close(&re); 2012 } 2013 2014 2015 // 2016 // find, with \G in pattern (true if at the end of a previous match). 
2017 // 2018 { 2019 int32_t flags=0; 2020 UParseError pe; 2021 UErrorCode status=U_ZERO_ERROR; 2022 UText re=UTEXT_INITIALIZER; 2023 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2024 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2025 2026 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2027 2028 REGEX_CHECK_STATUS; 2029 UText input = UTEXT_INITIALIZER; 2030 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2031 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2032 // 012345678901234567 2033 2034 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status); 2035 REGEX_CHECK_STATUS; 2036 REGEX_ASSERT(matcher->find()); 2037 REGEX_ASSERT(matcher->start(status) == 0); 2038 REGEX_ASSERT(matcher->start(1, status) == -1); 2039 REGEX_ASSERT(matcher->start(2, status) == 1); 2040 2041 REGEX_ASSERT(matcher->find()); 2042 REGEX_ASSERT(matcher->start(status) == 4); 2043 REGEX_ASSERT(matcher->start(1, status) == 4); 2044 REGEX_ASSERT(matcher->start(2, status) == -1); 2045 REGEX_CHECK_STATUS; 2046 2047 delete matcher; 2048 delete pat; 2049 2050 utext_close(&input); 2051 utext_close(&re); 2052 } 2053 2054 // 2055 // find with zero length matches, match position should bump ahead 2056 // to prevent loops. 2057 // 2058 { 2059 int32_t i; 2060 UErrorCode status=U_ZERO_ERROR; 2061 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2062 // using an always-true look-ahead. 
2063 REGEX_CHECK_STATUS; 2064 UText s = UTEXT_INITIALIZER; 2065 utext_openUTF8(&s, " ", -1, &status); 2066 m.reset(&s); 2067 for (i=0; ; i++) { 2068 if (m.find() == FALSE) { 2069 break; 2070 } 2071 REGEX_ASSERT(m.start(status) == i); 2072 REGEX_ASSERT(m.end(status) == i); 2073 } 2074 REGEX_ASSERT(i==5); 2075 2076 // Check that the bump goes over characters outside the BMP OK 2077 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2078 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2079 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2080 m.reset(&s); 2081 for (i=0; ; i+=4) { 2082 if (m.find() == FALSE) { 2083 break; 2084 } 2085 REGEX_ASSERT(m.start(status) == i); 2086 REGEX_ASSERT(m.end(status) == i); 2087 } 2088 REGEX_ASSERT(i==20); 2089 2090 utext_close(&s); 2091 } 2092 { 2093 // find() loop breaking test. 2094 // with pattern of /.?/, should see a series of one char matches, then a single 2095 // match of zero length at the end of the input string. 2096 int32_t i; 2097 UErrorCode status=U_ZERO_ERROR; 2098 RegexMatcher m(".?", 0, status); 2099 REGEX_CHECK_STATUS; 2100 UText s = UTEXT_INITIALIZER; 2101 utext_openUTF8(&s, " ", -1, &status); 2102 m.reset(&s); 2103 for (i=0; ; i++) { 2104 if (m.find() == FALSE) { 2105 break; 2106 } 2107 REGEX_ASSERT(m.start(status) == i); 2108 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2109 } 2110 REGEX_ASSERT(i==5); 2111 2112 utext_close(&s); 2113 } 2114 2115 2116 // 2117 // Matchers with no input string behave as if they had an empty input string. 
2118 // 2119 2120 { 2121 UErrorCode status = U_ZERO_ERROR; 2122 RegexMatcher m(".?", 0, status); 2123 REGEX_CHECK_STATUS; 2124 REGEX_ASSERT(m.find()); 2125 REGEX_ASSERT(m.start(status) == 0); 2126 REGEX_ASSERT(m.input() == ""); 2127 } 2128 { 2129 UErrorCode status = U_ZERO_ERROR; 2130 RegexPattern *p = RegexPattern::compile(".", 0, status); 2131 RegexMatcher *m = p->matcher(status); 2132 REGEX_CHECK_STATUS; 2133 2134 REGEX_ASSERT(m->find() == FALSE); 2135 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2136 delete m; 2137 delete p; 2138 } 2139 2140 // 2141 // Regions 2142 // 2143 { 2144 UErrorCode status = U_ZERO_ERROR; 2145 UText testPattern = UTEXT_INITIALIZER; 2146 UText testText = UTEXT_INITIALIZER; 2147 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2148 REGEX_VERBOSE_TEXT(&testPattern); 2149 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2150 REGEX_VERBOSE_TEXT(&testText); 2151 2152 RegexMatcher m(&testPattern, &testText, 0, status); 2153 REGEX_CHECK_STATUS; 2154 REGEX_ASSERT(m.regionStart() == 0); 2155 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2156 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2157 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2158 2159 m.region(2,4, status); 2160 REGEX_CHECK_STATUS; 2161 REGEX_ASSERT(m.matches(status)); 2162 REGEX_ASSERT(m.start(status)==2); 2163 REGEX_ASSERT(m.end(status)==4); 2164 REGEX_CHECK_STATUS; 2165 2166 m.reset(); 2167 REGEX_ASSERT(m.regionStart() == 0); 2168 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2169 2170 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2171 REGEX_VERBOSE_TEXT(&testText); 2172 m.reset(&testText); 2173 REGEX_ASSERT(m.regionStart() == 0); 2174 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2175 2176 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2177 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2178 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2179 
REGEX_ASSERT(&m == &m.reset()); 2180 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2181 2182 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2183 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2184 REGEX_ASSERT(&m == &m.reset()); 2185 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2186 2187 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2188 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2189 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2190 REGEX_ASSERT(&m == &m.reset()); 2191 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2192 2193 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2194 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2195 REGEX_ASSERT(&m == &m.reset()); 2196 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2197 2198 utext_close(&testText); 2199 utext_close(&testPattern); 2200 } 2201 2202 // 2203 // hitEnd() and requireEnd() 2204 // 2205 { 2206 UErrorCode status = U_ZERO_ERROR; 2207 UText testPattern = UTEXT_INITIALIZER; 2208 UText testText = UTEXT_INITIALIZER; 2209 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2210 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2211 utext_openUTF8(&testPattern, str_, -1, &status); 2212 utext_openUTF8(&testText, str_aabb, -1, &status); 2213 2214 RegexMatcher m1(&testPattern, &testText, 0, status); 2215 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2216 REGEX_ASSERT(m1.hitEnd() == TRUE); 2217 REGEX_ASSERT(m1.requireEnd() == FALSE); 2218 REGEX_CHECK_STATUS; 2219 2220 status = U_ZERO_ERROR; 2221 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2222 utext_openUTF8(&testPattern, str_a, -1, &status); 2223 RegexMatcher m2(&testPattern, &testText, 0, status); 2224 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2225 REGEX_ASSERT(m2.hitEnd() == FALSE); 2226 REGEX_ASSERT(m2.requireEnd() == FALSE); 2227 REGEX_CHECK_STATUS; 2228 2229 status = U_ZERO_ERROR; 2230 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2231 utext_openUTF8(&testPattern, str_dotstardollar, -1, 
&status); 2232 RegexMatcher m3(&testPattern, &testText, 0, status); 2233 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2234 REGEX_ASSERT(m3.hitEnd() == TRUE); 2235 REGEX_ASSERT(m3.requireEnd() == TRUE); 2236 REGEX_CHECK_STATUS; 2237 2238 utext_close(&testText); 2239 utext_close(&testPattern); 2240 } 2241 } 2242 2243 2244 //--------------------------------------------------------------------------- 2245 // 2246 // API_Replace_UTF8 API test for class RegexMatcher, testing the 2247 // Replace family of functions. 2248 // 2249 //--------------------------------------------------------------------------- 2250 void RegexTest::API_Replace_UTF8() { 2251 // 2252 // Replace 2253 // 2254 int32_t flags=0; 2255 UParseError pe; 2256 UErrorCode status=U_ZERO_ERROR; 2257 2258 UText re=UTEXT_INITIALIZER; 2259 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 2260 REGEX_VERBOSE_TEXT(&re); 2261 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2262 REGEX_CHECK_STATUS; 2263 2264 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2265 // 012345678901234567 2266 UText dataText = UTEXT_INITIALIZER; 2267 utext_openUTF8(&dataText, data, -1, &status); 2268 REGEX_CHECK_STATUS; 2269 REGEX_VERBOSE_TEXT(&dataText); 2270 RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status); 2271 2272 // 2273 // Plain vanilla matches. 
2274 // 2275 UnicodeString dest; 2276 UText destText = UTEXT_INITIALIZER; 2277 utext_openUnicodeString(&destText, &dest, &status); 2278 UText *result; 2279 2280 UText replText = UTEXT_INITIALIZER; 2281 2282 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ 2283 utext_openUTF8(&replText, str_yz, -1, &status); 2284 REGEX_VERBOSE_TEXT(&replText); 2285 result = matcher->replaceFirst(&replText, NULL, status); 2286 REGEX_CHECK_STATUS; 2287 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ 2288 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2289 utext_close(result); 2290 result = matcher->replaceFirst(&replText, &destText, status); 2291 REGEX_CHECK_STATUS; 2292 REGEX_ASSERT(result == &destText); 2293 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2294 2295 result = matcher->replaceAll(&replText, NULL, status); 2296 REGEX_CHECK_STATUS; 2297 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ 2298 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2299 utext_close(result); 2300 2301 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2302 result = matcher->replaceAll(&replText, &destText, status); 2303 REGEX_CHECK_STATUS; 2304 REGEX_ASSERT(result == &destText); 2305 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2306 2307 // 2308 // Plain vanilla non-matches. 2309 // 2310 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. 
*/ 2311 utext_openUTF8(&dataText, str_abxabxabx, -1, &status); 2312 matcher->reset(&dataText); 2313 2314 result = matcher->replaceFirst(&replText, NULL, status); 2315 REGEX_CHECK_STATUS; 2316 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2317 utext_close(result); 2318 result = matcher->replaceFirst(&replText, &destText, status); 2319 REGEX_CHECK_STATUS; 2320 REGEX_ASSERT(result == &destText); 2321 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2322 2323 result = matcher->replaceAll(&replText, NULL, status); 2324 REGEX_CHECK_STATUS; 2325 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2326 utext_close(result); 2327 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2328 result = matcher->replaceAll(&replText, &destText, status); 2329 REGEX_CHECK_STATUS; 2330 REGEX_ASSERT(result == &destText); 2331 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2332 2333 // 2334 // Empty source string 2335 // 2336 utext_openUTF8(&dataText, NULL, 0, &status); 2337 matcher->reset(&dataText); 2338 2339 result = matcher->replaceFirst(&replText, NULL, status); 2340 REGEX_CHECK_STATUS; 2341 REGEX_ASSERT_UTEXT_UTF8("", result); 2342 utext_close(result); 2343 result = matcher->replaceFirst(&replText, &destText, status); 2344 REGEX_CHECK_STATUS; 2345 REGEX_ASSERT(result == &destText); 2346 REGEX_ASSERT_UTEXT_UTF8("", result); 2347 2348 result = matcher->replaceAll(&replText, NULL, status); 2349 REGEX_CHECK_STATUS; 2350 REGEX_ASSERT_UTEXT_UTF8("", result); 2351 utext_close(result); 2352 result = matcher->replaceAll(&replText, &destText, status); 2353 REGEX_CHECK_STATUS; 2354 REGEX_ASSERT(result == &destText); 2355 REGEX_ASSERT_UTEXT_UTF8("", result); 2356 2357 // 2358 // Empty substitution string 2359 // 2360 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 
2361 matcher->reset(&dataText); 2362 2363 utext_openUTF8(&replText, NULL, 0, &status); 2364 result = matcher->replaceFirst(&replText, NULL, status); 2365 REGEX_CHECK_STATUS; 2366 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ 2367 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2368 utext_close(result); 2369 result = matcher->replaceFirst(&replText, &destText, status); 2370 REGEX_CHECK_STATUS; 2371 REGEX_ASSERT(result == &destText); 2372 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2373 2374 result = matcher->replaceAll(&replText, NULL, status); 2375 REGEX_CHECK_STATUS; 2376 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ 2377 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2378 utext_close(result); 2379 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2380 result = matcher->replaceAll(&replText, &destText, status); 2381 REGEX_CHECK_STATUS; 2382 REGEX_ASSERT(result == &destText); 2383 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2384 2385 // 2386 // match whole string 2387 // 2388 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2389 utext_openUTF8(&dataText, str_abc, -1, &status); 2390 matcher->reset(&dataText); 2391 2392 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ 2393 utext_openUTF8(&replText, str_xyz, -1, &status); 2394 result = matcher->replaceFirst(&replText, NULL, status); 2395 REGEX_CHECK_STATUS; 2396 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2397 utext_close(result); 2398 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2399 result = matcher->replaceFirst(&replText, &destText, status); 2400 REGEX_CHECK_STATUS; 2401 REGEX_ASSERT(result == &destText); 2402 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2403 2404 result = matcher->replaceAll(&replText, NULL, status); 2405 REGEX_CHECK_STATUS; 2406 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2407 
utext_close(result); 2408 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2409 result = matcher->replaceAll(&replText, &destText, status); 2410 REGEX_CHECK_STATUS; 2411 REGEX_ASSERT(result == &destText); 2412 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2413 2414 // 2415 // Capture Group, simple case 2416 // 2417 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ 2418 utext_openUTF8(&re, str_add, -1, &status); 2419 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); 2420 REGEX_CHECK_STATUS; 2421 2422 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ 2423 utext_openUTF8(&dataText, str_abcdefg, -1, &status); 2424 RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status); 2425 REGEX_CHECK_STATUS; 2426 2427 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ 2428 utext_openUTF8(&replText, str_11, -1, &status); 2429 result = matcher2->replaceFirst(&replText, NULL, status); 2430 REGEX_CHECK_STATUS; 2431 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ 2432 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2433 utext_close(result); 2434 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2435 result = matcher2->replaceFirst(&replText, &destText, status); 2436 REGEX_CHECK_STATUS; 2437 REGEX_ASSERT(result == &destText); 2438 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2439 2440 regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &status); 2441 result = matcher2->replaceFirst(&replText, NULL, status); 2442 REGEX_CHECK_STATUS; 2443 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ 2444 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, 
result); 2445 utext_close(result); 2446 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2447 result = matcher2->replaceFirst(&replText, &destText, status); 2448 REGEX_CHECK_STATUS; 2449 REGEX_ASSERT(result == &destText); 2450 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2451 2452 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */ 2453 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2454 result = matcher2->replaceFirst(&replText, NULL, status); 2455 REGEX_CHECK_STATUS; 2456 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2457 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2458 utext_close(result); 2459 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2460 result = matcher2->replaceFirst(&replText, &destText, status); 2461 REGEX_CHECK_STATUS; 2462 REGEX_ASSERT(result == &destText); 2463 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2464 2465 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. 
*/ 2466 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2467 // 012345678901234567890123456 2468 supplDigitChars[22] = 0xF0; 2469 supplDigitChars[23] = 0x9D; 2470 supplDigitChars[24] = 0x9F; 2471 supplDigitChars[25] = 0x8F; 2472 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2473 2474 result = matcher2->replaceFirst(&replText, NULL, status); 2475 REGEX_CHECK_STATUS; 2476 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ 2477 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2478 utext_close(result); 2479 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2480 result = matcher2->replaceFirst(&replText, &destText, status); 2481 REGEX_CHECK_STATUS; 2482 REGEX_ASSERT(result == &destText); 2483 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2484 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." 
*/ 2485 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); 2486 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2487 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2488 utext_close(result); 2489 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2490 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2491 REGEX_ASSERT(result == &destText); 2492 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2493 2494 // 2495 // Replacement String with \u hex escapes 2496 // 2497 { 2498 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ 2499 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ 2500 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); 2501 utext_openUTF8(&replText, str_u0043, -1, &status); 2502 matcher->reset(&dataText); 2503 2504 result = matcher->replaceAll(&replText, NULL, status); 2505 REGEX_CHECK_STATUS; 2506 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ 2507 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2508 utext_close(result); 2509 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2510 result = matcher->replaceAll(&replText, &destText, status); 2511 REGEX_CHECK_STATUS; 2512 REGEX_ASSERT(result == &destText); 2513 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2514 } 2515 { 2516 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! 
*/ 2517 utext_openUTF8(&dataText, str_abc, -1, &status); 2518 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */ 2519 utext_openUTF8(&replText, str_U00010000, -1, &status); 2520 matcher->reset(&dataText); 2521 2522 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" 2523 // 0123456789 2524 expected[2] = 0xF0; 2525 expected[3] = 0x90; 2526 expected[4] = 0x80; 2527 expected[5] = 0x80; 2528 2529 result = matcher->replaceAll(&replText, NULL, status); 2530 REGEX_CHECK_STATUS; 2531 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2532 utext_close(result); 2533 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2534 result = matcher->replaceAll(&replText, &destText, status); 2535 REGEX_CHECK_STATUS; 2536 REGEX_ASSERT(result == &destText); 2537 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2538 } 2539 // TODO: need more through testing of capture substitutions. 
2540 2541 // Bug 4057 2542 // 2543 { 2544 status = U_ZERO_ERROR; 2545 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */ 2546 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */ 2547 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ 2548 utext_openUTF8(&re, str_ssee, -1, &status); 2549 utext_openUTF8(&dataText, str_blah, -1, &status); 2550 utext_openUTF8(&replText, str_ooh, -1, &status); 2551 2552 RegexMatcher m(&re, 0, status); 2553 REGEX_CHECK_STATUS; 2554 2555 UnicodeString result; 2556 UText resultText = UTEXT_INITIALIZER; 2557 utext_openUnicodeString(&resultText, &result, &status); 2558 2559 // Multiple finds do NOT bump up the previous appendReplacement postion. 2560 m.reset(&dataText); 2561 m.find(); 2562 m.find(); 2563 m.appendReplacement(&resultText, &replText, status); 2564 REGEX_CHECK_STATUS; 2565 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2566 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText); 2567 2568 // After a reset into the interior of a string, appendReplacement still starts at beginning. 
2569 status = U_ZERO_ERROR; 2570 result.truncate(0); 2571 utext_openUnicodeString(&resultText, &result, &status); 2572 m.reset(10, status); 2573 m.find(); 2574 m.find(); 2575 m.appendReplacement(&resultText, &replText, status); 2576 REGEX_CHECK_STATUS; 2577 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2578 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText); 2579 2580 // find() at interior of string, appendReplacement still starts at beginning. 2581 status = U_ZERO_ERROR; 2582 result.truncate(0); 2583 utext_openUnicodeString(&resultText, &result, &status); 2584 m.reset(); 2585 m.find(10, status); 2586 m.find(); 2587 m.appendReplacement(&resultText, &replText, status); 2588 REGEX_CHECK_STATUS; 2589 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2590 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText); 2591 2592 m.appendTail(&resultText, status); 2593 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ 2594 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); 2595 2596 utext_close(&resultText); 2597 } 2598 2599 delete matcher2; 2600 delete 
pat2; 2601 delete matcher; 2602 delete pat; 2603 2604 utext_close(&dataText); 2605 utext_close(&replText); 2606 utext_close(&destText); 2607 utext_close(&re); 2608 } 2609 2610 2611 //--------------------------------------------------------------------------- 2612 // 2613 // API_Pattern_UTF8 Test that the API for class RegexPattern is 2614 // present and nominally working. 2615 // 2616 //--------------------------------------------------------------------------- 2617 void RegexTest::API_Pattern_UTF8() { 2618 RegexPattern pata; // Test default constructor to not crash. 2619 RegexPattern patb; 2620 2621 REGEX_ASSERT(pata == patb); 2622 REGEX_ASSERT(pata == pata); 2623 2624 UText re1 = UTEXT_INITIALIZER; 2625 UText re2 = UTEXT_INITIALIZER; 2626 UErrorCode status = U_ZERO_ERROR; 2627 UParseError pe; 2628 2629 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ 2630 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ 2631 utext_openUTF8(&re1, str_abcalmz, -1, &status); 2632 utext_openUTF8(&re2, str_def, -1, &status); 2633 2634 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2635 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2636 REGEX_CHECK_STATUS; 2637 REGEX_ASSERT(*pat1 == *pat1); 2638 REGEX_ASSERT(*pat1 != pata); 2639 2640 // Assign 2641 patb = *pat1; 2642 REGEX_ASSERT(patb == *pat1); 2643 2644 // Copy Construct 2645 RegexPattern patc(*pat1); 2646 REGEX_ASSERT(patc == *pat1); 2647 REGEX_ASSERT(patb == patc); 2648 REGEX_ASSERT(pat1 != pat2); 2649 patb = *pat2; 2650 REGEX_ASSERT(patb != patc); 2651 REGEX_ASSERT(patb == *pat2); 2652 2653 // Compile with no flags. 
2654 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2655 REGEX_ASSERT(*pat1a == *pat1); 2656 2657 REGEX_ASSERT(pat1a->flags() == 0); 2658 2659 // Compile with different flags should be not equal 2660 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2661 REGEX_CHECK_STATUS; 2662 2663 REGEX_ASSERT(*pat1b != *pat1a); 2664 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2665 REGEX_ASSERT(pat1a->flags() == 0); 2666 delete pat1b; 2667 2668 // clone 2669 RegexPattern *pat1c = pat1->clone(); 2670 REGEX_ASSERT(*pat1c == *pat1); 2671 REGEX_ASSERT(*pat1c != *pat2); 2672 2673 delete pat1c; 2674 delete pat1a; 2675 delete pat1; 2676 delete pat2; 2677 2678 utext_close(&re1); 2679 utext_close(&re2); 2680 2681 2682 // 2683 // Verify that a matcher created from a cloned pattern works. 2684 // (Jitterbug 3423) 2685 // 2686 { 2687 UErrorCode status = U_ZERO_ERROR; 2688 UText pattern = UTEXT_INITIALIZER; 2689 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2690 utext_openUTF8(&pattern, str_pL, -1, &status); 2691 2692 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2693 RegexPattern *pClone = pSource->clone(); 2694 delete pSource; 2695 RegexMatcher *mFromClone = pClone->matcher(status); 2696 REGEX_CHECK_STATUS; 2697 2698 UText input = UTEXT_INITIALIZER; 2699 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2700 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2701 mFromClone->reset(&input); 2702 REGEX_ASSERT(mFromClone->find() == TRUE); 2703 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2704 REGEX_ASSERT(mFromClone->find() == TRUE); 2705 REGEX_ASSERT(mFromClone->group(status) == "World"); 2706 REGEX_ASSERT(mFromClone->find() == FALSE); 2707 delete mFromClone; 2708 delete pClone; 2709 2710 utext_close(&input); 2711 utext_close(&pattern); 2712 } 2713 2714 // 2715 // matches convenience API 
2716 // 2717 { 2718 UErrorCode status = U_ZERO_ERROR; 2719 UText pattern = UTEXT_INITIALIZER; 2720 UText input = UTEXT_INITIALIZER; 2721 2722 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2723 utext_openUTF8(&input, str_randominput, -1, &status); 2724 2725 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2726 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2727 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2728 REGEX_CHECK_STATUS; 2729 2730 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2731 utext_openUTF8(&pattern, str_abc, -1, &status); 2732 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2733 REGEX_CHECK_STATUS; 2734 2735 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2736 utext_openUTF8(&pattern, str_nput, -1, &status); 2737 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2738 REGEX_CHECK_STATUS; 2739 2740 utext_openUTF8(&pattern, str_randominput, -1, &status); 2741 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2742 REGEX_CHECK_STATUS; 2743 2744 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2745 utext_openUTF8(&pattern, str_u, -1, &status); 2746 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2747 REGEX_CHECK_STATUS; 2748 2749 utext_openUTF8(&input, str_abc, -1, &status); 2750 utext_openUTF8(&pattern, str_abc, -1, &status); 2751 status = U_INDEX_OUTOFBOUNDS_ERROR; 2752 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2753 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2754 2755 utext_close(&input); 2756 utext_close(&pattern); 2757 } 2758 2759 2760 // 2761 // Split() 2762 // 2763 status = U_ZERO_ERROR; 2764 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2765 utext_openUTF8(&re1, str_spaceplus, 
-1, &status); 2766 pat1 = RegexPattern::compile(&re1, pe, status); 2767 REGEX_CHECK_STATUS; 2768 UnicodeString fields[10]; 2769 2770 int32_t n; 2771 n = pat1->split("Now is the time", fields, 10, status); 2772 REGEX_CHECK_STATUS; 2773 REGEX_ASSERT(n==4); 2774 REGEX_ASSERT(fields[0]=="Now"); 2775 REGEX_ASSERT(fields[1]=="is"); 2776 REGEX_ASSERT(fields[2]=="the"); 2777 REGEX_ASSERT(fields[3]=="time"); 2778 REGEX_ASSERT(fields[4]==""); 2779 2780 n = pat1->split("Now is the time", fields, 2, status); 2781 REGEX_CHECK_STATUS; 2782 REGEX_ASSERT(n==2); 2783 REGEX_ASSERT(fields[0]=="Now"); 2784 REGEX_ASSERT(fields[1]=="is the time"); 2785 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2786 2787 fields[1] = "*"; 2788 status = U_ZERO_ERROR; 2789 n = pat1->split("Now is the time", fields, 1, status); 2790 REGEX_CHECK_STATUS; 2791 REGEX_ASSERT(n==1); 2792 REGEX_ASSERT(fields[0]=="Now is the time"); 2793 REGEX_ASSERT(fields[1]=="*"); 2794 status = U_ZERO_ERROR; 2795 2796 n = pat1->split(" Now is the time ", fields, 10, status); 2797 REGEX_CHECK_STATUS; 2798 REGEX_ASSERT(n==5); 2799 REGEX_ASSERT(fields[0]==""); 2800 REGEX_ASSERT(fields[1]=="Now"); 2801 REGEX_ASSERT(fields[2]=="is"); 2802 REGEX_ASSERT(fields[3]=="the"); 2803 REGEX_ASSERT(fields[4]=="time"); 2804 REGEX_ASSERT(fields[5]==""); 2805 2806 n = pat1->split(" ", fields, 10, status); 2807 REGEX_CHECK_STATUS; 2808 REGEX_ASSERT(n==1); 2809 REGEX_ASSERT(fields[0]==""); 2810 2811 fields[0] = "foo"; 2812 n = pat1->split("", fields, 10, status); 2813 REGEX_CHECK_STATUS; 2814 REGEX_ASSERT(n==0); 2815 REGEX_ASSERT(fields[0]=="foo"); 2816 2817 delete pat1; 2818 2819 // split, with a pattern with (capture) 2820 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 2821 pat1 = RegexPattern::compile(&re1, pe, status); 2822 REGEX_CHECK_STATUS; 2823 2824 status = U_ZERO_ERROR; 2825 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 2826 REGEX_CHECK_STATUS; 2827 REGEX_ASSERT(n==6); 2828 
REGEX_ASSERT(fields[0]==""); 2829 REGEX_ASSERT(fields[1]=="a"); 2830 REGEX_ASSERT(fields[2]=="Now is "); 2831 REGEX_ASSERT(fields[3]=="b"); 2832 REGEX_ASSERT(fields[4]=="the time"); 2833 REGEX_ASSERT(fields[5]=="c"); 2834 REGEX_ASSERT(fields[6]==""); 2835 REGEX_ASSERT(status==U_ZERO_ERROR); 2836 2837 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 2838 REGEX_CHECK_STATUS; 2839 REGEX_ASSERT(n==6); 2840 REGEX_ASSERT(fields[0]==" "); 2841 REGEX_ASSERT(fields[1]=="a"); 2842 REGEX_ASSERT(fields[2]=="Now is "); 2843 REGEX_ASSERT(fields[3]=="b"); 2844 REGEX_ASSERT(fields[4]=="the time"); 2845 REGEX_ASSERT(fields[5]=="c"); 2846 REGEX_ASSERT(fields[6]==""); 2847 2848 status = U_ZERO_ERROR; 2849 fields[6] = "foo"; 2850 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 2851 REGEX_CHECK_STATUS; 2852 REGEX_ASSERT(n==6); 2853 REGEX_ASSERT(fields[0]==" "); 2854 REGEX_ASSERT(fields[1]=="a"); 2855 REGEX_ASSERT(fields[2]=="Now is "); 2856 REGEX_ASSERT(fields[3]=="b"); 2857 REGEX_ASSERT(fields[4]=="the time"); 2858 REGEX_ASSERT(fields[5]=="c"); 2859 REGEX_ASSERT(fields[6]=="foo"); 2860 2861 status = U_ZERO_ERROR; 2862 fields[5] = "foo"; 2863 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 2864 REGEX_CHECK_STATUS; 2865 REGEX_ASSERT(n==5); 2866 REGEX_ASSERT(fields[0]==" "); 2867 REGEX_ASSERT(fields[1]=="a"); 2868 REGEX_ASSERT(fields[2]=="Now is "); 2869 REGEX_ASSERT(fields[3]=="b"); 2870 REGEX_ASSERT(fields[4]=="the time<c>"); 2871 REGEX_ASSERT(fields[5]=="foo"); 2872 2873 status = U_ZERO_ERROR; 2874 fields[5] = "foo"; 2875 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 2876 REGEX_CHECK_STATUS; 2877 REGEX_ASSERT(n==5); 2878 REGEX_ASSERT(fields[0]==" "); 2879 REGEX_ASSERT(fields[1]=="a"); 2880 REGEX_ASSERT(fields[2]=="Now is "); 2881 REGEX_ASSERT(fields[3]=="b"); 2882 REGEX_ASSERT(fields[4]=="the time"); 2883 REGEX_ASSERT(fields[5]=="foo"); 2884 2885 status = U_ZERO_ERROR; 2886 n = pat1->split(" <a>Now is <b>the time<c>", 
fields, 4, status); 2887 REGEX_CHECK_STATUS; 2888 REGEX_ASSERT(n==4); 2889 REGEX_ASSERT(fields[0]==" "); 2890 REGEX_ASSERT(fields[1]=="a"); 2891 REGEX_ASSERT(fields[2]=="Now is "); 2892 REGEX_ASSERT(fields[3]=="the time<c>"); 2893 status = U_ZERO_ERROR; 2894 delete pat1; 2895 2896 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 2897 pat1 = RegexPattern::compile(&re1, pe, status); 2898 REGEX_CHECK_STATUS; 2899 n = pat1->split("1-10,20", fields, 10, status); 2900 REGEX_CHECK_STATUS; 2901 REGEX_ASSERT(n==5); 2902 REGEX_ASSERT(fields[0]=="1"); 2903 REGEX_ASSERT(fields[1]=="-"); 2904 REGEX_ASSERT(fields[2]=="10"); 2905 REGEX_ASSERT(fields[3]==","); 2906 REGEX_ASSERT(fields[4]=="20"); 2907 delete pat1; 2908 2909 2910 // 2911 // RegexPattern::pattern() and patternText() 2912 // 2913 pat1 = new RegexPattern(); 2914 REGEX_ASSERT(pat1->pattern() == ""); 2915 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 2916 delete pat1; 2917 2918 regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status); 2919 pat1 = RegexPattern::compile(&re1, pe, status); 2920 REGEX_CHECK_STATUS; 2921 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 2922 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 2923 delete pat1; 2924 2925 utext_close(&re1); 2926 } 2927 2928 2929 //--------------------------------------------------------------------------- 2930 // 2931 // Extended A more thorough check for features of regex patterns 2932 // The test cases are in a separate data file, 2933 // source/tests/testdata/regextst.txt 2934 // A description of the test data format is included in that file. 
2935 // 2936 //--------------------------------------------------------------------------- 2937 2938 const char * 2939 RegexTest::getPath(char buffer[2048], const char *filename) { 2940 UErrorCode status=U_ZERO_ERROR; 2941 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2942 if (U_FAILURE(status)) { 2943 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 2944 return NULL; 2945 } 2946 2947 strcpy(buffer, testDataDirectory); 2948 strcat(buffer, filename); 2949 return buffer; 2950 } 2951 2952 void RegexTest::Extended() { 2953 char tdd[2048]; 2954 const char *srcPath; 2955 UErrorCode status = U_ZERO_ERROR; 2956 int32_t lineNum = 0; 2957 2958 // 2959 // Open and read the test data file. 2960 // 2961 srcPath=getPath(tdd, "regextst.txt"); 2962 if(srcPath==NULL) { 2963 return; /* something went wrong, error already output */ 2964 } 2965 2966 int32_t len; 2967 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 2968 if (U_FAILURE(status)) { 2969 return; /* something went wrong, error already output */ 2970 } 2971 2972 // 2973 // Put the test data into a UnicodeString 2974 // 2975 UnicodeString testString(FALSE, testData, len); 2976 2977 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 2978 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 2979 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 2980 2981 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 2982 UnicodeString testPattern; // The pattern for test from the test file. 2983 UnicodeString testFlags; // the flags for a test. 2984 UnicodeString matchString; // The marked up string to be used as input 2985 2986 if (U_FAILURE(status)){ 2987 dataerrln("Construct RegexMatcher() error."); 2988 delete [] testData; 2989 return; 2990 } 2991 2992 // 2993 // Loop over the test data file, once per line. 
2994 // 2995 while (lineMat.find()) { 2996 lineNum++; 2997 if (U_FAILURE(status)) { 2998 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 2999 } 3000 3001 status = U_ZERO_ERROR; 3002 UnicodeString testLine = lineMat.group(1, status); 3003 if (testLine.length() == 0) { 3004 continue; 3005 } 3006 3007 // 3008 // Parse the test line. Skip blank and comment only lines. 3009 // Separate out the three main fields - pattern, flags, target. 3010 // 3011 3012 commentMat.reset(testLine); 3013 if (commentMat.lookingAt(status)) { 3014 // This line is a comment, or blank. 3015 continue; 3016 } 3017 3018 // 3019 // Pull out the pattern field, remove it from the test file line. 3020 // 3021 quotedStuffMat.reset(testLine); 3022 if (quotedStuffMat.lookingAt(status)) { 3023 testPattern = quotedStuffMat.group(2, status); 3024 testLine.remove(0, quotedStuffMat.end(0, status)); 3025 } else { 3026 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3027 continue; 3028 } 3029 3030 3031 // 3032 // Pull out the flags from the test file line. 3033 // 3034 flagsMat.reset(testLine); 3035 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3036 testFlags = flagsMat.group(1, status); 3037 if (flagsMat.group(2, status).length() > 0) { 3038 errln("Bad Match flag at line %d. Scanning %c\n", 3039 lineNum, flagsMat.group(2, status).charAt(0)); 3040 continue; 3041 } 3042 testLine.remove(0, flagsMat.end(0, status)); 3043 3044 // 3045 // Pull out the match string, as a whole. 3046 // We'll process the <tags> later. 3047 // 3048 quotedStuffMat.reset(testLine); 3049 if (quotedStuffMat.lookingAt(status)) { 3050 matchString = quotedStuffMat.group(2, status); 3051 testLine.remove(0, quotedStuffMat.end(0, status)); 3052 } else { 3053 errln("Bad match string at test file line %d", lineNum); 3054 continue; 3055 } 3056 3057 // 3058 // The only thing left from the input line should be an optional trailing comment. 
3059 // 3060 commentMat.reset(testLine); 3061 if (commentMat.lookingAt(status) == FALSE) { 3062 errln("Line %d: unexpected characters at end of test line.", lineNum); 3063 continue; 3064 } 3065 3066 // 3067 // Run the test 3068 // 3069 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3070 } 3071 3072 delete [] testData; 3073 3074 } 3075 3076 3077 3078 //--------------------------------------------------------------------------- 3079 // 3080 // regex_find(pattern, flags, inputString, lineNumber) 3081 // 3082 // Function to run a single test from the Extended (data driven) tests. 3083 // See file test/testdata/regextst.txt for a description of the 3084 // pattern and inputString fields, and the allowed flags. 3085 // lineNumber is the source line in regextst.txt of the test. 3086 // 3087 //--------------------------------------------------------------------------- 3088 3089 3090 // Set a value into a UVector at position specified by a decimal number in 3091 // a UnicodeString. This is a utility function needed by the actual test function, 3092 // which follows. 
3093 static void set(UVector &vec, int32_t val, UnicodeString index) { 3094 UErrorCode status=U_ZERO_ERROR; 3095 int32_t idx = 0; 3096 for (int32_t i=0; i<index.length(); i++) { 3097 int32_t d=u_charDigitValue(index.charAt(i)); 3098 if (d<0) {return;} 3099 idx = idx*10 + d; 3100 } 3101 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3102 vec.setElementAt(val, idx); 3103 } 3104 3105 static void setInt(UVector &vec, int32_t val, int32_t idx) { 3106 UErrorCode status=U_ZERO_ERROR; 3107 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3108 vec.setElementAt(val, idx); 3109 } 3110 3111 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3112 { 3113 UBool couldFind = TRUE; 3114 UTEXT_SETNATIVEINDEX(utext, 0); 3115 int32_t i = 0; 3116 while (i < unistrOffset) { 3117 UChar32 c = UTEXT_NEXT32(utext); 3118 if (c != U_SENTINEL) { 3119 i += U16_LENGTH(c); 3120 } else { 3121 couldFind = FALSE; 3122 break; 3123 } 3124 } 3125 nativeIndex = UTEXT_GETNATIVEINDEX(utext); 3126 return couldFind; 3127 } 3128 3129 3130 void RegexTest::regex_find(const UnicodeString &pattern, 3131 const UnicodeString &flags, 3132 const UnicodeString &inputString, 3133 const char *srcPath, 3134 int32_t line) { 3135 UnicodeString unEscapedInput; 3136 UnicodeString deTaggedInput; 3137 3138 int32_t patternUTF8Length, inputUTF8Length; 3139 char *patternChars = NULL, *inputChars = NULL; 3140 UText patternText = UTEXT_INITIALIZER; 3141 UText inputText = UTEXT_INITIALIZER; 3142 UConverter *UTF8Converter = NULL; 3143 3144 UErrorCode status = U_ZERO_ERROR; 3145 UParseError pe; 3146 RegexPattern *parsePat = NULL; 3147 RegexMatcher *parseMatcher = NULL; 3148 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3149 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3150 UVector groupStarts(status); 3151 UVector groupEnds(status); 3152 UVector groupStartsUTF8(status); 3153 UVector groupEndsUTF8(status); 3154 UBool isMatch = FALSE, isUTF8Match = FALSE; 3155 UBool 
failed = FALSE; 3156 int32_t numFinds; 3157 int32_t i; 3158 UBool useMatchesFunc = FALSE; 3159 UBool useLookingAtFunc = FALSE; 3160 int32_t regionStart = -1; 3161 int32_t regionEnd = -1; 3162 int32_t regionStartUTF8 = -1; 3163 int32_t regionEndUTF8 = -1; 3164 3165 3166 // 3167 // Compile the caller's pattern 3168 // 3169 uint32_t bflags = 0; 3170 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3171 bflags |= UREGEX_CASE_INSENSITIVE; 3172 } 3173 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3174 bflags |= UREGEX_COMMENTS; 3175 } 3176 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3177 bflags |= UREGEX_DOTALL; 3178 } 3179 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3180 bflags |= UREGEX_MULTILINE; 3181 } 3182 3183 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3184 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3185 } 3186 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3187 bflags |= UREGEX_UNIX_LINES; 3188 } 3189 3190 3191 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3192 if (status != U_ZERO_ERROR) { 3193 #if UCONFIG_NO_BREAK_ITERATION==1 3194 // 'v' test flag means that the test pattern should not compile if ICU was configured 3195 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3196 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3197 goto cleanupAndReturn; 3198 } 3199 #endif 3200 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3201 // Expected pattern compilation error. 3202 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3203 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3204 } 3205 goto cleanupAndReturn; 3206 } else { 3207 // Unexpected pattern compilation error. 
3208 errln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3209 goto cleanupAndReturn; 3210 } 3211 } 3212 3213 UTF8Converter = ucnv_open("UTF8", &status); 3214 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3215 3216 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3217 status = U_ZERO_ERROR; // buffer overflow 3218 patternChars = new char[patternUTF8Length+1]; 3219 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3220 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3221 3222 if (status == U_ZERO_ERROR) { 3223 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3224 3225 if (status != U_ZERO_ERROR) { 3226 #if UCONFIG_NO_BREAK_ITERATION==1 3227 // 'v' test flag means that the test pattern should not compile if ICU was configured 3228 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3229 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3230 goto cleanupAndReturn; 3231 } 3232 #endif 3233 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3234 // Expected pattern compilation error. 3235 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3236 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3237 } 3238 goto cleanupAndReturn; 3239 } else { 3240 // Unexpected pattern compilation error. 3241 errln("Line %d: error %s compiling pattern. 
(UTF8)", line, u_errorName(status)); 3242 goto cleanupAndReturn; 3243 } 3244 } 3245 } 3246 3247 if (UTF8Pattern == NULL) { 3248 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3249 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3250 status = U_ZERO_ERROR; 3251 } 3252 3253 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3254 RegexPatternDump(callerPattern); 3255 } 3256 3257 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3258 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3259 goto cleanupAndReturn; 3260 } 3261 3262 3263 // 3264 // Number of times find() should be called on the test string, default to 1 3265 // 3266 numFinds = 1; 3267 for (i=2; i<=9; i++) { 3268 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3269 if (numFinds != 1) { 3270 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3271 goto cleanupAndReturn; 3272 } 3273 numFinds = i; 3274 } 3275 } 3276 3277 // 'M' flag. Use matches() instead of find() 3278 if (flags.indexOf((UChar)0x4d) >= 0) { 3279 useMatchesFunc = TRUE; 3280 } 3281 if (flags.indexOf((UChar)0x4c) >= 0) { 3282 useLookingAtFunc = TRUE; 3283 } 3284 3285 // 3286 // Find the tags in the input data, remove them, and record the group boundary 3287 // positions. 
3288 // 3289 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3290 REGEX_CHECK_STATUS_L(line); 3291 3292 unEscapedInput = inputString.unescape(); 3293 parseMatcher = parsePat->matcher(unEscapedInput, status); 3294 REGEX_CHECK_STATUS_L(line); 3295 while(parseMatcher->find()) { 3296 parseMatcher->appendReplacement(deTaggedInput, "", status); 3297 REGEX_CHECK_STATUS; 3298 UnicodeString groupNum = parseMatcher->group(2, status); 3299 if (groupNum == "r") { 3300 // <r> or </r>, a region specification within the string 3301 if (parseMatcher->group(1, status) == "/") { 3302 regionEnd = deTaggedInput.length(); 3303 } else { 3304 regionStart = deTaggedInput.length(); 3305 } 3306 } else { 3307 // <digits> or </digits>, a group match boundary tag. 3308 if (parseMatcher->group(1, status) == "/") { 3309 set(groupEnds, deTaggedInput.length(), groupNum); 3310 } else { 3311 set(groupStarts, deTaggedInput.length(), groupNum); 3312 } 3313 } 3314 } 3315 parseMatcher->appendTail(deTaggedInput); 3316 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3317 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3318 errln("mismatched <r> tags"); 3319 failed = TRUE; 3320 goto cleanupAndReturn; 3321 } 3322 3323 // 3324 // Configure the matcher according to the flags specified with this test. 
3325 // 3326 matcher = callerPattern->matcher(deTaggedInput, status); 3327 REGEX_CHECK_STATUS_L(line); 3328 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3329 matcher->setTrace(TRUE); 3330 } 3331 3332 if (UTF8Pattern != NULL) { 3333 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3334 status = U_ZERO_ERROR; // buffer overflow 3335 inputChars = new char[inputUTF8Length+1]; 3336 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3337 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3338 3339 if (status == U_ZERO_ERROR) { 3340 UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 3341 REGEX_CHECK_STATUS_L(line); 3342 } 3343 3344 if (UTF8Matcher == NULL) { 3345 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3346 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3347 status = U_ZERO_ERROR; 3348 } 3349 } 3350 3351 // 3352 // Generate native indices for UTF8 versions of region and capture group info 3353 // 3354 if (UTF8Matcher != NULL) { 3355 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3356 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3357 3358 // Fill out the native index UVector info. 3359 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3360 for (i=0; i<groupStarts.size(); i++) { 3361 int32_t start = groupStarts.elementAti(i); 3362 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3363 if (start >= 0) { 3364 int32_t startUTF8; 3365 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3366 errln("Error at line %d: could not find native index for group start %d. 
UTF16 index %d", line, i, start); 3367 failed = TRUE; 3368 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3369 } 3370 setInt(groupStartsUTF8, startUTF8, i); 3371 } 3372 3373 int32_t end = groupEnds.elementAti(i); 3374 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3375 if (end >= 0) { 3376 int32_t endUTF8; 3377 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3378 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3379 failed = TRUE; 3380 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3381 } 3382 setInt(groupEndsUTF8, endUTF8, i); 3383 } 3384 } 3385 } 3386 3387 if (regionStart>=0) { 3388 matcher->region(regionStart, regionEnd, status); 3389 REGEX_CHECK_STATUS_L(line); 3390 if (UTF8Matcher != NULL) { 3391 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3392 REGEX_CHECK_STATUS_L(line); 3393 } 3394 } 3395 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3396 matcher->useAnchoringBounds(FALSE); 3397 if (UTF8Matcher != NULL) { 3398 UTF8Matcher->useAnchoringBounds(FALSE); 3399 } 3400 } 3401 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3402 matcher->useTransparentBounds(TRUE); 3403 if (UTF8Matcher != NULL) { 3404 UTF8Matcher->useTransparentBounds(TRUE); 3405 } 3406 } 3407 3408 3409 3410 // 3411 // Do a find on the de-tagged input using the caller's pattern 3412 // TODO: error on count>1 and not find(). 3413 // error on both matches() and lookingAt(). 
3414 // 3415 for (i=0; i<numFinds; i++) { 3416 if (useMatchesFunc) { 3417 isMatch = matcher->matches(status); 3418 if (UTF8Matcher != NULL) { 3419 isUTF8Match = UTF8Matcher->matches(status); 3420 } 3421 } else if (useLookingAtFunc) { 3422 isMatch = matcher->lookingAt(status); 3423 if (UTF8Matcher != NULL) { 3424 isUTF8Match = UTF8Matcher->lookingAt(status); 3425 } 3426 } else { 3427 isMatch = matcher->find(); 3428 if (UTF8Matcher != NULL) { 3429 isUTF8Match = UTF8Matcher->find(); 3430 } 3431 } 3432 } 3433 matcher->setTrace(FALSE); 3434 3435 // 3436 // Match up the groups from the find() with the groups from the tags 3437 // 3438 3439 // number of tags should match number of groups from find operation. 3440 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3441 // G option in test means that capture group data is not available in the 3442 // expected results, so the check needs to be suppressed. 3443 if (isMatch == FALSE && groupStarts.size() != 0) { 3444 errln("Error at line %d: Match expected, but none found.", line); 3445 failed = TRUE; 3446 goto cleanupAndReturn; 3447 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3448 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3449 failed = TRUE; 3450 goto cleanupAndReturn; 3451 } 3452 3453 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3454 // Only check for match / no match. Don't check capture groups. 3455 if (isMatch && groupStarts.size() == 0) { 3456 errln("Error at line %d: No match expected, but one found.", line); 3457 failed = TRUE; 3458 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3459 errln("Error at line %d: No match expected, but one found. (UTF8)", line); 3460 failed = TRUE; 3461 } 3462 goto cleanupAndReturn; 3463 } 3464 3465 REGEX_CHECK_STATUS_L(line); 3466 for (i=0; i<=matcher->groupCount(); i++) { 3467 int32_t expectedStart = (i >= groupStarts.size()? 
-1 : groupStarts.elementAti(i)); 3468 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3469 if (matcher->start(i, status) != expectedStart) { 3470 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3471 line, i, expectedStart, matcher->start(i, status)); 3472 failed = TRUE; 3473 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3474 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3475 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3476 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3477 failed = TRUE; 3478 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3479 } 3480 3481 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3482 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3483 if (matcher->end(i, status) != expectedEnd) { 3484 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3485 line, i, expectedEnd, matcher->end(i, status)); 3486 failed = TRUE; 3487 // Error on end position; keep going; real error is probably yet to come as group 3488 // end positions work from end of the input data towards the front. 3489 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3490 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3491 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3492 failed = TRUE; 3493 // Error on end position; keep going; real error is probably yet to come as group 3494 // end positions work from end of the input data towards the front. 
3495 } 3496 } 3497 if ( matcher->groupCount()+1 < groupStarts.size()) { 3498 errln("Error at line %d: Expected %d capture groups, found %d.", 3499 line, groupStarts.size()-1, matcher->groupCount()); 3500 failed = TRUE; 3501 } 3502 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3503 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3504 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3505 failed = TRUE; 3506 } 3507 3508 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3509 matcher->requireEnd() == TRUE) { 3510 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3511 failed = TRUE; 3512 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3513 UTF8Matcher->requireEnd() == TRUE) { 3514 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3515 failed = TRUE; 3516 } 3517 3518 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3519 matcher->requireEnd() == FALSE) { 3520 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3521 failed = TRUE; 3522 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3523 UTF8Matcher->requireEnd() == FALSE) { 3524 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3525 failed = TRUE; 3526 } 3527 3528 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3529 matcher->hitEnd() == TRUE) { 3530 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3531 failed = TRUE; 3532 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3533 UTF8Matcher->hitEnd() == TRUE) { 3534 errln("Error at line %d: hitEnd() returned TRUE. 
Expected FALSE (UTF8)", line); 3535 failed = TRUE; 3536 } 3537 3538 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3539 matcher->hitEnd() == FALSE) { 3540 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3541 failed = TRUE; 3542 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3543 UTF8Matcher->hitEnd() == FALSE) { 3544 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3545 failed = TRUE; 3546 } 3547 3548 3549 cleanupAndReturn: 3550 if (failed) { 3551 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3552 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3553 // callerPattern->dump(); 3554 } 3555 delete parseMatcher; 3556 delete parsePat; 3557 delete UTF8Matcher; 3558 delete UTF8Pattern; 3559 delete matcher; 3560 delete callerPattern; 3561 3562 utext_close(&inputText); 3563 delete[] inputChars; 3564 utext_close(&patternText); 3565 delete[] patternChars; 3566 ucnv_close(UTF8Converter); 3567 } 3568 3569 3570 3571 3572 //--------------------------------------------------------------------------- 3573 // 3574 // Errors Check for error handling in patterns. 3575 // 3576 //--------------------------------------------------------------------------- 3577 void RegexTest::Errors() { 3578 // \escape sequences that aren't implemented yet. 
3579 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3580 3581 // Missing close parentheses 3582 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3583 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3584 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3585 3586 // Extra close paren 3587 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3588 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3589 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3590 3591 // Look-ahead, Look-behind 3592 // TODO: add tests for unbounded length look-behinds. 3593 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3594 3595 // Attempt to use non-default flags 3596 { 3597 UParseError pe; 3598 UErrorCode status = U_ZERO_ERROR; 3599 int32_t flags = UREGEX_CANON_EQ | 3600 UREGEX_COMMENTS | UREGEX_DOTALL | 3601 UREGEX_MULTILINE; 3602 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3603 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3604 delete pat1; 3605 } 3606 3607 3608 // Quantifiers are allowed only after something that can be quantified. 
3609 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3610 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3611 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3612 3613 // Mal-formed {min,max} quantifiers 3614 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3615 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3616 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3617 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3618 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3619 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3620 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3621 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3622 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3623 3624 // Ticket 5389 3625 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3626 3627 // Invalid Back Reference \0 3628 // For ICU 3.8 and earlier 3629 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3630 // 3631 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3632 3633 } 3634 3635 3636 //------------------------------------------------------------------------------- 3637 // 3638 // Read a text data file, convert it to UChars, and return the data 3639 // in one big UChar * buffer, which the caller must delete. 3640 // 3641 //-------------------------------------------------------------------------------- 3642 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3643 const char *defEncoding, UErrorCode &status) { 3644 UChar *retPtr = NULL; 3645 char *fileBuf = NULL; 3646 UConverter* conv = NULL; 3647 FILE *f = NULL; 3648 3649 ulen = 0; 3650 if (U_FAILURE(status)) { 3651 return retPtr; 3652 } 3653 3654 // 3655 // Open the file. 
3656 // 3657 f = fopen(fileName, "rb"); 3658 if (f == 0) { 3659 dataerrln("Error opening test data file %s\n", fileName); 3660 status = U_FILE_ACCESS_ERROR; 3661 return NULL; 3662 } 3663 // 3664 // Read it in 3665 // 3666 int32_t fileSize; 3667 int32_t amt_read; 3668 3669 fseek( f, 0, SEEK_END); 3670 fileSize = ftell(f); 3671 fileBuf = new char[fileSize]; 3672 fseek(f, 0, SEEK_SET); 3673 amt_read = fread(fileBuf, 1, fileSize, f); 3674 if (amt_read != fileSize || fileSize <= 0) { 3675 errln("Error reading test data file."); 3676 goto cleanUpAndReturn; 3677 } 3678 3679 // 3680 // Look for a Unicode Signature (BOM) on the data just read 3681 // 3682 int32_t signatureLength; 3683 const char * fileBufC; 3684 const char* encoding; 3685 3686 fileBufC = fileBuf; 3687 encoding = ucnv_detectUnicodeSignature( 3688 fileBuf, fileSize, &signatureLength, &status); 3689 if(encoding!=NULL ){ 3690 fileBufC += signatureLength; 3691 fileSize -= signatureLength; 3692 } else { 3693 encoding = defEncoding; 3694 if (strcmp(encoding, "utf-8") == 0) { 3695 errln("file %s is missing its BOM", fileName); 3696 } 3697 } 3698 3699 // 3700 // Open a converter to take the rule file to UTF-16 3701 // 3702 conv = ucnv_open(encoding, &status); 3703 if (U_FAILURE(status)) { 3704 goto cleanUpAndReturn; 3705 } 3706 3707 // 3708 // Convert the rules to UChar. 3709 // Preflight first to determine required buffer size. 3710 // 3711 ulen = ucnv_toUChars(conv, 3712 NULL, // dest, 3713 0, // destCapacity, 3714 fileBufC, 3715 fileSize, 3716 &status); 3717 if (status == U_BUFFER_OVERFLOW_ERROR) { 3718 // Buffer Overflow is expected from the preflight operation. 
3719 status = U_ZERO_ERROR; 3720 3721 retPtr = new UChar[ulen+1]; 3722 ucnv_toUChars(conv, 3723 retPtr, // dest, 3724 ulen+1, 3725 fileBufC, 3726 fileSize, 3727 &status); 3728 } 3729 3730 cleanUpAndReturn: 3731 fclose(f); 3732 delete[] fileBuf; 3733 ucnv_close(conv); 3734 if (U_FAILURE(status)) { 3735 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3736 delete retPtr; 3737 retPtr = 0; 3738 ulen = 0; 3739 }; 3740 return retPtr; 3741 } 3742 3743 3744 //------------------------------------------------------------------------------- 3745 // 3746 // PerlTests - Run Perl's regular expression tests 3747 // The input file for this test is re_tests, the standard regular 3748 // expression test data distributed with the Perl source code. 3749 // 3750 // Here is Perl's description of the test data file: 3751 // 3752 // # The tests are in a separate file 't/op/re_tests'. 3753 // # Each line in that file is a separate test. 3754 // # There are five columns, separated by tabs. 3755 // # 3756 // # Column 1 contains the pattern, optionally enclosed in C<''>. 3757 // # Modifiers can be put after the closing C<'>. 3758 // # 3759 // # Column 2 contains the string to be matched. 3760 // # 3761 // # Column 3 contains the expected result: 3762 // # y expect a match 3763 // # n expect no match 3764 // # c expect an error 3765 // # B test exposes a known bug in Perl, should be skipped 3766 // # b test exposes a known bug in Perl, should be skipped if noamp 3767 // # 3768 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 3769 // # 3770 // # Column 4 contains a string, usually C<$&>. 3771 // # 3772 // # Column 5 contains the expected result of double-quote 3773 // # interpolating that string after the match, or start of error message. 3774 // # 3775 // # Column 6, if present, contains a reason why the test is skipped. 3776 // # This is printed with "skipped", for harness to pick up. 
3777 // # 3778 // # \n in the tests are interpolated, as are variables of the form ${\w+}. 3779 // # 3780 // # If you want to add a regular expression test that can't be expressed 3781 // # in this format, don't add it here: put it in op/pat.t instead. 3782 // 3783 // For ICU, if field 3 contains an 'i', the test will be skipped. 3784 // The test exposes is some known incompatibility between ICU and Perl regexps. 3785 // (The i is in addition to whatever was there before.) 3786 // 3787 //------------------------------------------------------------------------------- 3788 void RegexTest::PerlTests() { 3789 char tdd[2048]; 3790 const char *srcPath; 3791 UErrorCode status = U_ZERO_ERROR; 3792 UParseError pe; 3793 3794 // 3795 // Open and read the test data file. 3796 // 3797 srcPath=getPath(tdd, "re_tests.txt"); 3798 if(srcPath==NULL) { 3799 return; /* something went wrong, error already output */ 3800 } 3801 3802 int32_t len; 3803 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3804 if (U_FAILURE(status)) { 3805 return; /* something went wrong, error already output */ 3806 } 3807 3808 // 3809 // Put the test data into a UnicodeString 3810 // 3811 UnicodeString testDataString(FALSE, testData, len); 3812 3813 // 3814 // Regex to break the input file into lines, and strip the new lines. 3815 // One line per match, capture group one is the desired data. 3816 // 3817 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 3818 if (U_FAILURE(status)) { 3819 dataerrln("RegexPattern::compile() error"); 3820 return; 3821 } 3822 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 3823 3824 // 3825 // Regex to split a test file line into fields. 3826 // There are six fields, separated by tabs. 3827 // 3828 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 3829 3830 // 3831 // Regex to identify test patterns with flag settings, and to separate them. 
3832 // Test patterns with flags look like 'pattern'i 3833 // Test patterns without flags are not quoted: pattern 3834 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 3835 // 3836 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 3837 RegexMatcher* flagMat = flagPat->matcher(status); 3838 3839 // 3840 // The Perl tests reference several perl-isms, which are evaluated/substituted 3841 // in the test data. Not being perl, this must be done explicitly. Here 3842 // are string constants and REs for these constructs. 3843 // 3844 UnicodeString nulnulSrc("${nulnul}"); 3845 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 3846 nulnul = nulnul.unescape(); 3847 3848 UnicodeString ffffSrc("${ffff}"); 3849 UnicodeString ffff("\\uffff", -1, US_INV); 3850 ffff = ffff.unescape(); 3851 3852 // regexp for $-[0], $+[2], etc. 3853 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 3854 RegexMatcher *groupsMat = groupsPat->matcher(status); 3855 3856 // regexp for $0, $1, $2, etc. 3857 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 3858 RegexMatcher *cgMat = cgPat->matcher(status); 3859 3860 3861 // 3862 // Main Loop for the Perl Tests, runs once per line from the 3863 // test data file. 3864 // 3865 int32_t lineNum = 0; 3866 int32_t skippedUnimplementedCount = 0; 3867 while (lineMat->find()) { 3868 lineNum++; 3869 3870 // 3871 // Get a line, break it into its fields, do the Perl 3872 // variable substitutions. 
3873 // 3874 UnicodeString line = lineMat->group(1, status); 3875 UnicodeString fields[7]; 3876 fieldPat->split(line, fields, 7, status); 3877 3878 flagMat->reset(fields[0]); 3879 flagMat->matches(status); 3880 UnicodeString pattern = flagMat->group(2, status); 3881 pattern.findAndReplace("${bang}", "!"); 3882 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 3883 pattern.findAndReplace(ffffSrc, ffff); 3884 3885 // 3886 // Identify patterns that include match flag settings, 3887 // split off the flags, remove the extra quotes. 3888 // 3889 UnicodeString flagStr = flagMat->group(3, status); 3890 if (U_FAILURE(status)) { 3891 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3892 return; 3893 } 3894 int32_t flags = 0; 3895 const UChar UChar_c = 0x63; // Char constants for the flag letters. 3896 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 3897 const UChar UChar_m = 0x6d; 3898 const UChar UChar_x = 0x78; 3899 const UChar UChar_y = 0x79; 3900 if (flagStr.indexOf(UChar_i) != -1) { 3901 flags |= UREGEX_CASE_INSENSITIVE; 3902 } 3903 if (flagStr.indexOf(UChar_m) != -1) { 3904 flags |= UREGEX_MULTILINE; 3905 } 3906 if (flagStr.indexOf(UChar_x) != -1) { 3907 flags |= UREGEX_COMMENTS; 3908 } 3909 3910 // 3911 // Compile the test pattern. 3912 // 3913 status = U_ZERO_ERROR; 3914 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 3915 if (status == U_REGEX_UNIMPLEMENTED) { 3916 // 3917 // Test of a feature that is planned for ICU, but not yet implemented. 3918 // skip the test. 3919 skippedUnimplementedCount++; 3920 delete testPat; 3921 status = U_ZERO_ERROR; 3922 continue; 3923 } 3924 3925 if (U_FAILURE(status)) { 3926 // Some tests are supposed to generate errors. 3927 // Only report an error for tests that are supposed to succeed. 
3928 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 3929 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 3930 { 3931 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 3932 } 3933 status = U_ZERO_ERROR; 3934 delete testPat; 3935 continue; 3936 } 3937 3938 if (fields[2].indexOf(UChar_i) >= 0) { 3939 // ICU should skip this test. 3940 delete testPat; 3941 continue; 3942 } 3943 3944 if (fields[2].indexOf(UChar_c) >= 0) { 3945 // This pattern should have caused a compilation error, but didn't/ 3946 errln("line %d: Expected a pattern compile error, got success.", lineNum); 3947 delete testPat; 3948 continue; 3949 } 3950 3951 // 3952 // replace the Perl variables that appear in some of the 3953 // match data strings. 3954 // 3955 UnicodeString matchString = fields[1]; 3956 matchString.findAndReplace(nulnulSrc, nulnul); 3957 matchString.findAndReplace(ffffSrc, ffff); 3958 3959 // Replace any \n in the match string with an actual new-line char. 3960 // Don't do full unescape, as this unescapes more than Perl does, which 3961 // causes other spurious failures in the tests. 3962 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 3963 3964 3965 3966 // 3967 // Run the test, check for expected match/don't match result. 3968 // 3969 RegexMatcher *testMat = testPat->matcher(matchString, status); 3970 UBool found = testMat->find(); 3971 UBool expected = FALSE; 3972 if (fields[2].indexOf(UChar_y) >=0) { 3973 expected = TRUE; 3974 } 3975 if (expected != found) { 3976 errln("line %d: Expected %smatch, got %smatch", 3977 lineNum, expected?"":"no ", found?"":"no " ); 3978 continue; 3979 } 3980 3981 // Don't try to check expected results if there is no match. 
3982 // (Some have stuff in the expected fields) 3983 if (!found) { 3984 delete testMat; 3985 delete testPat; 3986 continue; 3987 } 3988 3989 // 3990 // Interpret the Perl expression from the fourth field of the data file, 3991 // building up an ICU string from the results of the ICU match. 3992 // The Perl expression will contain references to the results of 3993 // a regex match, including the matched string, capture group strings, 3994 // group starting and ending indicies, etc. 3995 // 3996 UnicodeString resultString; 3997 UnicodeString perlExpr = fields[3]; 3998 #if SUPPORT_MUTATING_INPUT_STRING 3999 groupsMat->reset(perlExpr); 4000 cgMat->reset(perlExpr); 4001 #endif 4002 4003 while (perlExpr.length() > 0) { 4004 #if !SUPPORT_MUTATING_INPUT_STRING 4005 // Perferred usage. Reset after any modification to input string. 4006 groupsMat->reset(perlExpr); 4007 cgMat->reset(perlExpr); 4008 #endif 4009 4010 if (perlExpr.startsWith("$&")) { 4011 resultString.append(testMat->group(status)); 4012 perlExpr.remove(0, 2); 4013 } 4014 4015 else if (groupsMat->lookingAt(status)) { 4016 // $-[0] $+[2] etc. 4017 UnicodeString digitString = groupsMat->group(2, status); 4018 int32_t t = 0; 4019 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4020 UnicodeString plusOrMinus = groupsMat->group(1, status); 4021 int32_t matchPosition; 4022 if (plusOrMinus.compare("+") == 0) { 4023 matchPosition = testMat->end(groupNum, status); 4024 } else { 4025 matchPosition = testMat->start(groupNum, status); 4026 } 4027 if (matchPosition != -1) { 4028 ICU_Utility::appendNumber(resultString, matchPosition); 4029 } 4030 perlExpr.remove(0, groupsMat->end(status)); 4031 } 4032 4033 else if (cgMat->lookingAt(status)) { 4034 // $1, $2, $3, etc. 
4035 UnicodeString digitString = cgMat->group(1, status); 4036 int32_t t = 0; 4037 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4038 if (U_SUCCESS(status)) { 4039 resultString.append(testMat->group(groupNum, status)); 4040 status = U_ZERO_ERROR; 4041 } 4042 perlExpr.remove(0, cgMat->end(status)); 4043 } 4044 4045 else if (perlExpr.startsWith("@-")) { 4046 int32_t i; 4047 for (i=0; i<=testMat->groupCount(); i++) { 4048 if (i>0) { 4049 resultString.append(" "); 4050 } 4051 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4052 } 4053 perlExpr.remove(0, 2); 4054 } 4055 4056 else if (perlExpr.startsWith("@+")) { 4057 int32_t i; 4058 for (i=0; i<=testMat->groupCount(); i++) { 4059 if (i>0) { 4060 resultString.append(" "); 4061 } 4062 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4063 } 4064 perlExpr.remove(0, 2); 4065 } 4066 4067 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4068 // or as an escaped sequence (e.g. \n) 4069 if (perlExpr.length() > 1) { 4070 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4071 } 4072 UChar c = perlExpr.charAt(0); 4073 switch (c) { 4074 case 'n': c = '\n'; break; 4075 // add any other escape sequences that show up in the test expected results. 4076 } 4077 resultString.append(c); 4078 perlExpr.remove(0, 1); 4079 } 4080 4081 else { 4082 // Any characters from the perl expression that we don't explicitly 4083 // recognize before here are assumed to be literals and copied 4084 // as-is to the expected results. 
4085 resultString.append(perlExpr.charAt(0)); 4086 perlExpr.remove(0, 1); 4087 } 4088 4089 if (U_FAILURE(status)) { 4090 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4091 break; 4092 } 4093 } 4094 4095 // 4096 // Expected Results Compare 4097 // 4098 UnicodeString expectedS(fields[4]); 4099 expectedS.findAndReplace(nulnulSrc, nulnul); 4100 expectedS.findAndReplace(ffffSrc, ffff); 4101 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4102 4103 4104 if (expectedS.compare(resultString) != 0) { 4105 err("Line %d: Incorrect perl expression results.", lineNum); 4106 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4107 } 4108 4109 delete testMat; 4110 delete testPat; 4111 } 4112 4113 // 4114 // All done. Clean up allocated stuff. 4115 // 4116 delete cgMat; 4117 delete cgPat; 4118 4119 delete groupsMat; 4120 delete groupsPat; 4121 4122 delete flagMat; 4123 delete flagPat; 4124 4125 delete lineMat; 4126 delete linePat; 4127 4128 delete fieldPat; 4129 delete [] testData; 4130 4131 4132 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4133 4134 } 4135 4136 4137 //------------------------------------------------------------------------------- 4138 // 4139 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4140 // (instead of using UnicodeStrings) to test the alternate engine. 4141 // The input file for this test is re_tests, the standard regular 4142 // expression test data distributed with the Perl source code. 4143 // See PerlTests() for more information. 
4144 // 4145 //------------------------------------------------------------------------------- 4146 void RegexTest::PerlTestsUTF8() { 4147 char tdd[2048]; 4148 const char *srcPath; 4149 UErrorCode status = U_ZERO_ERROR; 4150 UParseError pe; 4151 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4152 UText patternText = UTEXT_INITIALIZER; 4153 char *patternChars = NULL; 4154 int32_t patternLength; 4155 int32_t patternCapacity = 0; 4156 UText inputText = UTEXT_INITIALIZER; 4157 char *inputChars = NULL; 4158 int32_t inputLength; 4159 int32_t inputCapacity = 0; 4160 4161 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4162 4163 // 4164 // Open and read the test data file. 4165 // 4166 srcPath=getPath(tdd, "re_tests.txt"); 4167 if(srcPath==NULL) { 4168 return; /* something went wrong, error already output */ 4169 } 4170 4171 int32_t len; 4172 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4173 if (U_FAILURE(status)) { 4174 return; /* something went wrong, error already output */ 4175 } 4176 4177 // 4178 // Put the test data into a UnicodeString 4179 // 4180 UnicodeString testDataString(FALSE, testData, len); 4181 4182 // 4183 // Regex to break the input file into lines, and strip the new lines. 4184 // One line per match, capture group one is the desired data. 4185 // 4186 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4187 if (U_FAILURE(status)) { 4188 dataerrln("RegexPattern::compile() error"); 4189 return; 4190 } 4191 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4192 4193 // 4194 // Regex to split a test file line into fields. 4195 // There are six fields, separated by tabs. 4196 // 4197 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4198 4199 // 4200 // Regex to identify test patterns with flag settings, and to separate them. 
4201 // Test patterns with flags look like 'pattern'i 4202 // Test patterns without flags are not quoted: pattern 4203 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4204 // 4205 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4206 RegexMatcher* flagMat = flagPat->matcher(status); 4207 4208 // 4209 // The Perl tests reference several perl-isms, which are evaluated/substituted 4210 // in the test data. Not being perl, this must be done explicitly. Here 4211 // are string constants and REs for these constructs. 4212 // 4213 UnicodeString nulnulSrc("${nulnul}"); 4214 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4215 nulnul = nulnul.unescape(); 4216 4217 UnicodeString ffffSrc("${ffff}"); 4218 UnicodeString ffff("\\uffff", -1, US_INV); 4219 ffff = ffff.unescape(); 4220 4221 // regexp for $-[0], $+[2], etc. 4222 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4223 RegexMatcher *groupsMat = groupsPat->matcher(status); 4224 4225 // regexp for $0, $1, $2, etc. 4226 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4227 RegexMatcher *cgMat = cgPat->matcher(status); 4228 4229 4230 // 4231 // Main Loop for the Perl Tests, runs once per line from the 4232 // test data file. 4233 // 4234 int32_t lineNum = 0; 4235 int32_t skippedUnimplementedCount = 0; 4236 while (lineMat->find()) { 4237 lineNum++; 4238 4239 // 4240 // Get a line, break it into its fields, do the Perl 4241 // variable substitutions. 
4242 // 4243 UnicodeString line = lineMat->group(1, status); 4244 UnicodeString fields[7]; 4245 fieldPat->split(line, fields, 7, status); 4246 4247 flagMat->reset(fields[0]); 4248 flagMat->matches(status); 4249 UnicodeString pattern = flagMat->group(2, status); 4250 pattern.findAndReplace("${bang}", "!"); 4251 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4252 pattern.findAndReplace(ffffSrc, ffff); 4253 4254 // 4255 // Identify patterns that include match flag settings, 4256 // split off the flags, remove the extra quotes. 4257 // 4258 UnicodeString flagStr = flagMat->group(3, status); 4259 if (U_FAILURE(status)) { 4260 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4261 return; 4262 } 4263 int32_t flags = 0; 4264 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4265 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4266 const UChar UChar_m = 0x6d; 4267 const UChar UChar_x = 0x78; 4268 const UChar UChar_y = 0x79; 4269 if (flagStr.indexOf(UChar_i) != -1) { 4270 flags |= UREGEX_CASE_INSENSITIVE; 4271 } 4272 if (flagStr.indexOf(UChar_m) != -1) { 4273 flags |= UREGEX_MULTILINE; 4274 } 4275 if (flagStr.indexOf(UChar_x) != -1) { 4276 flags |= UREGEX_COMMENTS; 4277 } 4278 4279 // 4280 // Put the pattern in a UTF-8 UText 4281 // 4282 status = U_ZERO_ERROR; 4283 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4284 if (status == U_BUFFER_OVERFLOW_ERROR) { 4285 status = U_ZERO_ERROR; 4286 delete[] patternChars; 4287 patternCapacity = patternLength + 1; 4288 patternChars = new char[patternCapacity]; 4289 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4290 } 4291 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4292 4293 // 4294 // Compile the test pattern. 
4295 // 4296 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4297 if (status == U_REGEX_UNIMPLEMENTED) { 4298 // 4299 // Test of a feature that is planned for ICU, but not yet implemented. 4300 // skip the test. 4301 skippedUnimplementedCount++; 4302 delete testPat; 4303 status = U_ZERO_ERROR; 4304 continue; 4305 } 4306 4307 if (U_FAILURE(status)) { 4308 // Some tests are supposed to generate errors. 4309 // Only report an error for tests that are supposed to succeed. 4310 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4311 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4312 { 4313 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4314 } 4315 status = U_ZERO_ERROR; 4316 delete testPat; 4317 continue; 4318 } 4319 4320 if (fields[2].indexOf(UChar_i) >= 0) { 4321 // ICU should skip this test. 4322 delete testPat; 4323 continue; 4324 } 4325 4326 if (fields[2].indexOf(UChar_c) >= 0) { 4327 // This pattern should have caused a compilation error, but didn't/ 4328 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4329 delete testPat; 4330 continue; 4331 } 4332 4333 4334 // 4335 // replace the Perl variables that appear in some of the 4336 // match data strings. 4337 // 4338 UnicodeString matchString = fields[1]; 4339 matchString.findAndReplace(nulnulSrc, nulnul); 4340 matchString.findAndReplace(ffffSrc, ffff); 4341 4342 // Replace any \n in the match string with an actual new-line char. 4343 // Don't do full unescape, as this unescapes more than Perl does, which 4344 // causes other spurious failures in the tests. 
4345 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4346 4347 // 4348 // Put the input in a UTF-8 UText 4349 // 4350 status = U_ZERO_ERROR; 4351 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4352 if (status == U_BUFFER_OVERFLOW_ERROR) { 4353 status = U_ZERO_ERROR; 4354 delete[] inputChars; 4355 inputCapacity = inputLength + 1; 4356 inputChars = new char[inputCapacity]; 4357 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4358 } 4359 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4360 4361 // 4362 // Run the test, check for expected match/don't match result. 4363 // 4364 RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status); 4365 UBool found = testMat->find(); 4366 UBool expected = FALSE; 4367 if (fields[2].indexOf(UChar_y) >=0) { 4368 expected = TRUE; 4369 } 4370 if (expected != found) { 4371 errln("line %d: Expected %smatch, got %smatch", 4372 lineNum, expected?"":"no ", found?"":"no " ); 4373 continue; 4374 } 4375 4376 // Don't try to check expected results if there is no match. 4377 // (Some have stuff in the expected fields) 4378 if (!found) { 4379 delete testMat; 4380 delete testPat; 4381 continue; 4382 } 4383 4384 // 4385 // Interpret the Perl expression from the fourth field of the data file, 4386 // building up an ICU string from the results of the ICU match. 4387 // The Perl expression will contain references to the results of 4388 // a regex match, including the matched string, capture group strings, 4389 // group starting and ending indicies, etc. 
4390 // 4391 UnicodeString resultString; 4392 UnicodeString perlExpr = fields[3]; 4393 4394 while (perlExpr.length() > 0) { 4395 groupsMat->reset(perlExpr); 4396 cgMat->reset(perlExpr); 4397 4398 if (perlExpr.startsWith("$&")) { 4399 resultString.append(testMat->group(status)); 4400 perlExpr.remove(0, 2); 4401 } 4402 4403 else if (groupsMat->lookingAt(status)) { 4404 // $-[0] $+[2] etc. 4405 UnicodeString digitString = groupsMat->group(2, status); 4406 int32_t t = 0; 4407 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4408 UnicodeString plusOrMinus = groupsMat->group(1, status); 4409 int32_t matchPosition; 4410 if (plusOrMinus.compare("+") == 0) { 4411 matchPosition = testMat->end(groupNum, status); 4412 } else { 4413 matchPosition = testMat->start(groupNum, status); 4414 } 4415 if (matchPosition != -1) { 4416 ICU_Utility::appendNumber(resultString, matchPosition); 4417 } 4418 perlExpr.remove(0, groupsMat->end(status)); 4419 } 4420 4421 else if (cgMat->lookingAt(status)) { 4422 // $1, $2, $3, etc. 4423 UnicodeString digitString = cgMat->group(1, status); 4424 int32_t t = 0; 4425 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4426 if (U_SUCCESS(status)) { 4427 resultString.append(testMat->group(groupNum, status)); 4428 status = U_ZERO_ERROR; 4429 } 4430 perlExpr.remove(0, cgMat->end(status)); 4431 } 4432 4433 else if (perlExpr.startsWith("@-")) { 4434 int32_t i; 4435 for (i=0; i<=testMat->groupCount(); i++) { 4436 if (i>0) { 4437 resultString.append(" "); 4438 } 4439 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4440 } 4441 perlExpr.remove(0, 2); 4442 } 4443 4444 else if (perlExpr.startsWith("@+")) { 4445 int32_t i; 4446 for (i=0; i<=testMat->groupCount(); i++) { 4447 if (i>0) { 4448 resultString.append(" "); 4449 } 4450 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4451 } 4452 perlExpr.remove(0, 2); 4453 } 4454 4455 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4456 // or as an escaped sequence (e.g. \n) 4457 if (perlExpr.length() > 1) { 4458 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4459 } 4460 UChar c = perlExpr.charAt(0); 4461 switch (c) { 4462 case 'n': c = '\n'; break; 4463 // add any other escape sequences that show up in the test expected results. 4464 } 4465 resultString.append(c); 4466 perlExpr.remove(0, 1); 4467 } 4468 4469 else { 4470 // Any characters from the perl expression that we don't explicitly 4471 // recognize before here are assumed to be literals and copied 4472 // as-is to the expected results. 4473 resultString.append(perlExpr.charAt(0)); 4474 perlExpr.remove(0, 1); 4475 } 4476 4477 if (U_FAILURE(status)) { 4478 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4479 break; 4480 } 4481 } 4482 4483 // 4484 // Expected Results Compare 4485 // 4486 UnicodeString expectedS(fields[4]); 4487 expectedS.findAndReplace(nulnulSrc, nulnul); 4488 expectedS.findAndReplace(ffffSrc, ffff); 4489 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4490 4491 4492 if (expectedS.compare(resultString) != 0) { 4493 err("Line %d: Incorrect perl expression results.", lineNum); 4494 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4495 } 4496 4497 delete testMat; 4498 delete testPat; 4499 } 4500 4501 // 4502 // All done. Clean up allocated stuff. 
4503 // 4504 delete cgMat; 4505 delete cgPat; 4506 4507 delete groupsMat; 4508 delete groupsPat; 4509 4510 delete flagMat; 4511 delete flagPat; 4512 4513 delete lineMat; 4514 delete linePat; 4515 4516 delete fieldPat; 4517 delete [] testData; 4518 4519 utext_close(&patternText); 4520 utext_close(&inputText); 4521 4522 delete [] patternChars; 4523 delete [] inputChars; 4524 4525 4526 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4527 4528 } 4529 4530 4531 //-------------------------------------------------------------- 4532 // 4533 // Bug6149 Verify limits to heap expansion for backtrack stack. 4534 // Use this pattern, 4535 // "(a?){1,}" 4536 // The zero-length match will repeat forever. 4537 // (That this goes into a loop is another bug) 4538 // 4539 //--------------------------------------------------------------- 4540 void RegexTest::Bug6149() { 4541 UnicodeString pattern("(a?){1,}"); 4542 UnicodeString s("xyz"); 4543 uint32_t flags = 0; 4544 UErrorCode status = U_ZERO_ERROR; 4545 4546 RegexMatcher matcher(pattern, s, flags, status); 4547 UBool result = false; 4548 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW); 4549 REGEX_ASSERT(result == FALSE); 4550 } 4551 4552 4553 // 4554 // Callbacks() Test the callback function. 4555 // When set, callbacks occur periodically during matching operations, 4556 // giving the application code the ability to abort the operation 4557 // before it's normal completion. 4558 // 4559 4560 struct callBackContext { 4561 RegexTest *test; 4562 int32_t maxCalls; 4563 int32_t numCalls; 4564 int32_t lastSteps; 4565 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4566 }; 4567 4568 U_CDECL_BEGIN 4569 static UBool U_CALLCONV 4570 testCallBackFn(const void *context, int32_t steps) { 4571 callBackContext *info = (callBackContext *)context; 4572 if (info->lastSteps+1 != steps) { 4573 info->test->errln("incorrect steps in callback. 
Expected %d, got %d\n", info->lastSteps+1, steps);
    }
    info->lastSteps = steps;
    info->numCalls++;
    // Returns FALSE once the call budget (maxCalls) is used up; Callbacks()
    //   below relies on that to abort a long-running match.
    return (info->numCalls < info->maxCalls);
}
U_CDECL_END

//
//  Callbacks()    Test the match callback function: the getter/setter pair on
//                 RegexMatcher, and the callback's effect on short, medium and
//                 long running match operations.
//
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The abort is expected to surface as U_REGEX_STOPPED_BY_CALLER, so the
        //   status is compared directly rather than through REGEX_CHECK_STATUS.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}


//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//

// Context handed to testProgressCallBackFn() through the callback's void* argument.
struct progressCallBackContext {
    RegexTest        *test;        // The test object; initialized with `this` by FindProgressCallbacks().
    int64_t          lastIndex;    // matchIndex received by the most recent callback invocation.
    int32_t          maxCalls;     // The callback returns FALSE once numCalls reaches this value.
    int32_t          numCalls;     // Number of callback invocations since the last reset().
    // Start a new measurement: set the call budget and clear the counters.
    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
};

U_CDECL_BEGIN
// Find progress callback used by FindProgressCallbacks().
//   Counts the invocation, records matchIndex, and returns TRUE only while
//   fewer than maxCalls invocations have been made.
static UBool U_CALLCONV
testProgressCallBackFn(const void *context, int64_t matchIndex) {
    progressCallBackContext  *info = (progressCallBackContext *)context;
    info->numCalls++;
    info->lastIndex = matchIndex;
//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
    return (info->numCalls < info->maxCalls);
}
U_CDECL_END

void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match should NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "abxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running match that causes matcher.find() to invoke our callback for each index.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        // NOTE(review): the limit passed here is exactly s.length(), not a value
        //   greater than it as the trailing comment says - confirm the intent.
        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        //   The test expects the interrupted find() to leave status as a success code.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

#if 0
        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        // Now retry the match from where left off
        cbInfo.maxCalls = 100; //  No callback limit
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
#endif
    }


}


//---------------------------------------------------------------------------
//
//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
//                             UTexts. The pure-C implementation of UText
//                             has no mutable backing stores, but we can
//                             use UnicodeString here to test the functionality.
4761 // 4762 //--------------------------------------------------------------------------- 4763 void RegexTest::PreAllocatedUTextCAPI () { 4764 UErrorCode status = U_ZERO_ERROR; 4765 URegularExpression *re; 4766 UText patternText = UTEXT_INITIALIZER; 4767 UnicodeString buffer; 4768 UText bufferText = UTEXT_INITIALIZER; 4769 4770 utext_openUnicodeString(&bufferText, &buffer, &status); 4771 4772 /* 4773 * getText() and getUText() 4774 */ 4775 { 4776 UText text1 = UTEXT_INITIALIZER; 4777 UText text2 = UTEXT_INITIALIZER; 4778 UChar text2Chars[20]; 4779 UText *resultText; 4780 4781 status = U_ZERO_ERROR; 4782 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 4783 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 4784 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 4785 utext_openUChars(&text2, text2Chars, -1, &status); 4786 4787 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 4788 re = uregex_openUText(&patternText, 0, NULL, &status); 4789 4790 /* First set a UText */ 4791 uregex_setUText(re, &text1, &status); 4792 resultText = uregex_getUText(re, &bufferText, &status); 4793 REGEX_CHECK_STATUS; 4794 REGEX_ASSERT(resultText == &bufferText); 4795 utext_setNativeIndex(resultText, 0); 4796 utext_setNativeIndex(&text1, 0); 4797 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4798 4799 resultText = uregex_getUText(re, &bufferText, &status); 4800 REGEX_CHECK_STATUS; 4801 REGEX_ASSERT(resultText == &bufferText); 4802 utext_setNativeIndex(resultText, 0); 4803 utext_setNativeIndex(&text1, 0); 4804 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4805 4806 /* Then set a UChar * */ 4807 uregex_setText(re, text2Chars, 7, &status); 4808 resultText = uregex_getUText(re, &bufferText, &status); 4809 REGEX_CHECK_STATUS; 4810 REGEX_ASSERT(resultText == &bufferText); 4811 utext_setNativeIndex(resultText, 0); 4812 utext_setNativeIndex(&text2, 0); 4813 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0); 
4814 4815 uregex_close(re); 4816 utext_close(&text1); 4817 utext_close(&text2); 4818 } 4819 4820 /* 4821 * group() 4822 */ 4823 { 4824 UChar text1[80]; 4825 UText *actual; 4826 UBool result; 4827 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); 4828 4829 status = U_ZERO_ERROR; 4830 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 4831 REGEX_CHECK_STATUS; 4832 4833 uregex_setText(re, text1, -1, &status); 4834 result = uregex_find(re, 0, &status); 4835 REGEX_ASSERT(result==TRUE); 4836 4837 /* Capture Group 0, the full match. Should succeed. */ 4838 status = U_ZERO_ERROR; 4839 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); 4840 REGEX_CHECK_STATUS; 4841 REGEX_ASSERT(actual == &bufferText); 4842 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); 4843 4844 /* Capture group #1. Should succeed. */ 4845 status = U_ZERO_ERROR; 4846 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); 4847 REGEX_CHECK_STATUS; 4848 REGEX_ASSERT(actual == &bufferText); 4849 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); 4850 4851 /* Capture group out of range. Error. 
*/ 4852 status = U_ZERO_ERROR; 4853 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); 4854 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 4855 REGEX_ASSERT(actual == &bufferText); 4856 4857 uregex_close(re); 4858 4859 } 4860 4861 /* 4862 * replaceFirst() 4863 */ 4864 { 4865 UChar text1[80]; 4866 UChar text2[80]; 4867 UText replText = UTEXT_INITIALIZER; 4868 UText *result; 4869 4870 status = U_ZERO_ERROR; 4871 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 4872 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 4873 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 4874 4875 re = uregex_openC("x(.*?)x", 0, NULL, &status); 4876 REGEX_CHECK_STATUS; 4877 4878 /* Normal case, with match */ 4879 uregex_setText(re, text1, -1, &status); 4880 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4881 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4882 REGEX_CHECK_STATUS; 4883 REGEX_ASSERT(result == &bufferText); 4884 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 4885 4886 /* No match. Text should copy to output with no changes. 
*/ 4887 uregex_setText(re, text2, -1, &status); 4888 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4889 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4890 REGEX_CHECK_STATUS; 4891 REGEX_ASSERT(result == &bufferText); 4892 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 4893 4894 /* Unicode escapes */ 4895 uregex_setText(re, text1, -1, &status); 4896 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); 4897 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4898 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 4899 REGEX_CHECK_STATUS; 4900 REGEX_ASSERT(result == &bufferText); 4901 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 4902 4903 uregex_close(re); 4904 utext_close(&replText); 4905 } 4906 4907 4908 /* 4909 * replaceAll() 4910 */ 4911 { 4912 UChar text1[80]; 4913 UChar text2[80]; 4914 UText replText = UTEXT_INITIALIZER; 4915 UText *result; 4916 4917 status = U_ZERO_ERROR; 4918 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 4919 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 4920 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 4921 4922 re = uregex_openC("x(.*?)x", 0, NULL, &status); 4923 REGEX_CHECK_STATUS; 4924 4925 /* Normal case, with match */ 4926 uregex_setText(re, text1, -1, &status); 4927 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4928 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 4929 REGEX_CHECK_STATUS; 4930 REGEX_ASSERT(result == &bufferText); 4931 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 4932 4933 /* No match. Text should copy to output with no changes. 
*/ 4934 uregex_setText(re, text2, -1, &status); 4935 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 4936 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 4937 REGEX_CHECK_STATUS; 4938 REGEX_ASSERT(result == &bufferText); 4939 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 4940 4941 uregex_close(re); 4942 utext_close(&replText); 4943 } 4944 4945 4946 /* 4947 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 4948 * so we don't need to test it here. 4949 */ 4950 4951 utext_close(&bufferText); 4952 utext_close(&patternText); 4953 } 4954 4955 //-------------------------------------------------------------- 4956 // 4957 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 4958 // 4959 //--------------------------------------------------------------- 4960 void RegexTest::Bug7651() { 4961 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 4962 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 4963 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 
4964 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 4965 UnicodeString s("#ff @abcd This is test"); 4966 RegexPattern *REPattern = NULL; 4967 RegexMatcher *REMatcher = NULL; 4968 UErrorCode status = U_ZERO_ERROR; 4969 UParseError pe; 4970 4971 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 4972 REGEX_CHECK_STATUS; 4973 REMatcher = REPattern->matcher(s, status); 4974 REGEX_CHECK_STATUS; 4975 REGEX_ASSERT(REMatcher->find()); 4976 REGEX_ASSERT(REMatcher->start(status) == 0); 4977 delete REPattern; 4978 delete REMatcher; 4979 status = U_ZERO_ERROR; 4980 4981 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 4982 REGEX_CHECK_STATUS; 4983 REMatcher = REPattern->matcher(s, status); 4984 REGEX_CHECK_STATUS; 4985 REGEX_ASSERT(REMatcher->find()); 4986 REGEX_ASSERT(REMatcher->start(status) == 0); 4987 delete REPattern; 4988 delete REMatcher; 4989 status = U_ZERO_ERROR; 4990 } 4991 4992 void RegexTest::Bug7740() { 4993 UErrorCode status = U_ZERO_ERROR; 4994 UnicodeString pattern = "(a)"; 4995 UnicodeString text = "abcdef"; 4996 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 4997 REGEX_CHECK_STATUS; 4998 REGEX_ASSERT(m->lookingAt(status)); 4999 REGEX_CHECK_STATUS; 5000 status = U_ILLEGAL_ARGUMENT_ERROR; 5001 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5002 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5003 REGEX_ASSERT(s == ""); 5004 delete m; 5005 } 5006 5007 5008 5009 5010 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5011 5012