1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_COLLATION 12 13 #include "unicode/unistr.h" 14 #include "unicode/putil.h" 15 #include "unicode/usearch.h" 16 17 #include "cmemory.h" 18 #include "unicode/coll.h" 19 #include "unicode/tblcoll.h" 20 #include "unicode/coleitr.h" 21 #include "unicode/ucoleitr.h" 22 23 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 24 25 #include "unicode/uniset.h" 26 #include "unicode/uset.h" 27 #include "unicode/ustring.h" 28 #include "hash.h" 29 #include "uhash.h" 30 #include "ucol_imp.h" 31 32 #include "intltest.h" 33 #include "ssearch.h" 34 35 #include "unicode/colldata.h" 36 #include "unicode/bmsearch.h" 37 #include "unicode/bms.h" 38 39 #include "xmlparser.h" 40 #include "ucbuf.h" 41 42 #include <stdlib.h> 43 #include <string.h> 44 #include <stdio.h> 45 46 char testId[100]; 47 48 #define TEST_ASSERT(x) {if (!(x)) { \ 49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}} 50 51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \ 52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}} 53 54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \ 55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \ 56 __FILE__, __LINE__, testId, u_errorName(errcode));}} 57 58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 60 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 61 62 //--------------------------------------------------------------------------- 63 // 64 // Test class boilerplate 65 // 66 //--------------------------------------------------------------------------- 67 SSearchTest::SSearchTest() 68 { 69 } 70 71 SSearchTest::~SSearchTest() 72 { 73 } 74 75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) 76 { 77 if (exec) logln("TestSuite SSearchTest: "); 78 switch (index) { 79 #if !UCONFIG_NO_BREAK_ITERATION 80 case 0: name = "searchTest"; 81 if (exec) searchTest(); 82 break; 83 84 case 1: name = "offsetTest"; 85 if (exec) offsetTest(); 86 break; 87 88 case 2: name = "monkeyTest"; 89 if (exec) monkeyTest(params); 90 break; 91 92 case 3: name = "bmMonkeyTest"; 93 if (exec) bmMonkeyTest(params); 94 break; 95 96 case 4: name = "boyerMooreTest"; 97 if (exec) boyerMooreTest(); 98 break; 99 100 case 5: name = "goodSuffixTest"; 101 if (exec) goodSuffixTest(); 102 break; 103 104 case 6: name = "searchTime"; 105 if (exec) searchTime(); 106 break; 107 108 case 7: name = "bmsTest"; 109 if (exec) bmsTest(); 110 break; 111 112 case 8: name = "bmSearchTest"; 113 if (exec) bmSearchTest(); 114 break; 115 116 case 9: name = "udhrTest"; 117 if (exec) udhrTest(); 118 break; 119 case 10: name = "stringListTest"; 120 if (exec) stringListTest(); 121 break; 122 #endif 123 default: name = ""; 124 break; //needed to end loop 125 } 126 } 127 128 129 #if !UCONFIG_NO_BREAK_ITERATION 130 131 #define PATH_BUFFER_SIZE 2048 132 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { 133 UErrorCode status = U_ZERO_ERROR; 134 const char *testDataDirectory = IntlTest::getSourceTestData(status); 135 136 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { 137 errln("ERROR: getPath() failed - %s", u_errorName(status)); 138 return NULL; 139 } 140 141 strcpy(buffer, testDataDirectory); 142 strcat(buffer, filename); 143 return buffer; 144 } 145 146 147 void SSearchTest::searchTest() 148 { 149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 150 UErrorCode status = U_ZERO_ERROR; 151 char path[PATH_BUFFER_SIZE]; 152 const char *testFilePath = getPath(path, "ssearch.xml"); 153 154 if (testFilePath == NULL) { 155 return; /* Couldn't get path: error message already output. */ 156 } 157 158 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); 159 TEST_ASSERT_SUCCESS(status); 160 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); 161 TEST_ASSERT_SUCCESS(status); 162 if (U_FAILURE(status)) { 163 return; 164 } 165 166 const UnicodeString *debugTestCase = root->getAttribute("debug"); 167 if (debugTestCase != NULL) { 168 // setenv("USEARCH_DEBUG", "1", 1); 169 } 170 171 172 const UXMLElement *testCase; 173 int32_t tc = 0; 174 175 while((testCase = root->nextChildElement(tc)) != NULL) { 176 177 if (testCase->getTagName().compare("test-case") != 0) { 178 errln("ssearch, unrecognized XML Element in test file"); 179 continue; 180 } 181 const UnicodeString *id = testCase->getAttribute("id"); 182 *testId = 0; 183 if (id != NULL) { 184 id->extract(0, id->length(), testId, sizeof(testId), US_INV); 185 } 186 187 // If debugging test case has been specified and this is not it, skip to next. 188 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { 189 continue; 190 } 191 // 192 // Get the requested collation strength. 193 // Default is tertiary if the XML attribute is missing from the test case. 194 // 195 const UnicodeString *strength = testCase->getAttribute("strength"); 196 UColAttributeValue collatorStrength = UCOL_PRIMARY; 197 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} 198 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} 199 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} 200 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} 201 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} 202 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} 203 else { 204 // Bogus value supplied for strength. Shouldn't happen, even from 205 // typos, if the XML source has been validated. 206 // This assert is a little deceiving in that strength can be 207 // any of the allowed values, not just TERTIARY, but it will 208 // do the job of getting the error output. 209 TEST_ASSERT(*strength=="TERTIARY") 210 } 211 212 // 213 // Get the collator normalization flag. Default is UCOL_OFF. 214 // 215 UColAttributeValue normalize = UCOL_OFF; 216 const UnicodeString *norm = testCase->getAttribute("norm"); 217 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); 218 if (norm!=NULL && *norm=="ON") { 219 normalize = UCOL_ON; 220 } 221 222 // 223 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 224 // 225 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; 226 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); 227 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); 228 if (alt != NULL && *alt == "SHIFTED") { 229 alternateHandling = UCOL_SHIFTED; 230 } 231 232 const UnicodeString defLocale("en"); 233 char clocale[100]; 234 const UnicodeString *locale = testCase->getAttribute("locale"); 235 if (locale == NULL || locale->length()==0) { 236 locale = &defLocale; 237 }; 238 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); 239 240 241 UnicodeString text; 242 UnicodeString target; 243 UnicodeString pattern; 244 int32_t expectedMatchStart = -1; 245 int32_t expectedMatchLimit = -1; 246 const UXMLElement *n; 247 int32_t nodeCount = 0; 248 249 n = testCase->getChildElement("pattern"); 250 TEST_ASSERT(n != NULL); 251 if (n==NULL) { 252 continue; 253 } 254 text = n->getText(FALSE); 255 text = text.unescape(); 256 pattern.append(text); 257 nodeCount++; 258 259 n = testCase->getChildElement("pre"); 260 if (n!=NULL) { 261 text = n->getText(FALSE); 262 text = text.unescape(); 263 target.append(text); 264 nodeCount++; 265 } 266 267 n = testCase->getChildElement("m"); 268 if (n!=NULL) { 269 expectedMatchStart = target.length(); 270 text = n->getText(FALSE); 271 text = text.unescape(); 272 target.append(text); 273 expectedMatchLimit = target.length(); 274 nodeCount++; 275 } 276 277 n = testCase->getChildElement("post"); 278 if (n!=NULL) { 279 text = n->getText(FALSE); 280 text = text.unescape(); 281 target.append(text); 282 nodeCount++; 283 } 284 285 // Check that there weren't extra things in the XML 286 TEST_ASSERT(nodeCount == testCase->countChildren()); 287 288 // Open a collator and StringSearch based on the parameters 289 // obtained from the XML. 290 // 291 status = U_ZERO_ERROR; 292 LocalUCollatorPointer collator(ucol_open(clocale, &status)); 293 ucol_setStrength(collator.getAlias(), collatorStrength); 294 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); 295 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status); 296 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 297 target.getBuffer(), target.length(), 298 collator.getAlias(), 299 NULL, // the break iterator 300 &status)); 301 302 TEST_ASSERT_SUCCESS(status); 303 if (U_FAILURE(status)) { 304 continue; 305 } 306 307 int32_t foundStart = 0; 308 int32_t foundLimit = 0; 309 UBool foundMatch; 310 311 // 312 // Do the search, check the match result against the expected results. 313 // 314 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status); 315 TEST_ASSERT_SUCCESS(status); 316 if ((foundMatch && expectedMatchStart<0) || 317 (foundStart != expectedMatchStart) || 318 (foundLimit != expectedMatchLimit)) { 319 TEST_ASSERT(FALSE); // ouput generic error position 320 infoln("Found, expected match start = %d, %d \n" 321 "Found, expected match limit = %d, %d", 322 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 323 } 324 325 // In case there are other matches... 326 // (should we only do this if the test case passed?) 327 while (foundMatch) { 328 expectedMatchStart = foundStart; 329 expectedMatchLimit = foundLimit; 330 331 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status); 332 } 333 334 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 335 target.getBuffer(), target.length(), 336 collator.getAlias(), 337 NULL, 338 &status)); 339 340 // 341 // Do the backwards search, check the match result against the expected results. 342 // 343 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status); 344 TEST_ASSERT_SUCCESS(status); 345 if ((foundMatch && expectedMatchStart<0) || 346 (foundStart != expectedMatchStart) || 347 (foundLimit != expectedMatchLimit)) { 348 TEST_ASSERT(FALSE); // ouput generic error position 349 infoln("Found, expected backwards match start = %d, %d \n" 350 "Found, expected backwards match limit = %d, %d", 351 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 352 } 353 } 354 #endif 355 } 356 357 struct UdhrTestCase 358 { 359 const char *locale; 360 const char *file; 361 }; 362 363 void SSearchTest::udhrTest() 364 { 365 UErrorCode status = U_ZERO_ERROR; 366 char path[PATH_BUFFER_SIZE]; 367 const char *udhrPath = getPath(path, "udhr"); 368 369 if (udhrPath == NULL) { 370 // couldn't get path: error message already output... 371 return; 372 } 373 374 UdhrTestCase testCases[] = { 375 {"en", "udhr_eng.txt"}, 376 {"de", "udhr_deu_1996.txt"}, 377 {"fr", "udhr_fra.txt"}, 378 {"ru", "udhr_rus.txt"}, 379 {"th", "udhr_tha.txt"}, 380 {"ja", "udhr_jpn.txt"}, 381 {"ko", "udhr_kor.txt"}, 382 {"zh", "udhr_cmn_hans.txt"}, 383 {"zh_Hant", "udhr_cmn_hant.txt"} 384 }; 385 386 int32_t testCount = ARRAY_SIZE(testCases); 387 388 for (int32_t t = 0; t < testCount; t += 1) { 389 int32_t len = 0; 390 char *resolvedFileName = NULL; 391 const char *encoding = NULL; 392 UCHARBUF *ucharBuf = NULL; 393 394 ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status); 395 resolvedFileName = NEW_ARRAY(char, len); 396 397 if(resolvedFileName == NULL){ 398 continue; 399 } 400 401 if(status == U_BUFFER_OVERFLOW_ERROR){ 402 status = U_ZERO_ERROR; 403 } 404 405 ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status); 406 ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status); 407 408 DELETE_ARRAY(resolvedFileName); 409 410 if(U_FAILURE(status)){ 411 infoln("Could not open the input file %s. Test skipped\n", testCases[t].file); 412 continue; 413 } 414 415 int32_t targetLen = 0; 416 const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status); 417 418 /* The first line of the file contains the pattern */ 419 int32_t start = 0, end = 0, plen = 0; 420 421 for(end = start; ; end += 1) { 422 UChar ch = target[end]; 423 424 if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { 425 break; 426 } 427 } 428 429 plen = end - start; 430 431 UChar *pattern = NEW_ARRAY(UChar, plen); 432 for (int32_t i = 0; i < plen; i += 1) { 433 pattern[i] = target[start++]; 434 } 435 436 int32_t offset = 0; 437 UCollator *coll = ucol_open(testCases[t].locale, &status); 438 UCD *ucd = NULL; 439 BMS *bms = NULL; 440 441 if (U_FAILURE(status)) { 442 errln("Could not open collator for %s", testCases[t].locale); 443 goto delete_collator; 444 } 445 446 ucd = ucd_open(coll, &status); 447 448 if (U_FAILURE(status)) { 449 errln("Could not open CollData object for %s", testCases[t].locale); 450 goto delete_ucd; 451 } 452 453 bms = bms_open(ucd, pattern, plen, target, targetLen, &status); 454 455 if (U_FAILURE(status)) { 456 errln("Could not open search object for %s", testCases[t].locale); 457 goto delete_bms; 458 } 459 460 start = end = -1; 461 while (bms_search(bms, offset, &start, &end)) { 462 offset = end; 463 } 464 465 if (offset == 0) { 466 errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file); 467 } 468 469 delete_bms: 470 bms_close(bms); 471 472 delete_ucd: 473 ucd_close(ucd); 474 475 delete_collator: 476 ucol_close(coll); 477 478 DELETE_ARRAY(pattern); 479 ucbuf_close(ucharBuf); 480 } 481 482 ucd_flushCache(); 483 } 484 485 void SSearchTest::bmSearchTest() 486 { 487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 488 UErrorCode status = U_ZERO_ERROR; 489 char path[PATH_BUFFER_SIZE]; 490 const char *testFilePath = getPath(path, "ssearch.xml"); 491 492 if (testFilePath == NULL) { 493 return; /* Couldn't get path: error message already output. */ 494 } 495 496 UXMLParser *parser = UXMLParser::createParser(status); 497 TEST_ASSERT_SUCCESS(status); 498 UXMLElement *root = parser->parseFile(testFilePath, status); 499 TEST_ASSERT_SUCCESS(status); 500 if (U_FAILURE(status)) { 501 return; 502 } 503 504 const UnicodeString *debugTestCase = root->getAttribute("debug"); 505 if (debugTestCase != NULL) { 506 // setenv("USEARCH_DEBUG", "1", 1); 507 } 508 509 510 const UXMLElement *testCase; 511 int32_t tc = 0; 512 513 while((testCase = root->nextChildElement(tc)) != NULL) { 514 515 if (testCase->getTagName().compare("test-case") != 0) { 516 errln("ssearch, unrecognized XML Element in test file"); 517 continue; 518 } 519 const UnicodeString *id = testCase->getAttribute("id"); 520 *testId = 0; 521 if (id != NULL) { 522 id->extract(0, id->length(), testId, sizeof(testId), US_INV); 523 } 524 525 // If debugging test case has been specified and this is not it, skip to next. 526 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { 527 continue; 528 } 529 // 530 // Get the requested collation strength. 531 // Default is tertiary if the XML attribute is missing from the test case. 532 // 533 const UnicodeString *strength = testCase->getAttribute("strength"); 534 UColAttributeValue collatorStrength = UCOL_PRIMARY; 535 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} 536 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} 537 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} 538 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} 539 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} 540 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} 541 else { 542 // Bogus value supplied for strength. Shouldn't happen, even from 543 // typos, if the XML source has been validated. 544 // This assert is a little deceiving in that strength can be 545 // any of the allowed values, not just TERTIARY, but it will 546 // do the job of getting the error output. 547 TEST_ASSERT(*strength=="TERTIARY") 548 } 549 550 // 551 // Get the collator normalization flag. Default is UCOL_OFF. 552 // 553 UColAttributeValue normalize = UCOL_OFF; 554 const UnicodeString *norm = testCase->getAttribute("norm"); 555 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); 556 if (norm!=NULL && *norm=="ON") { 557 normalize = UCOL_ON; 558 } 559 560 // 561 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 562 // 563 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; 564 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); 565 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); 566 if (alt != NULL && *alt == "SHIFTED") { 567 alternateHandling = UCOL_SHIFTED; 568 } 569 570 const UnicodeString defLocale("en"); 571 char clocale[100]; 572 const UnicodeString *locale = testCase->getAttribute("locale"); 573 if (locale == NULL || locale->length()==0) { 574 locale = &defLocale; 575 }; 576 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); 577 578 579 UnicodeString text; 580 UnicodeString target; 581 UnicodeString pattern; 582 int32_t expectedMatchStart = -1; 583 int32_t expectedMatchLimit = -1; 584 const UXMLElement *n; 585 int32_t nodeCount = 0; 586 587 n = testCase->getChildElement("pattern"); 588 TEST_ASSERT(n != NULL); 589 if (n==NULL) { 590 continue; 591 } 592 text = n->getText(FALSE); 593 text = text.unescape(); 594 pattern.append(text); 595 nodeCount++; 596 597 n = testCase->getChildElement("pre"); 598 if (n!=NULL) { 599 text = n->getText(FALSE); 600 text = text.unescape(); 601 target.append(text); 602 nodeCount++; 603 } 604 605 n = testCase->getChildElement("m"); 606 if (n!=NULL) { 607 expectedMatchStart = target.length(); 608 text = n->getText(FALSE); 609 text = text.unescape(); 610 target.append(text); 611 expectedMatchLimit = target.length(); 612 nodeCount++; 613 } 614 615 n = testCase->getChildElement("post"); 616 if (n!=NULL) { 617 text = n->getText(FALSE); 618 text = text.unescape(); 619 target.append(text); 620 nodeCount++; 621 } 622 623 // Check that there weren't extra things in the XML 624 TEST_ASSERT(nodeCount == testCase->countChildren()); 625 626 // Open a collator and StringSearch based on the parameters 627 // obtained from the XML. 628 // 629 status = U_ZERO_ERROR; 630 UCollator *collator = ucol_open(clocale, &status); 631 ucol_setStrength(collator, collatorStrength); 632 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); 633 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); 634 UCD *ucd = ucd_open(collator, &status); 635 BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status); 636 637 TEST_ASSERT_SUCCESS(status); 638 if (U_FAILURE(status)) { 639 bms_close(bms); 640 ucd_close(ucd); 641 ucol_close(collator); 642 continue; 643 } 644 645 int32_t foundStart = 0; 646 int32_t foundLimit = 0; 647 UBool foundMatch; 648 649 // 650 // Do the search, check the match result against the expected results. 651 // 652 foundMatch = bms_search(bms, 0, &foundStart, &foundLimit); 653 //TEST_ASSERT_SUCCESS(status); 654 if ((foundMatch && expectedMatchStart < 0) || 655 (foundStart != expectedMatchStart) || 656 (foundLimit != expectedMatchLimit)) { 657 TEST_ASSERT(FALSE); // ouput generic error position 658 infoln("Found, expected match start = %d, %d \n" 659 "Found, expected match limit = %d, %d", 660 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 661 } 662 663 bms_close(bms); 664 ucd_close(ucd); 665 ucol_close(collator); 666 } 667 668 ucd_flushCache(); 669 delete root; 670 delete parser; 671 #endif 672 } 673 674 struct Order 675 { 676 int32_t order; 677 int32_t lowOffset; 678 int32_t highOffset; 679 }; 680 681 class OrderList 682 { 683 public: 684 OrderList(); 685 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); 686 ~OrderList(); 687 688 int32_t size(void) const; 689 void add(int32_t order, int32_t low, int32_t high); 690 const Order *get(int32_t index) const; 691 int32_t getLowOffset(int32_t index) const; 692 int32_t getHighOffset(int32_t index) const; 693 int32_t getOrder(int32_t index) const; 694 void reverse(void); 695 UBool compare(const OrderList &other) const; 696 UBool matchesAt(int32_t offset, const OrderList &other) const; 697 698 private: 699 Order *list; 700 int32_t listMax; 701 int32_t listSize; 702 }; 703 704 OrderList::OrderList() 705 : list(NULL), listMax(16), listSize(0) 706 { 707 list = new Order[listMax]; 708 } 709 710 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) 711 : list(NULL), listMax(16), listSize(0) 712 { 713 UErrorCode status = U_ZERO_ERROR; 714 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 715 uint32_t strengthMask = 0; 716 int32_t order, low, high; 717 718 switch (ucol_getStrength(coll)) 719 { 720 default: 721 strengthMask |= UCOL_TERTIARYORDERMASK; 722 /* fall through */ 723 724 case UCOL_SECONDARY: 725 strengthMask |= UCOL_SECONDARYORDERMASK; 726 /* fall through */ 727 728 case UCOL_PRIMARY: 729 strengthMask |= UCOL_PRIMARYORDERMASK; 730 } 731 732 list = new Order[listMax]; 733 734 ucol_setOffset(elems, stringOffset, &status); 735 736 do { 737 low = ucol_getOffset(elems); 738 order = ucol_next(elems, &status); 739 high = ucol_getOffset(elems); 740 741 if (order != UCOL_NULLORDER) { 742 order &= strengthMask; 743 } 744 745 if (order != UCOL_IGNORABLE) { 746 add(order, low, high); 747 } 748 } while (order != UCOL_NULLORDER); 749 750 ucol_closeElements(elems); 751 } 752 753 OrderList::~OrderList() 754 { 755 delete[] list; 756 } 757 758 void OrderList::add(int32_t order, int32_t low, int32_t high) 759 { 760 if (listSize >= listMax) { 761 listMax *= 2; 762 763 Order *newList = new Order[listMax]; 764 765 uprv_memcpy(newList, list, listSize * sizeof(Order)); 766 delete[] list; 767 list = newList; 768 } 769 770 list[listSize].order = order; 771 list[listSize].lowOffset = low; 772 list[listSize].highOffset = high; 773 774 listSize += 1; 775 } 776 777 const Order *OrderList::get(int32_t index) const 778 { 779 if (index >= listSize) { 780 return NULL; 781 } 782 783 return &list[index]; 784 } 785 786 int32_t OrderList::getLowOffset(int32_t index) const 787 { 788 const Order *order = get(index); 789 790 if (order != NULL) { 791 return order->lowOffset; 792 } 793 794 return -1; 795 } 796 797 int32_t OrderList::getHighOffset(int32_t index) const 798 { 799 const Order *order = get(index); 800 801 if (order != NULL) { 802 return order->highOffset; 803 } 804 805 return -1; 806 } 807 808 int32_t OrderList::getOrder(int32_t index) const 809 { 810 const Order *order = get(index); 811 812 if (order != NULL) { 813 return order->order; 814 } 815 816 return UCOL_NULLORDER; 817 } 818 819 int32_t OrderList::size() const 820 { 821 return listSize; 822 } 823 824 void OrderList::reverse() 825 { 826 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { 827 Order swap = list[b]; 828 829 list[b] = list[f]; 830 list[f] = swap; 831 } 832 } 833 834 UBool OrderList::compare(const OrderList &other) const 835 { 836 if (listSize != other.listSize) { 837 return FALSE; 838 } 839 840 for(int32_t i = 0; i < listSize; i += 1) { 841 if (list[i].order != other.list[i].order || 842 list[i].lowOffset != other.list[i].lowOffset || 843 list[i].highOffset != other.list[i].highOffset) { 844 return FALSE; 845 } 846 } 847 848 return TRUE; 849 } 850 851 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const 852 { 853 // NOTE: sizes include the NULLORDER, which we don't want to compare. 854 int32_t otherSize = other.size() - 1; 855 856 if (listSize - 1 - offset < otherSize) { 857 return FALSE; 858 } 859 860 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { 861 if (getOrder(i) != other.getOrder(j)) { 862 return FALSE; 863 } 864 } 865 866 return TRUE; 867 } 868 869 static char *printOffsets(char *buffer, OrderList &list) 870 { 871 int32_t size = list.size(); 872 char *s = buffer; 873 874 for(int32_t i = 0; i < size; i += 1) { 875 const Order *order = list.get(i); 876 877 if (i != 0) { 878 s += sprintf(s, ", "); 879 } 880 881 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); 882 } 883 884 return buffer; 885 } 886 887 static char *printOrders(char *buffer, OrderList &list) 888 { 889 int32_t size = list.size(); 890 char *s = buffer; 891 892 for(int32_t i = 0; i < size; i += 1) { 893 const Order *order = list.get(i); 894 895 if (i != 0) { 896 s += sprintf(s, ", "); 897 } 898 899 s += sprintf(s, "%8.8X", order->order); 900 } 901 902 return buffer; 903 } 904 905 void SSearchTest::offsetTest() 906 { 907 const char *test[] = { 908 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous 909 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. 910 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", 911 912 "\\ua191\\u16ef\\u2036\\u017a", 913 914 #if 0 915 // This results in a complex interaction between contraction, 916 // expansion and normalization that confuses the backwards offset fixups. 917 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 918 #endif 919 920 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 921 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", 922 923 "\\u02FE\\u02FF" 924 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" 925 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" 926 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" 927 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" 928 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 929 930 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081 931 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 932 "a\\u02FF\\u0316\\u0301", 933 "a\\u0430\\u0301\\u0316", 934 "a\\u0430\\u0316\\u0301", 935 "abc\\u0E41\\u0301\\u0316", 936 "abc\\u0E41\\u0316\\u0301", 937 "\\u0E41\\u0301\\u0316", 938 "\\u0E41\\u0316\\u0301", 939 "a\\u0301\\u0316", 940 "a\\u0316\\u0301", 941 "\\uAC52\\uAC53", 942 "\\u34CA\\u34CB", 943 "\\u11ED\\u11EE", 944 "\\u30C3\\u30D0", 945 "p\\u00E9ch\\u00E9", 946 "a\\u0301\\u0325", 947 "a\\u0300\\u0325", 948 "a\\u0325\\u0300", 949 "A\\u0323\\u0300B", 950 "A\\u0300\\u0323B", 951 "A\\u0301\\u0323B", 952 "A\\u0302\\u0301\\u0323B", 953 "abc", 954 "ab\\u0300c", 955 "ab\\u0300\\u0323c", 956 " \\uD800\\uDC00\\uDC00", 957 "a\\uD800\\uDC00\\uDC00", 958 "A\\u0301\\u0301", 959 "A\\u0301\\u0323", 960 "A\\u0301\\u0323B", 961 "B\\u0301\\u0323C", 962 "A\\u0300\\u0323B", 963 "\\u0301A\\u0301\\u0301", 964 "abcd\\r\\u0301", 965 "p\\u00EAche", 966 "pe\\u0302che", 967 }; 968 969 int32_t testCount = ARRAY_SIZE(test); 970 UErrorCode status = U_ZERO_ERROR; 971 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); 972 if (U_FAILURE(status)) { 973 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status)); 974 return; 975 } 976 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... 977 // We could allocate one that's the right size by (CE_count * 10) + 2 978 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" 979 980 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); 981 982 for(int32_t i = 0; i < testCount; i += 1) { 983 if (!isICUVersionAtLeast(51, 1) && i>=4 && i<=6) { 984 continue; // timebomb until ticket #9156 (was #8081) is resolved 985 } 986 UnicodeString ts = CharsToUnicodeString(test[i]); 987 CollationElementIterator *iter = col->createCollationElementIterator(ts); 988 OrderList forwardList; 989 OrderList backwardList; 990 int32_t order, low, high; 991 992 do { 993 low = iter->getOffset(); 994 order = iter->next(status); 995 high = iter->getOffset(); 996 997 forwardList.add(order, low, high); 998 } while (order != CollationElementIterator::NULLORDER); 999 1000 iter->reset(); 1001 iter->setOffset(ts.length(), status); 1002 1003 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); 1004 1005 do { 1006 high = iter->getOffset(); 1007 order = iter->previous(status); 1008 low = iter->getOffset(); 1009 1010 if (order == CollationElementIterator::NULLORDER) { 1011 break; 1012 } 1013 1014 backwardList.add(order, low, high); 1015 } while (TRUE); 1016 1017 backwardList.reverse(); 1018 1019 if (forwardList.compare(backwardList)) { 1020 logln("Works with \"%s\"", test[i]); 1021 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 1022 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 1023 1024 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 1025 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 1026 1027 logln(); 1028 } else { 1029 errln("Fails with \"%s\"", test[i]); 1030 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 1031 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 1032 1033 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 1034 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 1035 1036 infoln(); 1037 } 1038 delete iter; 1039 } 1040 delete col; 1041 } 1042 1043 #if 0 1044 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) 1045 { 1046 for(int32_t i = 0; i < string.length(); i += 1) { 1047 UChar32 ch = string.char32At(i); 1048 1049 if (ch >= 0x0020 && ch <= 0x007F) { 1050 if (ch == 0x005C) { 1051 buffer.append("\\\\"); 1052 } else { 1053 buffer.append(ch); 1054 } 1055 } else { 1056 char cbuffer[12]; 1057 1058 if (ch <= 0xFFFFL) { 1059 sprintf(cbuffer, "\\u%4.4X", ch); 1060 } else { 1061 sprintf(cbuffer, "\\U%8.8X", ch); 1062 } 1063 1064 buffer.append(cbuffer); 1065 } 1066 1067 if (ch >= 0x10000L) { 1068 i += 1; 1069 } 1070 } 1071 1072 return buffer; 1073 } 1074 #endif 1075 1076 #if 1 1077 1078 struct PCE 1079 { 1080 uint64_t ce; 1081 int32_t lowOffset; 1082 int32_t highOffset; 1083 }; 1084 1085 class PCEList 1086 { 1087 public: 1088 PCEList(UCollator *coll, const UnicodeString &string); 1089 ~PCEList(); 1090 1091 int32_t size() const; 1092 1093 const PCE *get(int32_t index) const; 1094 1095 int32_t getLowOffset(int32_t index) const; 1096 int32_t getHighOffset(int32_t index) const; 1097 uint64_t getOrder(int32_t index) const; 1098 1099 UBool matchesAt(int32_t offset, const PCEList &other) const; 1100 1101 uint64_t operator[](int32_t index) const; 1102 1103 private: 1104 void add(uint64_t ce, int32_t low, int32_t high); 1105 1106 PCE *list; 1107 int32_t listMax; 1108 int32_t listSize; 1109 }; 1110 1111 PCEList::PCEList(UCollator *coll, const UnicodeString &string) 1112 { 1113 UErrorCode status = U_ZERO_ERROR; 1114 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 1115 uint64_t order; 1116 int32_t low, high; 1117 1118 list = new PCE[listMax]; 1119 1120 ucol_setOffset(elems, 0, &status); 1121 1122 do { 1123 order = ucol_nextProcessed(elems, &low, &high, &status); 1124 add(order, low, high); 1125 } while (order != UCOL_PROCESSED_NULLORDER); 1126 1127 ucol_closeElements(elems); 1128 } 1129 1130 PCEList::~PCEList() 1131 { 1132 delete[] list; 1133 } 1134 1135 void PCEList::add(uint64_t order, int32_t low, int32_t high) 1136 { 1137 if (listSize >= listMax) { 1138 listMax *= 2; 1139 1140 PCE *newList = new PCE[listMax]; 1141 1142 uprv_memcpy(newList, list, listSize * sizeof(Order)); 1143 delete[] list; 1144 list = newList; 1145 } 1146 1147 list[listSize].ce = order; 1148 list[listSize].lowOffset = low; 1149 list[listSize].highOffset = high; 1150 1151 listSize += 1; 1152 } 1153 1154 const PCE *PCEList::get(int32_t index) const 1155 { 1156 if (index >= listSize) { 1157 return NULL; 1158 } 1159 1160 return &list[index]; 1161 } 1162 1163 int32_t PCEList::getLowOffset(int32_t index) const 1164 { 1165 const PCE *pce = get(index); 1166 1167 if (pce != NULL) { 1168 return pce->lowOffset; 1169 } 1170 1171 return -1; 1172 } 1173 1174 int32_t PCEList::getHighOffset(int32_t index) const 1175 { 1176 const PCE *pce = get(index); 1177 1178 if (pce != NULL) { 1179 return pce->highOffset; 1180 } 1181 1182 return -1; 1183 } 1184 1185 uint64_t PCEList::getOrder(int32_t index) const 1186 { 1187 const PCE *pce = get(index); 1188 1189 if (pce != NULL) { 1190 return pce->ce; 1191 } 1192 1193 return UCOL_PROCESSED_NULLORDER; 1194 } 1195 1196 int32_t PCEList::size() const 1197 { 1198 return listSize; 1199 } 1200 1201 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const 1202 { 1203 // NOTE: sizes include the NULLORDER, which we don't want to compare. 1204 int32_t otherSize = other.size() - 1; 1205 1206 if (listSize - 1 - offset < otherSize) { 1207 return FALSE; 1208 } 1209 1210 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { 1211 if (getOrder(i) != other.getOrder(j)) { 1212 return FALSE; 1213 } 1214 } 1215 1216 return TRUE; 1217 } 1218 1219 uint64_t PCEList::operator[](int32_t index) const 1220 { 1221 return getOrder(index); 1222 } 1223 1224 void SSearchTest::boyerMooreTest() 1225 { 1226 UErrorCode status = U_ZERO_ERROR; 1227 UCollator *coll = NULL; 1228 CollData *data = NULL; 1229 const CEList* ce = NULL; 1230 const CEList* ce1 = NULL; 1231 UnicodeString lp = "fuss"; 1232 UnicodeString sp = "fu\\u00DF"; 1233 BoyerMooreSearch *longPattern = NULL; 1234 BoyerMooreSearch *shortPattern = NULL; 1235 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", 1236 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", 1237 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; 1238 int32_t start = -1, end = -1; 1239 1240 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); 1241 if (U_FAILURE(status)) { 1242 errcheckln(status, "Could not open collator. - %s", u_errorName(status)); 1243 return; 1244 } 1245 1246 data = CollData::open(coll, status); 1247 if (U_FAILURE(status)) { 1248 errln("Could not open CollData object."); 1249 goto close_data; 1250 } 1251 1252 data->getDynamicClassID(); 1253 if (U_FAILURE(status)) { 1254 errln("Could not get dynamic class ID of CollData."); 1255 goto close_patterns; 1256 } 1257 1258 data->getStaticClassID(); 1259 if (U_FAILURE(status)) { 1260 errln("Could not get static class ID of CollData."); 1261 goto close_patterns; 1262 } 1263 1264 longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status); 1265 shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status); 1266 if (U_FAILURE(status)) { 1267 errln("Could not create pattern objects."); 1268 goto close_patterns; 1269 } 1270 1271 longPattern->getBadCharacterTable(); 1272 shortPattern->getBadCharacterTable(); 1273 if (U_FAILURE(status)) { 1274 errln("Could not get bad character table."); 1275 goto close_patterns; 1276 } 1277 1278 longPattern->getGoodSuffixTable(); 1279 shortPattern->getGoodSuffixTable(); 1280 if (U_FAILURE(status)) { 1281 errln("Could not get good suffix table."); 1282 goto close_patterns; 1283 } 1284 1285 longPattern->getDynamicClassID(); 1286 shortPattern->getDynamicClassID(); 1287 if (U_FAILURE(status)) { 1288 errln("Could not get dynamic class ID of BoyerMooreSearch."); 1289 goto close_patterns; 1290 } 1291 1292 longPattern->getStaticClassID(); 1293 shortPattern->getStaticClassID(); 1294 if (U_FAILURE(status)) { 1295 errln("Could not get static class ID of BoyerMooreSearch."); 1296 goto close_patterns; 1297 } 1298 1299 longPattern->getData(); 1300 shortPattern->getData(); 1301 if (U_FAILURE(status)) { 1302 errln("Could not get collate data."); 1303 goto close_patterns; 1304 } 1305 1306 ce = longPattern->getPatternCEs(); 1307 ce1 = shortPattern->getPatternCEs(); 1308 if (U_FAILURE(status)) { 1309 errln("Could not get pattern CEs."); 1310 goto close_patterns; 1311 } 1312 1313 ce->getDynamicClassID(); 1314 ce1->getDynamicClassID(); 1315 if (U_FAILURE(status)) { 1316 errln("Could not get dynamic class ID of CEList."); 1317 goto close_patterns; 1318 } 1319 1320 ce->getStaticClassID(); 1321 ce1->getStaticClassID(); 1322 if (U_FAILURE(status)) { 1323 errln("Could not get static class ID of CEList."); 1324 goto close_patterns; 1325 } 1326 1327 if(data->minLengthInChars(ce,0) != 3){ 1328 errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3."); 1329 goto close_patterns; 1330 } 1331 1332 if(data->minLengthInChars(ce1,0) != 3){ 1333 errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3."); 1334 goto close_patterns; 1335 } 1336 1337 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { 1338 UnicodeString target = targets[t].unescape(); 1339 1340 longPattern->setTargetString(&target, status); 1341 if (longPattern->search(0, start, end)) { 1342 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 1343 } else { 1344 errln("Test %d: did not find long pattern.", t); 1345 } 1346 1347 shortPattern->setTargetString(&target, status); 1348 if (shortPattern->search(0, start, end)) { 1349 logln("Test %d: found short pattern at [%d, %d].", t, start, end); 1350 } else { 1351 errln("Test %d: did not find short pattern.", t); 1352 } 1353 1354 if(longPattern->empty()){ 1355 errln("Test %d: Long pattern should not have been empty."); 1356 } 1357 1358 if(shortPattern->empty()){ 1359 errln("Test %d: Short pattern should not have been empty."); 1360 } 1361 } 1362 1363 close_patterns: 1364 delete shortPattern; 1365 delete longPattern; 1366 1367 close_data: 1368 CollData::close(data); 1369 ucol_close(coll); 1370 } 1371 1372 void SSearchTest::bmsTest() 1373 { 1374 UErrorCode status = U_ZERO_ERROR; 1375 UCollator *coll = NULL; 1376 UCD *data = NULL; 1377 UnicodeString lp = "fuss"; 1378 UnicodeString lpu = lp.unescape(); 1379 UnicodeString sp = "fu\\u00DF"; 1380 UnicodeString spu = sp.unescape(); 1381 BMS *longPattern = NULL; 1382 BMS *shortPattern = NULL; 1383 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", 1384 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", 1385 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; 1386 int32_t start = -1, end = -1; 1387 1388 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); 1389 if (U_FAILURE(status)) { 1390 errcheckln(status, "Could not open collator. - %s", u_errorName(status)); 1391 return; 1392 } 1393 1394 data = ucd_open(coll, &status); 1395 if (U_FAILURE(status)) { 1396 errln("Could not open CollData object."); 1397 goto close_data; 1398 } 1399 1400 longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status); 1401 shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status); 1402 if (U_FAILURE(status)) { 1403 errln("Couldn't open pattern objects."); 1404 goto close_patterns; 1405 } 1406 1407 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { 1408 UnicodeString target = targets[t].unescape(); 1409 1410 bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status); 1411 if (bms_search(longPattern, 0, &start, &end)) { 1412 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 1413 } else { 1414 errln("Test %d: did not find long pattern.", t); 1415 } 1416 1417 bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status); 1418 if (bms_search(shortPattern, 0, &start, &end)) { 1419 logln("Test %d: found short pattern at [%d, %d].", t, start, end); 1420 } else { 1421 errln("Test %d: did not find short pattern.", t); 1422 } 1423 } 1424 1425 /* Add better coverage for bms code. */ 1426 if(bms_empty(longPattern)) { 1427 errln("FAIL: longgPattern is empty."); 1428 } 1429 1430 if (!bms_getData(longPattern)) { 1431 errln("FAIL: bms_getData returned NULL."); 1432 } 1433 1434 if (!ucd_getCollator(data)) { 1435 errln("FAIL: ucd_getCollator returned NULL."); 1436 } 1437 1438 close_patterns: 1439 bms_close(shortPattern); 1440 bms_close(longPattern); 1441 1442 close_data: 1443 ucd_close(data); 1444 ucd_freeCache(); 1445 ucol_close(coll); 1446 } 1447 1448 void SSearchTest::goodSuffixTest() 1449 { 1450 UErrorCode status = U_ZERO_ERROR; 1451 UCollator *coll = NULL; 1452 CollData *data = NULL; 1453 UnicodeString pat = /*"gcagagag"*/ "fxeld"; 1454 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; 1455 BoyerMooreSearch *pattern = NULL; 1456 int32_t start = -1, end = -1; 1457 1458 coll = ucol_open(NULL, &status); 1459 if (U_FAILURE(status)) { 1460 errcheckln(status, "Couldn't open collator. - %s", u_errorName(status)); 1461 return; 1462 } 1463 1464 data = CollData::open(coll, status); 1465 if (U_FAILURE(status)) { 1466 errln("Couldn't open CollData object."); 1467 goto close_data; 1468 } 1469 1470 pattern = new BoyerMooreSearch(data, pat, &target, status); 1471 if (U_FAILURE(status)) { 1472 errln("Couldn't open pattern object."); 1473 goto close_pattern; 1474 } 1475 1476 if (pattern->search(0, start, end)) { 1477 logln("Found pattern at [%d, %d].", start, end); 1478 } else { 1479 errln("Did not find pattern."); 1480 } 1481 1482 close_pattern: 1483 delete pattern; 1484 1485 close_data: 1486 CollData::close(data); 1487 ucol_close(coll); 1488 } 1489 1490 // 1491 // searchTime() A quick and dirty performance test for string search. 1492 // Probably doesn't really belong as part of intltest, but it 1493 // does check that the search succeeds, and gets the right result, 1494 // so it serves as a functionality test also. 1495 // 1496 // To run as a perf test, up the loop count, select by commenting 1497 // and uncommenting in the code the operation to be measured, 1498 // rebuild, and measure the running time of this test alone. 1499 // 1500 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime 1501 // 1502 void SSearchTest::searchTime() { 1503 static const char *longishText = 1504 "Whylom, as olde stories tellen us,\n" 1505 "Ther was a duk that highte Theseus:\n" 1506 "Of Athenes he was lord and governour,\n" 1507 "And in his tyme swich a conquerour,\n" 1508 "That gretter was ther noon under the sonne.\n" 1509 "Ful many a riche contree hadde he wonne;\n" 1510 "What with his wisdom and his chivalrye,\n" 1511 "He conquered al the regne of Femenye,\n" 1512 "That whylom was y-cleped Scithia;\n" 1513 "And weddede the quene Ipolita,\n" 1514 "And broghte hir hoom with him in his contree\n" 1515 "With muchel glorie and greet solempnitee,\n" 1516 "And eek hir yonge suster Emelye.\n" 1517 "And thus with victorie and with melodye\n" 1518 "Lete I this noble duk to Athenes ryde,\n" 1519 "And al his hoost, in armes, him bisyde.\n" 1520 "And certes, if it nere to long to here,\n" 1521 "I wolde han told yow fully the manere,\n" 1522 "How wonnen was the regne of Femenye\n" 1523 "By Theseus, and by his chivalrye;\n" 1524 "And of the grete bataille for the nones\n" 1525 "Bitwixen Athen's and Amazones;\n" 1526 "And how asseged was Ipolita,\n" 1527 "The faire hardy quene of Scithia;\n" 1528 "And of the feste that was at hir weddinge,\n" 1529 "And of the tempest at hir hoom-cominge;\n" 1530 "But al that thing I moot as now forbere.\n" 1531 "I have, God woot, a large feeld to ere,\n" 1532 "And wayke been the oxen in my plough.\n" 1533 "The remenant of the tale is long y-nough.\n" 1534 "I wol nat letten eek noon of this route;\n" 1535 "Lat every felawe telle his tale aboute,\n" 1536 "And lat see now who shal the soper winne;\n" 1537 "And ther I lefte, I wol ageyn biginne.\n" 1538 "This duk, of whom I make mencioun,\n" 1539 "When he was come almost unto the toun,\n" 1540 "In al his wele and in his moste pryde,\n" 1541 "He was war, as he caste his eye asyde,\n" 1542 "Wher that ther kneled in the hye weye\n" 1543 "A companye of ladies, tweye and tweye,\n" 1544 "Ech after other, clad in clothes blake; \n" 1545 "But swich a cry and swich a wo they make,\n" 1546 "That in this world nis creature livinge,\n" 1547 "That herde swich another weymentinge;\n" 1548 "And of this cry they nolde never stenten,\n" 1549 "Til they the reynes of his brydel henten.\n" 1550 "'What folk ben ye, that at myn hoomcominge\n" 1551 "Perturben so my feste with cryinge'?\n" 1552 "Quod Theseus, 'have ye so greet envye\n" 1553 "Of myn honour, that thus compleyne and crye? \n" 1554 "Or who hath yow misboden, or offended?\n" 1555 "And telleth me if it may been amended;\n" 1556 "And why that ye ben clothed thus in blak'?\n" 1557 "The eldest lady of hem alle spak,\n" 1558 "When she hadde swowned with a deedly chere,\n" 1559 "That it was routhe for to seen and here,\n" 1560 "And seyde: 'Lord, to whom Fortune hath yiven\n" 1561 "Victorie, and as a conquerour to liven,\n" 1562 "Noght greveth us your glorie and your honour;\n" 1563 "But we biseken mercy and socour.\n" 1564 "Have mercy on our wo and our distresse.\n" 1565 "Som drope of pitee, thurgh thy gentilesse,\n" 1566 "Up-on us wrecched wommen lat thou falle.\n" 1567 "For certes, lord, ther nis noon of us alle,\n" 1568 "That she nath been a duchesse or a quene;\n" 1569 "Now be we caitifs, as it is wel sene:\n" 1570 "Thanked be Fortune, and hir false wheel,\n" 1571 "That noon estat assureth to be weel.\n" 1572 "And certes, lord, t'abyden your presence,\n" 1573 "Here in the temple of the goddesse Clemence\n" 1574 "We han ben waytinge al this fourtenight;\n" 1575 "Now help us, lord, sith it is in thy might.\n" 1576 "I wrecche, which that wepe and waille thus,\n" 1577 "Was whylom wyf to king Capaneus,\n" 1578 "That starf at Thebes, cursed be that day!\n" 1579 "And alle we, that been in this array,\n" 1580 "And maken al this lamentacioun,\n" 1581 "We losten alle our housbondes at that toun,\n" 1582 "Whyl that the sege ther-aboute lay.\n" 1583 "And yet now th'olde Creon, weylaway!\n" 1584 "The lord is now of Thebes the citee, \n" 1585 "Fulfild of ire and of iniquitee,\n" 1586 "He, for despyt, and for his tirannye,\n" 1587 "To do the dede bodyes vileinye,\n" 1588 "Of alle our lordes, whiche that ben slawe,\n" 1589 "Hath alle the bodyes on an heep y-drawe,\n" 1590 "And wol nat suffren hem, by noon assent,\n" 1591 "Neither to been y-buried nor y-brent,\n" 1592 "But maketh houndes ete hem in despyt. zet'\n"; 1593 1594 #define TEST_BOYER_MOORE 1 1595 const char *cPattern = "maketh houndes ete hem"; 1596 //const char *cPattern = "Whylom"; 1597 //const char *cPattern = "zet"; 1598 const char *testId = "searchTime()"; // for error macros. 1599 UnicodeString target = longishText; 1600 UErrorCode status = U_ZERO_ERROR; 1601 1602 1603 LocalUCollatorPointer collator(ucol_open("en", &status)); 1604 CollData *data = CollData::open(collator.getAlias(), status); 1605 if (U_FAILURE(status) || collator.isNull() || data == NULL) { 1606 errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status)); 1607 return; 1608 } 1609 //ucol_setStrength(collator.getAlias(), collatorStrength); 1610 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); 1611 UnicodeString uPattern = cPattern; 1612 #ifndef TEST_BOYER_MOORE 1613 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), 1614 target.getBuffer(), target.length(), 1615 collator.getAlias(), 1616 NULL, // the break iterator 1617 &status)); 1618 TEST_ASSERT_SUCCESS(status); 1619 #else 1620 BoyerMooreSearch bms(data, uPattern, &target, status); 1621 TEST_ASSERT_SUCCESS(status); 1622 #endif 1623 1624 // int32_t foundStart; 1625 // int32_t foundEnd; 1626 UBool found; 1627 1628 // Find the match position usgin strstr 1629 const char *pm = strstr(longishText, cPattern); 1630 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); 1631 int32_t refMatchPos = (int32_t)(pm - longishText); 1632 int32_t icuMatchPos; 1633 int32_t icuMatchEnd; 1634 #ifndef TEST_BOYER_MOORE 1635 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); 1636 TEST_ASSERT_SUCCESS(status); 1637 #else 1638 found = bms.search(0, icuMatchPos, icuMatchEnd); 1639 #endif 1640 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); 1641 1642 int32_t i; 1643 // int32_t j=0; 1644 1645 // Try loopcounts around 100000 to some millions, depending on the operation, 1646 // to get runtimes of at least several seconds. 1647 for (i=0; i<10000; i++) { 1648 #ifndef TEST_BOYER_MOORE 1649 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); 1650 #else 1651 found = bms.search(0, icuMatchPos, icuMatchEnd); 1652 #endif 1653 //TEST_ASSERT_SUCCESS(status); 1654 //TEST_ASSERT(found); 1655 1656 // usearch_setOffset(uss.getAlias(), 0, &status); 1657 // icuMatchPos = usearch_next(uss.getAlias(), &status); 1658 1659 // The i+j stuff is to confuse the optimizer and get it to actually leave the 1660 // call to strstr in place. 1661 //pm = strstr(longishText+j, cPattern); 1662 //j = (j + i)%5; 1663 } 1664 1665 //printf("%ld, %d\n", pm-longishText, j); 1666 #ifdef TEST_BOYER_MOORE 1667 CollData::close(data); 1668 #endif 1669 } 1670 #endif 1671 1672 //---------------------------------------------------------------------------------------- 1673 // 1674 // Random Numbers. Similar to standard lib rand() and srand() 1675 // Not using library to 1676 // 1. Get same results on all platforms. 1677 // 2. Get access to current seed, to more easily reproduce failures. 1678 // 1679 //--------------------------------------------------------------------------------------- 1680 static uint32_t m_seed = 1; 1681 1682 static uint32_t m_rand() 1683 { 1684 m_seed = m_seed * 1103515245 + 12345; 1685 return (uint32_t)(m_seed/65536) % 32768; 1686 } 1687 1688 class Monkey 1689 { 1690 public: 1691 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; 1692 1693 protected: 1694 Monkey(); 1695 virtual ~Monkey(); 1696 }; 1697 1698 Monkey::Monkey() 1699 { 1700 // ook? 1701 } 1702 1703 Monkey::~Monkey() 1704 { 1705 // ook? 1706 } 1707 1708 class SetMonkey : public Monkey 1709 { 1710 public: 1711 SetMonkey(const USet *theSet); 1712 ~SetMonkey(); 1713 1714 virtual void append(UnicodeString &test, UnicodeString &alternate); 1715 1716 private: 1717 const USet *set; 1718 }; 1719 1720 SetMonkey::SetMonkey(const USet *theSet) 1721 : Monkey(), set(theSet) 1722 { 1723 // ook? 1724 } 1725 1726 SetMonkey::~SetMonkey() 1727 { 1728 //ook... 1729 } 1730 1731 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) 1732 { 1733 int32_t size = uset_size(set); 1734 int32_t index = m_rand() % size; 1735 UChar32 ch = uset_charAt(set, index); 1736 UnicodeString str(ch); 1737 1738 test.append(str); 1739 alternate.append(str); // flip case, or some junk? 1740 } 1741 1742 class StringSetMonkey : public Monkey 1743 { 1744 public: 1745 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); 1746 ~StringSetMonkey(); 1747 1748 void append(UnicodeString &testCase, UnicodeString &alternate); 1749 1750 private: 1751 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); 1752 1753 const USet *set; 1754 UCollator *coll; 1755 CollData *collData; 1756 }; 1757 1758 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) 1759 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) 1760 { 1761 // ook. 1762 } 1763 1764 StringSetMonkey::~StringSetMonkey() 1765 { 1766 // ook? 1767 } 1768 1769 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) 1770 { 1771 int32_t itemCount = uset_getItemCount(set), len = 0; 1772 int32_t index = m_rand() % itemCount; 1773 UChar32 rangeStart = 0, rangeEnd = 0; 1774 UChar buffer[16]; 1775 UErrorCode err = U_ZERO_ERROR; 1776 1777 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); 1778 1779 if (len == 0) { 1780 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); 1781 UChar32 ch = rangeStart + offset; 1782 UnicodeString str(ch); 1783 1784 testCase.append(str); 1785 generateAlternative(str, alternate); 1786 } else if (len > 0) { 1787 // should check that len < 16... 1788 UnicodeString str(buffer, len); 1789 1790 testCase.append(str); 1791 generateAlternative(str, alternate); 1792 } else { 1793 // shouldn't happen... 1794 } 1795 } 1796 1797 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) 1798 { 1799 // find out shortest string for the longest sequence of ces. 1800 // needs to be refined to use dynamic programming, but will be roughly right 1801 UErrorCode status = U_ZERO_ERROR; 1802 CEList ceList(coll, testCase, status); 1803 UnicodeString alt; 1804 int32_t offset = 0; 1805 1806 if (ceList.size() == 0) { 1807 return alternate.append(testCase); 1808 } 1809 1810 while (offset < ceList.size()) { 1811 int32_t ce = ceList.get(offset); 1812 const StringList *strings = collData->getStringList(ce); 1813 1814 if (strings == NULL) { 1815 return alternate.append(testCase); 1816 } 1817 1818 int32_t stringCount = strings->size(); 1819 int32_t tries = 0; 1820 1821 // find random string that generates the same CEList 1822 const CEList *ceList2 = NULL; 1823 const UnicodeString *string = NULL; 1824 UBool matches = FALSE; 1825 1826 do { 1827 int32_t s = m_rand() % stringCount; 1828 1829 if (tries++ > stringCount) { 1830 alternate.append(testCase); 1831 return alternate; 1832 } 1833 1834 string = strings->get(s); 1835 ceList2 = collData->getCEList(string); 1836 matches = ceList.matchesAt(offset, ceList2); 1837 1838 if (! matches) { 1839 collData->freeCEList((CEList *) ceList2); 1840 } 1841 } while (! matches); 1842 1843 alt.append(*string); 1844 offset += ceList2->size(); 1845 collData->freeCEList(ceList2); 1846 } 1847 1848 const CEList altCEs(coll, alt, status); 1849 1850 if (ceList.matchesAt(0, &altCEs)) { 1851 return alternate.append(alt); 1852 } 1853 1854 return alternate.append(testCase); 1855 } 1856 1857 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) 1858 { 1859 int32_t pieces = (m_rand() % 4) + 1; 1860 UErrorCode status = U_ZERO_ERROR; 1861 UBool matches; 1862 1863 do { 1864 testCase.remove(); 1865 alternate.remove(); 1866 monkeys[0]->append(testCase, alternate); 1867 1868 for(int32_t piece = 0; piece < pieces; piece += 1) { 1869 int32_t monkey = m_rand() % monkeyCount; 1870 1871 monkeys[monkey]->append(testCase, alternate); 1872 } 1873 1874 const CEList ceTest(coll, testCase, status); 1875 const CEList ceAlt(coll, alternate, status); 1876 1877 matches = ceTest.matchesAt(0, &ceAlt); 1878 } while (! matches); 1879 } 1880 1881 // 1882 // Find the next acceptable boundary following the specified starting index 1883 // in the target text being searched. 1884 // TODO: refine what is an acceptable boundary. For the moment, 1885 // choose the next position not within a combining sequence. 1886 // 1887 #if 0 1888 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) { 1889 const UChar *text = string.getBuffer(); 1890 int32_t textLen = string.length(); 1891 1892 if (startIndex >= textLen) { 1893 return startIndex; 1894 } 1895 1896 UChar32 c; 1897 int32_t i = startIndex; 1898 1899 U16_NEXT(text, i, textLen, c); 1900 1901 // If we are on a control character, stop without looking for combining marks. 1902 // Control characters do not combine. 1903 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1904 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 1905 return i; 1906 } 1907 1908 // The initial character was not a control, and can thus accept trailing 1909 // combining characters. Advance over however many of them there are. 1910 int32_t indexOfLastCharChecked; 1911 1912 for (;;) { 1913 indexOfLastCharChecked = i; 1914 1915 if (i>=textLen) { 1916 break; 1917 } 1918 1919 U16_NEXT(text, i, textLen, c); 1920 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1921 1922 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1923 break; 1924 } 1925 } 1926 1927 return indexOfLastCharChecked; 1928 } 1929 #endif 1930 1931 #if 0 1932 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) { 1933 const UChar *text = string.getBuffer(); 1934 int32_t textLen = string.length(); 1935 1936 if (index>=textLen || index<=0) { 1937 return FALSE; 1938 } 1939 1940 // If the character at the current index is not a GRAPHEME_EXTEND 1941 // then we can not be within a combining sequence. 1942 UChar32 c; 1943 U16_GET(text, 0, index, textLen, c); 1944 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1945 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1946 return FALSE; 1947 } 1948 1949 // We are at a combining mark. If the preceding character is anything 1950 // except a CONTROL, CR or LF, we are in a combining sequence. 1951 U16_PREV(text, 0, index, c); 1952 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1953 1954 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 1955 } 1956 #endif 1957 1958 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) 1959 { 1960 UErrorCode status = U_ZERO_ERROR; 1961 OrderList targetOrders(coll, target, offset); 1962 OrderList patternOrders(coll, pattern); 1963 int32_t targetSize = targetOrders.size() - 1; 1964 int32_t patternSize = patternOrders.size() - 1; 1965 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), 1966 target.getBuffer(), target.length(), &status); 1967 1968 if (patternSize == 0) { 1969 // Searching for an empty pattern always fails 1970 matchStart = matchEnd = -1; 1971 ubrk_close(charBreakIterator); 1972 return FALSE; 1973 } 1974 1975 matchStart = matchEnd = -1; 1976 1977 for(int32_t i = 0; i < targetSize; i += 1) { 1978 if (targetOrders.matchesAt(i, patternOrders)) { 1979 int32_t start = targetOrders.getLowOffset(i); 1980 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); 1981 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); 1982 1983 // if the low and high offsets of the first CE in 1984 // the match are the same, it means that the match 1985 // starts in the middle of an expansion - all but 1986 // the first CE of the expansion will have the offset 1987 // of the following character. 1988 if (start == targetOrders.getHighOffset(i)) { 1989 continue; 1990 } 1991 1992 // Make sure match starts on a grapheme boundary 1993 if (! ubrk_isBoundary(charBreakIterator, start)) { 1994 continue; 1995 } 1996 1997 // If the low and high offsets of the CE after the match 1998 // are the same, it means that the match ends in the middle 1999 // of an expansion sequence. 2000 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && 2001 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { 2002 continue; 2003 } 2004 2005 int32_t mend = maxLimit; 2006 2007 // Find the first grapheme break after the character index 2008 // of the last CE in the match. If it's after character index 2009 // that's after the last CE in the match, use that index 2010 // as the end of the match. 2011 if (minLimit < maxLimit) { 2012 // When the last CE's low index is same with its high index, the CE is likely 2013 // a part of expansion. In this case, the index is located just after the 2014 // character corresponding to the CEs compared above. If the index is right 2015 // at the break boundary, move the position to the next boundary will result 2016 // incorrect match length when there are ignorable characters exist between 2017 // the position and the next character produces CE(s). See ticket#8482. 2018 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) { 2019 mend = minLimit; 2020 } else { 2021 int32_t nba = ubrk_following(charBreakIterator, minLimit); 2022 2023 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { 2024 mend = nba; 2025 } 2026 } 2027 } 2028 2029 if (mend > maxLimit) { 2030 continue; 2031 } 2032 2033 if (! ubrk_isBoundary(charBreakIterator, mend)) { 2034 continue; 2035 } 2036 2037 matchStart = start; 2038 matchEnd = mend; 2039 2040 ubrk_close(charBreakIterator); 2041 return TRUE; 2042 } 2043 } 2044 2045 ubrk_close(charBreakIterator); 2046 return FALSE; 2047 } 2048 2049 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2050 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 2051 int32_t val = defaultVal; 2052 2053 name.append(" *= *(-?\\d+)"); 2054 2055 UErrorCode status = U_ZERO_ERROR; 2056 RegexMatcher m(name, params, 0, status); 2057 2058 if (m.find()) { 2059 // The param exists. Convert the string to an int. 2060 char valString[100]; 2061 int32_t paramLength = m.end(1, status) - m.start(1, status); 2062 2063 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 2064 paramLength = (int32_t)(sizeof(valString)-2); 2065 } 2066 2067 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 2068 val = strtol(valString, NULL, 10); 2069 2070 // Delete this parameter from the params string. 2071 m.reset(); 2072 params = m.replaceFirst("", status); 2073 } 2074 2075 //U_ASSERT(U_SUCCESS(status)); 2076 if (! U_SUCCESS(status)) { 2077 val = defaultVal; 2078 } 2079 2080 return val; 2081 } 2082 #endif 2083 2084 #if !UCONFIG_NO_COLLATION 2085 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, 2086 const char *name, const char *strength, uint32_t seed) 2087 { 2088 UErrorCode status = U_ZERO_ERROR; 2089 int32_t actualStart = -1, actualEnd = -1; 2090 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); 2091 int32_t expectedStart = -1, expectedEnd = -1; 2092 int32_t notFoundCount = 0; 2093 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 2094 testCase.getBuffer(), testCase.length(), 2095 coll, 2096 NULL, // the break iterator 2097 &status)); 2098 2099 // **** TODO: find *all* matches, not just first one **** 2100 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); 2101 2102 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); 2103 2104 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2105 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2106 " strength=%s seed=%d", 2107 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2108 } 2109 2110 if (expectedStart == -1 && actualStart == -1) { 2111 notFoundCount += 1; 2112 } 2113 2114 // **** TODO: find *all* matches, not just first one **** 2115 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); 2116 2117 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status); 2118 2119 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); 2120 2121 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2122 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2123 " strength=%s seed=%d", 2124 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2125 } 2126 2127 if (expectedStart == -1 && actualStart == -1) { 2128 notFoundCount += 1; 2129 } 2130 2131 return notFoundCount; 2132 } 2133 2134 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, 2135 BoyerMooreSearch *bms, BoyerMooreSearch *abms, 2136 const char *name, const char *strength, uint32_t seed) 2137 { 2138 UErrorCode status = U_ZERO_ERROR; 2139 int32_t actualStart = -1, actualEnd = -1; 2140 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); 2141 int32_t expectedStart = -1, expectedEnd = -1; 2142 int32_t notFoundCount = 0; 2143 2144 // **** TODO: find *all* matches, not just first one **** 2145 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); 2146 2147 bms->setTargetString(&testCase, status); 2148 bms->search(0, actualStart, actualEnd); 2149 2150 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2151 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2152 " strength=%s seed=%d", 2153 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2154 errln(UNICODE_STRING_SIMPLE(" <pattern>: ") + prettify(pattern)); 2155 } 2156 2157 if (expectedStart == -1 && actualStart == -1) { 2158 notFoundCount += 1; 2159 } 2160 2161 // **** TODO: find *all* matches, not just first one **** 2162 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); 2163 2164 abms->setTargetString(&testCase, status); 2165 abms->search(0, actualStart, actualEnd); 2166 2167 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2168 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2169 " strength=%s seed=%d", 2170 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2171 errln(UNICODE_STRING_SIMPLE(" <alt_pattern>: ") + prettify(altPattern)); 2172 } 2173 2174 if (expectedStart == -1 && actualStart == -1) { 2175 notFoundCount += 1; 2176 } 2177 2178 2179 return notFoundCount; 2180 } 2181 #endif 2182 2183 void SSearchTest::monkeyTest(char *params) 2184 { 2185 // ook! 2186 UErrorCode status = U_ZERO_ERROR; 2187 //UCollator *coll = ucol_open(NULL, &status); 2188 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 2189 2190 if (U_FAILURE(status)) { 2191 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); 2192 return; 2193 } 2194 2195 CollData *monkeyData = CollData::open(coll, status); 2196 2197 USet *expansions = uset_openEmpty(); 2198 USet *contractions = uset_openEmpty(); 2199 2200 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 2201 2202 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2203 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2204 USet *letters = uset_openPattern(letter_pattern, 39, &status); 2205 SetMonkey letterMonkey(letters); 2206 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); 2207 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); 2208 UnicodeString testCase; 2209 UnicodeString alternate; 2210 UnicodeString pattern, altPattern; 2211 UnicodeString prefix, altPrefix; 2212 UnicodeString suffix, altSuffix; 2213 2214 Monkey *monkeys[] = { 2215 &letterMonkey, 2216 &contractionMonkey, 2217 &expansionMonkey, 2218 &contractionMonkey, 2219 &expansionMonkey, 2220 &contractionMonkey, 2221 &expansionMonkey, 2222 &contractionMonkey, 2223 &expansionMonkey}; 2224 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); 2225 // int32_t nonMatchCount = 0; 2226 2227 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; 2228 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; 2229 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); 2230 int32_t loopCount = quick? 1000 : 10000; 2231 int32_t firstStrength = 0; 2232 int32_t lastStrength = strengthCount - 1; //*/ 0; 2233 2234 if (params != NULL) { 2235 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2236 UnicodeString p(params); 2237 2238 loopCount = getIntParam("loop", p, loopCount); 2239 m_seed = getIntParam("seed", p, m_seed); 2240 2241 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); 2242 if (m.find()) { 2243 UnicodeString breakType = m.group(1, status); 2244 2245 for (int32_t s = 0; s < strengthCount; s += 1) { 2246 if (breakType == strengthNames[s]) { 2247 firstStrength = lastStrength = s; 2248 break; 2249 } 2250 } 2251 2252 m.reset(); 2253 p = m.replaceFirst("", status); 2254 } 2255 2256 if (RegexMatcher("\\S", p, 0, status).find()) { 2257 // Each option is stripped out of the option string as it is processed. 2258 // All options have been checked. The option string should have been completely emptied.. 2259 char buf[100]; 2260 p.extract(buf, sizeof(buf), NULL, status); 2261 buf[sizeof(buf)-1] = 0; 2262 errln("Unrecognized or extra parameter: %s\n", buf); 2263 return; 2264 } 2265 #else 2266 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); 2267 #endif 2268 } 2269 2270 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { 2271 int32_t notFoundCount = 0; 2272 2273 logln("Setting strength to %s.", strengthNames[s]); 2274 ucol_setStrength(coll, strengths[s]); 2275 2276 // TODO: try alternate prefix and suffix too? 2277 // TODO: alterntaes are only equal at primary strength. Is this OK? 2278 for(int32_t t = 0; t < loopCount; t += 1) { 2279 uint32_t seed = m_seed; 2280 // int32_t nmc = 0; 2281 2282 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); 2283 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); 2284 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); 2285 2286 // pattern 2287 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); 2288 2289 testCase.remove(); 2290 testCase.append(prefix); 2291 testCase.append(/*alt*/pattern); 2292 2293 // prefix + pattern 2294 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); 2295 2296 testCase.append(suffix); 2297 2298 // prefix + pattern + suffix 2299 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); 2300 2301 testCase.remove(); 2302 testCase.append(pattern); 2303 testCase.append(suffix); 2304 2305 // pattern + suffix 2306 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); 2307 } 2308 2309 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); 2310 } 2311 2312 uset_close(contractions); 2313 uset_close(expansions); 2314 uset_close(letters); 2315 2316 CollData::close(monkeyData); 2317 2318 ucol_close(coll); 2319 } 2320 2321 void SSearchTest::bmMonkeyTest(char *params) 2322 { 2323 static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb 2324 // ook! 2325 UErrorCode status = U_ZERO_ERROR; 2326 UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); 2327 2328 if (U_FAILURE(status)) { 2329 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); 2330 return; 2331 } 2332 2333 CollData *monkeyData = CollData::open(coll, status); 2334 2335 USet *expansions = uset_openEmpty(); 2336 USet *contractions = uset_openEmpty(); 2337 2338 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 2339 2340 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2341 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2342 USet *letters = uset_openPattern(letter_pattern, 39, &status); 2343 SetMonkey letterMonkey(letters); 2344 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); 2345 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); 2346 UnicodeString testCase; 2347 UnicodeString alternate; 2348 UnicodeString pattern, altPattern; 2349 UnicodeString prefix, altPrefix; 2350 UnicodeString suffix, altSuffix; 2351 2352 Monkey *monkeys[] = { 2353 &letterMonkey, 2354 &contractionMonkey, 2355 &expansionMonkey, 2356 &contractionMonkey, 2357 &expansionMonkey, 2358 &contractionMonkey, 2359 &expansionMonkey, 2360 &contractionMonkey, 2361 &expansionMonkey}; 2362 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); 2363 // int32_t nonMatchCount = 0; 2364 2365 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; 2366 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; 2367 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); 2368 int32_t loopCount = quick? 1000 : 10000; 2369 int32_t firstStrength = 0; 2370 int32_t lastStrength = strengthCount - 1; //*/ 0; 2371 2372 if (params != NULL) { 2373 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2374 UnicodeString p(params); 2375 2376 loopCount = getIntParam("loop", p, loopCount); 2377 m_seed = getIntParam("seed", p, m_seed); 2378 2379 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); 2380 if (m.find()) { 2381 UnicodeString breakType = m.group(1, status); 2382 2383 for (int32_t s = 0; s < strengthCount; s += 1) { 2384 if (breakType == strengthNames[s]) { 2385 firstStrength = lastStrength = s; 2386 break; 2387 } 2388 } 2389 2390 m.reset(); 2391 p = m.replaceFirst("", status); 2392 } 2393 2394 if (RegexMatcher("\\S", p, 0, status).find()) { 2395 // Each option is stripped out of the option string as it is processed. 2396 // All options have been checked. The option string should have been completely emptied.. 2397 char buf[100]; 2398 p.extract(buf, sizeof(buf), NULL, status); 2399 buf[sizeof(buf)-1] = 0; 2400 errln("Unrecognized or extra parameter: %s\n", buf); 2401 return; 2402 } 2403 #else 2404 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); 2405 #endif 2406 } 2407 2408 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { 2409 int32_t notFoundCount = 0; 2410 2411 logln("Setting strength to %s.", strengthNames[s]); 2412 ucol_setStrength(coll, strengths[s]); 2413 2414 CollData *data = CollData::open(coll, status); 2415 2416 UnicodeSet skipSet; 2417 if(isICUVersionBefore(51, 1)) { 2418 // timebomb until ticket #9156 (was #8081) is resolved 2419 UnicodeString skipString(skipChars); 2420 skipSet.addAll(skipString); 2421 } 2422 if(isICUVersionBefore(51, 1)) { 2423 // Time bomb until ticket #9490 is fixed. 2424 skipSet.add(0x12327); 2425 skipSet.add(0x1311b); 2426 skipSet.add(0x1200d); 2427 } 2428 skipSet.freeze(); 2429 // TODO: try alternate prefix and suffix too? 2430 // TODO: alternates are only equal at primary strength. Is this OK? 2431 for(int32_t t = 0; t < loopCount; t += 1) { 2432 uint32_t seed = m_seed; 2433 // int32_t nmc = 0; 2434 2435 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); 2436 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); 2437 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); 2438 2439 if (skipSet.containsSome(pattern)) { 2440 continue; // time bomb 2441 } 2442 2443 BoyerMooreSearch pat(data, pattern, NULL, status); 2444 BoyerMooreSearch alt(data, altPattern, NULL, status); 2445 2446 // **** need a better way to deal with this **** 2447 #if 0 2448 if (pat.empty() || 2449 alt.empty()) { 2450 continue; 2451 } 2452 #endif 2453 2454 // pattern 2455 notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed); 2456 2457 testCase.remove(); 2458 testCase.append(prefix); 2459 testCase.append(/*alt*/pattern); 2460 2461 // prefix + pattern 2462 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed); 2463 2464 testCase.append(suffix); 2465 2466 // prefix + pattern + suffix 2467 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed); 2468 2469 testCase.remove(); 2470 testCase.append(pattern); 2471 testCase.append(suffix); 2472 2473 // pattern + suffix 2474 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed); 2475 } 2476 2477 CollData::close(data); 2478 2479 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); 2480 } 2481 2482 uset_close(contractions); 2483 uset_close(expansions); 2484 uset_close(letters); 2485 2486 CollData::close(monkeyData); 2487 2488 ucol_close(coll); 2489 } 2490 2491 void SSearchTest::stringListTest(){ 2492 UErrorCode status = U_ZERO_ERROR; 2493 StringList *sl = new StringList(status); 2494 if(U_FAILURE(status)){ 2495 errln("ERROR: stringListTest: Could not start StringList"); 2496 } 2497 2498 const UChar chars[] = { 2499 0x0000 2500 }; 2501 sl->add(chars, (int32_t) 0, status); 2502 if(U_FAILURE(status)){ 2503 errln("ERROR: stringListTest: StringList::add"); 2504 } 2505 2506 if(sl->getDynamicClassID() != StringList::getStaticClassID()){ 2507 errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match"); 2508 } 2509 delete sl; 2510 } 2511 2512 #endif 2513 2514 #endif 2515