1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_COLLATION 12 13 #include "unicode/unistr.h" 14 #include "unicode/putil.h" 15 #include "unicode/usearch.h" 16 17 #include "cmemory.h" 18 #include "unicode/coll.h" 19 #include "unicode/tblcoll.h" 20 #include "unicode/coleitr.h" 21 #include "unicode/ucoleitr.h" 22 23 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 24 25 #include "unicode/uniset.h" 26 #include "unicode/uset.h" 27 #include "unicode/ustring.h" 28 #include "hash.h" 29 #include "uhash.h" 30 #include "ucol_imp.h" 31 32 #include "intltest.h" 33 #include "ssearch.h" 34 35 #include "unicode/colldata.h" 36 #include "unicode/bmsearch.h" 37 #include "unicode/bms.h" 38 39 #include "xmlparser.h" 40 #include "ucbuf.h" 41 42 #include <stdlib.h> 43 #include <string.h> 44 #include <stdio.h> 45 46 char testId[100]; 47 48 #define TEST_ASSERT(x) {if (!(x)) { \ 49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}} 50 51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \ 52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}} 53 54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \ 55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \ 56 __FILE__, __LINE__, testId, u_errorName(errcode));}} 57 58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 60 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 61 62 //--------------------------------------------------------------------------- 63 // 64 // Test class boilerplate 65 // 66 //--------------------------------------------------------------------------- 67 SSearchTest::SSearchTest() 68 { 69 } 70 71 SSearchTest::~SSearchTest() 72 { 73 } 74 75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) 76 { 77 if (exec) logln("TestSuite SSearchTest: "); 78 switch (index) { 79 #if !UCONFIG_NO_BREAK_ITERATION 80 case 0: name = "searchTest"; 81 if (exec) searchTest(); 82 break; 83 84 case 1: name = "offsetTest"; 85 if (exec) offsetTest(); 86 break; 87 88 case 2: name = "monkeyTest"; 89 if (exec) monkeyTest(params); 90 break; 91 92 case 3: name = "bmMonkeyTest"; 93 if (exec) bmMonkeyTest(params); 94 break; 95 96 case 4: name = "boyerMooreTest"; 97 if (exec) boyerMooreTest(); 98 break; 99 100 case 5: name = "goodSuffixTest"; 101 if (exec) goodSuffixTest(); 102 break; 103 104 case 6: name = "searchTime"; 105 if (exec) searchTime(); 106 break; 107 108 case 7: name = "bmsTest"; 109 if (exec) bmsTest(); 110 break; 111 112 case 8: name = "bmSearchTest"; 113 if (exec) bmSearchTest(); 114 break; 115 116 case 9: name = "udhrTest"; 117 if (exec) udhrTest(); 118 break; 119 #endif 120 default: name = ""; 121 break; //needed to end loop 122 } 123 } 124 125 126 #if !UCONFIG_NO_BREAK_ITERATION 127 128 #define PATH_BUFFER_SIZE 2048 129 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { 130 UErrorCode status = U_ZERO_ERROR; 131 const char *testDataDirectory = IntlTest::getSourceTestData(status); 132 133 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { 134 errln("ERROR: getPath() failed - %s", u_errorName(status)); 135 return NULL; 136 } 137 138 strcpy(buffer, testDataDirectory); 139 strcat(buffer, filename); 140 return buffer; 141 } 142 143 144 void SSearchTest::searchTest() 145 { 146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 147 UErrorCode status = U_ZERO_ERROR; 148 char path[PATH_BUFFER_SIZE]; 149 const char *testFilePath = getPath(path, "ssearch.xml"); 150 151 if (testFilePath == NULL) { 152 return; /* Couldn't get path: error message already output. */ 153 } 154 155 UXMLParser *parser = UXMLParser::createParser(status); 156 TEST_ASSERT_SUCCESS(status); 157 UXMLElement *root = parser->parseFile(testFilePath, status); 158 TEST_ASSERT_SUCCESS(status); 159 if (U_FAILURE(status)) { 160 return; 161 } 162 163 const UnicodeString *debugTestCase = root->getAttribute("debug"); 164 if (debugTestCase != NULL) { 165 // setenv("USEARCH_DEBUG", "1", 1); 166 } 167 168 169 const UXMLElement *testCase; 170 int32_t tc = 0; 171 172 while((testCase = root->nextChildElement(tc)) != NULL) { 173 174 if (testCase->getTagName().compare("test-case") != 0) { 175 errln("ssearch, unrecognized XML Element in test file"); 176 continue; 177 } 178 const UnicodeString *id = testCase->getAttribute("id"); 179 *testId = 0; 180 if (id != NULL) { 181 id->extract(0, id->length(), testId, sizeof(testId), US_INV); 182 } 183 184 // If debugging test case has been specified and this is not it, skip to next. 185 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { 186 continue; 187 } 188 // 189 // Get the requested collation strength. 190 // Default is tertiary if the XML attribute is missing from the test case. 191 // 192 const UnicodeString *strength = testCase->getAttribute("strength"); 193 UColAttributeValue collatorStrength; 194 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} 195 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} 196 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} 197 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} 198 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} 199 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} 200 else { 201 // Bogus value supplied for strength. Shouldn't happen, even from 202 // typos, if the XML source has been validated. 203 // This assert is a little deceiving in that strength can be 204 // any of the allowed values, not just TERTIARY, but it will 205 // do the job of getting the error output. 206 TEST_ASSERT(*strength=="TERTIARY") 207 } 208 209 // 210 // Get the collator normalization flag. Default is UCOL_OFF. 211 // 212 UColAttributeValue normalize = UCOL_OFF; 213 const UnicodeString *norm = testCase->getAttribute("norm"); 214 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); 215 if (norm!=NULL && *norm=="ON") { 216 normalize = UCOL_ON; 217 } 218 219 // 220 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 221 // 222 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; 223 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); 224 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); 225 if (alt != NULL && *alt == "SHIFTED") { 226 alternateHandling = UCOL_SHIFTED; 227 } 228 229 const UnicodeString defLocale("en"); 230 char clocale[100]; 231 const UnicodeString *locale = testCase->getAttribute("locale"); 232 if (locale == NULL || locale->length()==0) { 233 locale = &defLocale; 234 }; 235 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); 236 237 238 UnicodeString text; 239 UnicodeString target; 240 UnicodeString pattern; 241 int32_t expectedMatchStart = -1; 242 int32_t expectedMatchLimit = -1; 243 const UXMLElement *n; 244 int32_t nodeCount = 0; 245 246 n = testCase->getChildElement("pattern"); 247 TEST_ASSERT(n != NULL); 248 if (n==NULL) { 249 continue; 250 } 251 text = n->getText(FALSE); 252 text = text.unescape(); 253 pattern.append(text); 254 nodeCount++; 255 256 n = testCase->getChildElement("pre"); 257 if (n!=NULL) { 258 text = n->getText(FALSE); 259 text = text.unescape(); 260 target.append(text); 261 nodeCount++; 262 } 263 264 n = testCase->getChildElement("m"); 265 if (n!=NULL) { 266 expectedMatchStart = target.length(); 267 text = n->getText(FALSE); 268 text = text.unescape(); 269 target.append(text); 270 expectedMatchLimit = target.length(); 271 nodeCount++; 272 } 273 274 n = testCase->getChildElement("post"); 275 if (n!=NULL) { 276 text = n->getText(FALSE); 277 text = text.unescape(); 278 target.append(text); 279 nodeCount++; 280 } 281 282 // Check that there weren't extra things in the XML 283 TEST_ASSERT(nodeCount == testCase->countChildren()); 284 285 // Open a collator and StringSearch based on the parameters 286 // obtained from the XML. 287 // 288 status = U_ZERO_ERROR; 289 UCollator *collator = ucol_open(clocale, &status); 290 ucol_setStrength(collator, collatorStrength); 291 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); 292 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); 293 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 294 target.getBuffer(), target.length(), 295 collator, 296 NULL, // the break iterator 297 &status); 298 299 TEST_ASSERT_SUCCESS(status); 300 if (U_FAILURE(status)) { 301 usearch_close(uss); 302 ucol_close(collator); 303 continue; 304 } 305 306 int32_t foundStart = 0; 307 int32_t foundLimit = 0; 308 UBool foundMatch; 309 310 // 311 // Do the search, check the match result against the expected results. 312 // 313 foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status); 314 TEST_ASSERT_SUCCESS(status); 315 if (foundMatch && expectedMatchStart<0 || 316 foundStart != expectedMatchStart || 317 foundLimit != expectedMatchLimit) { 318 TEST_ASSERT(FALSE); // ouput generic error position 319 infoln("Found, expected match start = %d, %d \n" 320 "Found, expected match limit = %d, %d", 321 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 322 } 323 324 // In case there are other matches... 325 // (should we only do this if the test case passed?) 326 while (foundMatch) { 327 expectedMatchStart = foundStart; 328 expectedMatchLimit = foundLimit; 329 330 foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status); 331 } 332 333 usearch_close(uss); 334 335 uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 336 target.getBuffer(), target.length(), 337 collator, 338 NULL, 339 &status); 340 341 // 342 // Do the backwards search, check the match result against the expected results. 343 // 344 foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status); 345 TEST_ASSERT_SUCCESS(status); 346 if (foundMatch && expectedMatchStart<0 || 347 foundStart != expectedMatchStart || 348 foundLimit != expectedMatchLimit) { 349 TEST_ASSERT(FALSE); // ouput generic error position 350 infoln("Found, expected backwards match start = %d, %d \n" 351 "Found, expected backwards match limit = %d, %d", 352 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 353 } 354 355 usearch_close(uss); 356 ucol_close(collator); 357 } 358 359 delete root; 360 delete parser; 361 #endif 362 } 363 364 struct UdhrTestCase 365 { 366 char *locale; 367 char *file; 368 }; 369 370 void SSearchTest::udhrTest() 371 { 372 UErrorCode status = U_ZERO_ERROR; 373 char path[PATH_BUFFER_SIZE]; 374 const char *udhrPath = getPath(path, "udhr"); 375 376 if (udhrPath == NULL) { 377 // couldn't get path: error message already output... 378 return; 379 } 380 381 UdhrTestCase testCases[] = { 382 {"en", "udhr_eng.txt"}, 383 {"de", "udhr_deu_1996.txt"}, 384 {"fr", "udhr_fra.txt"}, 385 {"ru", "udhr_rus.txt"}, 386 {"th", "udhr_tha.txt"}, 387 {"ja", "udhr_jpn.txt"}, 388 {"ko", "udhr_kor.txt"}, 389 {"zh", "udhr_cmn_hans.txt"}, 390 {"zh_Hant", "udhr_cmn_hant.txt"} 391 }; 392 393 int32_t testCount = ARRAY_SIZE(testCases); 394 395 for (int32_t t = 0; t < testCount; t += 1) { 396 int32_t len = 0; 397 char *resolvedFileName = NULL; 398 const char *encoding = NULL; 399 UCHARBUF *ucharBuf = NULL; 400 401 ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status); 402 resolvedFileName = NEW_ARRAY(char, len); 403 404 if(resolvedFileName == NULL){ 405 continue; 406 } 407 408 if(status == U_BUFFER_OVERFLOW_ERROR){ 409 status = U_ZERO_ERROR; 410 } 411 412 ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status); 413 ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status); 414 415 DELETE_ARRAY(resolvedFileName); 416 417 if(U_FAILURE(status)){ 418 infoln("Could not open the input file %s. Test skipped\n", testCases[t].file); 419 continue; 420 } 421 422 int32_t targetLen = 0; 423 const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status); 424 425 /* The first line of the file contains the pattern */ 426 int32_t start = 0, end = 0, plen = 0; 427 428 for(end = start; ; end += 1) { 429 UChar ch = target[end]; 430 431 if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { 432 break; 433 } 434 } 435 436 plen = end - start; 437 438 UChar *pattern = NEW_ARRAY(UChar, plen); 439 for (int32_t i = 0; i < plen; i += 1) { 440 pattern[i] = target[start++]; 441 } 442 443 int32_t offset = 0; 444 UCollator *coll = ucol_open(testCases[t].locale, &status); 445 UCD *ucd = NULL; 446 BMS *bms = NULL; 447 448 if (U_FAILURE(status)) { 449 errln("Could not open collator for %s", testCases[t].locale); 450 goto delete_collator; 451 } 452 453 ucd = ucd_open(coll, &status); 454 455 if (U_FAILURE(status)) { 456 errln("Could not open CollData object for %s", testCases[t].locale); 457 goto delete_ucd; 458 } 459 460 bms = bms_open(ucd, pattern, plen, target, targetLen, &status); 461 462 if (U_FAILURE(status)) { 463 errln("Could not open search object for %s", testCases[t].locale); 464 goto delete_bms; 465 } 466 467 start = end = -1; 468 while (bms_search(bms, offset, &start, &end)) { 469 offset = end; 470 } 471 472 if (offset == 0) { 473 errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file); 474 } 475 476 delete_bms: 477 bms_close(bms); 478 479 delete_ucd: 480 ucd_close(ucd); 481 482 delete_collator: 483 ucol_close(coll); 484 485 DELETE_ARRAY(pattern); 486 ucbuf_close(ucharBuf); 487 } 488 489 ucd_flushCache(); 490 } 491 492 void SSearchTest::bmSearchTest() 493 { 494 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 495 UErrorCode status = U_ZERO_ERROR; 496 char path[PATH_BUFFER_SIZE]; 497 const char *testFilePath = getPath(path, "ssearch.xml"); 498 499 if (testFilePath == NULL) { 500 return; /* Couldn't get path: error message already output. */ 501 } 502 503 UXMLParser *parser = UXMLParser::createParser(status); 504 TEST_ASSERT_SUCCESS(status); 505 UXMLElement *root = parser->parseFile(testFilePath, status); 506 TEST_ASSERT_SUCCESS(status); 507 if (U_FAILURE(status)) { 508 return; 509 } 510 511 const UnicodeString *debugTestCase = root->getAttribute("debug"); 512 if (debugTestCase != NULL) { 513 // setenv("USEARCH_DEBUG", "1", 1); 514 } 515 516 517 const UXMLElement *testCase; 518 int32_t tc = 0; 519 520 while((testCase = root->nextChildElement(tc)) != NULL) { 521 522 if (testCase->getTagName().compare("test-case") != 0) { 523 errln("ssearch, unrecognized XML Element in test file"); 524 continue; 525 } 526 const UnicodeString *id = testCase->getAttribute("id"); 527 *testId = 0; 528 if (id != NULL) { 529 id->extract(0, id->length(), testId, sizeof(testId), US_INV); 530 } 531 532 // If debugging test case has been specified and this is not it, skip to next. 533 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { 534 continue; 535 } 536 // 537 // Get the requested collation strength. 538 // Default is tertiary if the XML attribute is missing from the test case. 539 // 540 const UnicodeString *strength = testCase->getAttribute("strength"); 541 UColAttributeValue collatorStrength; 542 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} 543 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} 544 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} 545 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} 546 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} 547 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} 548 else { 549 // Bogus value supplied for strength. Shouldn't happen, even from 550 // typos, if the XML source has been validated. 551 // This assert is a little deceiving in that strength can be 552 // any of the allowed values, not just TERTIARY, but it will 553 // do the job of getting the error output. 554 TEST_ASSERT(*strength=="TERTIARY") 555 } 556 557 // 558 // Get the collator normalization flag. Default is UCOL_OFF. 559 // 560 UColAttributeValue normalize = UCOL_OFF; 561 const UnicodeString *norm = testCase->getAttribute("norm"); 562 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); 563 if (norm!=NULL && *norm=="ON") { 564 normalize = UCOL_ON; 565 } 566 567 // 568 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 569 // 570 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; 571 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); 572 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); 573 if (alt != NULL && *alt == "SHIFTED") { 574 alternateHandling = UCOL_SHIFTED; 575 } 576 577 const UnicodeString defLocale("en"); 578 char clocale[100]; 579 const UnicodeString *locale = testCase->getAttribute("locale"); 580 if (locale == NULL || locale->length()==0) { 581 locale = &defLocale; 582 }; 583 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); 584 585 586 UnicodeString text; 587 UnicodeString target; 588 UnicodeString pattern; 589 int32_t expectedMatchStart = -1; 590 int32_t expectedMatchLimit = -1; 591 const UXMLElement *n; 592 int32_t nodeCount = 0; 593 594 n = testCase->getChildElement("pattern"); 595 TEST_ASSERT(n != NULL); 596 if (n==NULL) { 597 continue; 598 } 599 text = n->getText(FALSE); 600 text = text.unescape(); 601 pattern.append(text); 602 nodeCount++; 603 604 n = testCase->getChildElement("pre"); 605 if (n!=NULL) { 606 text = n->getText(FALSE); 607 text = text.unescape(); 608 target.append(text); 609 nodeCount++; 610 } 611 612 n = testCase->getChildElement("m"); 613 if (n!=NULL) { 614 expectedMatchStart = target.length(); 615 text = n->getText(FALSE); 616 text = text.unescape(); 617 target.append(text); 618 expectedMatchLimit = target.length(); 619 nodeCount++; 620 } 621 622 n = testCase->getChildElement("post"); 623 if (n!=NULL) { 624 text = n->getText(FALSE); 625 text = text.unescape(); 626 target.append(text); 627 nodeCount++; 628 } 629 630 // Check that there weren't extra things in the XML 631 TEST_ASSERT(nodeCount == testCase->countChildren()); 632 633 // Open a collator and StringSearch based on the parameters 634 // obtained from the XML. 635 // 636 status = U_ZERO_ERROR; 637 UCollator *collator = ucol_open(clocale, &status); 638 ucol_setStrength(collator, collatorStrength); 639 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); 640 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); 641 UCD *ucd = ucd_open(collator, &status); 642 BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status); 643 644 TEST_ASSERT_SUCCESS(status); 645 if (U_FAILURE(status)) { 646 bms_close(bms); 647 ucd_close(ucd); 648 ucol_close(collator); 649 continue; 650 } 651 652 int32_t foundStart = 0; 653 int32_t foundLimit = 0; 654 UBool foundMatch; 655 656 // 657 // Do the search, check the match result against the expected results. 658 // 659 foundMatch = bms_search(bms, 0, &foundStart, &foundLimit); 660 //TEST_ASSERT_SUCCESS(status); 661 if (foundMatch && expectedMatchStart < 0 || 662 foundStart != expectedMatchStart || 663 foundLimit != expectedMatchLimit) { 664 TEST_ASSERT(FALSE); // ouput generic error position 665 infoln("Found, expected match start = %d, %d \n" 666 "Found, expected match limit = %d, %d", 667 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 668 } 669 670 bms_close(bms); 671 ucd_close(ucd); 672 ucol_close(collator); 673 } 674 675 ucd_flushCache(); 676 delete root; 677 delete parser; 678 #endif 679 } 680 681 struct Order 682 { 683 int32_t order; 684 int32_t lowOffset; 685 int32_t highOffset; 686 }; 687 688 class OrderList 689 { 690 public: 691 OrderList(); 692 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); 693 ~OrderList(); 694 695 int32_t size(void) const; 696 void add(int32_t order, int32_t low, int32_t high); 697 const Order *get(int32_t index) const; 698 int32_t getLowOffset(int32_t index) const; 699 int32_t getHighOffset(int32_t index) const; 700 int32_t getOrder(int32_t index) const; 701 void reverse(void); 702 UBool compare(const OrderList &other) const; 703 UBool matchesAt(int32_t offset, const OrderList &other) const; 704 705 private: 706 Order *list; 707 int32_t listMax; 708 int32_t listSize; 709 }; 710 711 OrderList::OrderList() 712 : list(NULL), listSize(0), listMax(16) 713 { 714 list = new Order[listMax]; 715 } 716 717 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) 718 : list(NULL), listMax(16), listSize(0) 719 { 720 UErrorCode status = U_ZERO_ERROR; 721 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 722 uint32_t strengthMask = 0; 723 int32_t order, low, high; 724 725 switch (ucol_getStrength(coll)) 726 { 727 default: 728 strengthMask |= UCOL_TERTIARYORDERMASK; 729 /* fall through */ 730 731 case UCOL_SECONDARY: 732 strengthMask |= UCOL_SECONDARYORDERMASK; 733 /* fall through */ 734 735 case UCOL_PRIMARY: 736 strengthMask |= UCOL_PRIMARYORDERMASK; 737 } 738 739 list = new Order[listMax]; 740 741 ucol_setOffset(elems, stringOffset, &status); 742 743 do { 744 low = ucol_getOffset(elems); 745 order = ucol_next(elems, &status); 746 high = ucol_getOffset(elems); 747 748 if (order != UCOL_NULLORDER) { 749 order &= strengthMask; 750 } 751 752 if (order != UCOL_IGNORABLE) { 753 add(order, low, high); 754 } 755 } while (order != UCOL_NULLORDER); 756 757 ucol_closeElements(elems); 758 } 759 760 OrderList::~OrderList() 761 { 762 delete[] list; 763 } 764 765 void OrderList::add(int32_t order, int32_t low, int32_t high) 766 { 767 if (listSize >= listMax) { 768 listMax *= 2; 769 770 Order *newList = new Order[listMax]; 771 772 uprv_memcpy(newList, list, listSize * sizeof(Order)); 773 delete[] list; 774 list = newList; 775 } 776 777 list[listSize].order = order; 778 list[listSize].lowOffset = low; 779 list[listSize].highOffset = high; 780 781 listSize += 1; 782 } 783 784 const Order *OrderList::get(int32_t index) const 785 { 786 if (index >= listSize) { 787 return NULL; 788 } 789 790 return &list[index]; 791 } 792 793 int32_t OrderList::getLowOffset(int32_t index) const 794 { 795 const Order *order = get(index); 796 797 if (order != NULL) { 798 return order->lowOffset; 799 } 800 801 return -1; 802 } 803 804 int32_t OrderList::getHighOffset(int32_t index) const 805 { 806 const Order *order = get(index); 807 808 if (order != NULL) { 809 return order->highOffset; 810 } 811 812 return -1; 813 } 814 815 int32_t OrderList::getOrder(int32_t index) const 816 { 817 const Order *order = get(index); 818 819 if (order != NULL) { 820 return order->order; 821 } 822 823 return UCOL_NULLORDER; 824 } 825 826 int32_t OrderList::size() const 827 { 828 return listSize; 829 } 830 831 void OrderList::reverse() 832 { 833 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { 834 Order swap = list[b]; 835 836 list[b] = list[f]; 837 list[f] = swap; 838 } 839 } 840 841 UBool OrderList::compare(const OrderList &other) const 842 { 843 if (listSize != other.listSize) { 844 return FALSE; 845 } 846 847 for(int32_t i = 0; i < listSize; i += 1) { 848 if (list[i].order != other.list[i].order || 849 list[i].lowOffset != other.list[i].lowOffset || 850 list[i].highOffset != other.list[i].highOffset) { 851 return FALSE; 852 } 853 } 854 855 return TRUE; 856 } 857 858 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const 859 { 860 // NOTE: sizes include the NULLORDER, which we don't want to compare. 861 int32_t otherSize = other.size() - 1; 862 863 if (listSize - 1 - offset < otherSize) { 864 return FALSE; 865 } 866 867 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { 868 if (getOrder(i) != other.getOrder(j)) { 869 return FALSE; 870 } 871 } 872 873 return TRUE; 874 } 875 876 static char *printOffsets(char *buffer, OrderList &list) 877 { 878 int32_t size = list.size(); 879 char *s = buffer; 880 881 for(int32_t i = 0; i < size; i += 1) { 882 const Order *order = list.get(i); 883 884 if (i != 0) { 885 s += sprintf(s, ", "); 886 } 887 888 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); 889 } 890 891 return buffer; 892 } 893 894 static char *printOrders(char *buffer, OrderList &list) 895 { 896 int32_t size = list.size(); 897 char *s = buffer; 898 899 for(int32_t i = 0; i < size; i += 1) { 900 const Order *order = list.get(i); 901 902 if (i != 0) { 903 s += sprintf(s, ", "); 904 } 905 906 s += sprintf(s, "%8.8X", order->order); 907 } 908 909 return buffer; 910 } 911 912 void SSearchTest::offsetTest() 913 { 914 const char *test[] = { 915 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous 916 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. 917 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", 918 919 "\\ua191\\u16ef\\u2036\\u017a", 920 921 #if 0 922 // This results in a complex interaction between contraction, 923 // expansion and normalization that confuses the backwards offset fixups. 924 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 925 #endif 926 927 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 928 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", 929 930 "\\u02FE\\u02FF" 931 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" 932 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" 933 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" 934 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" 935 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", 936 937 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", 938 "abc\\u0E41\\u0301\\u0316", 939 "abc\\u0E41\\u0316\\u0301", 940 "\\u0E41\\u0301\\u0316", 941 "\\u0E41\\u0316\\u0301", 942 "a\\u0301\\u0316", 943 "a\\u0316\\u0301", 944 "\\uAC52\\uAC53", 945 "\\u34CA\\u34CB", 946 "\\u11ED\\u11EE", 947 "\\u30C3\\u30D0", 948 "p\\u00E9ch\\u00E9", 949 "a\\u0301\\u0325", 950 "a\\u0300\\u0325", 951 "a\\u0325\\u0300", 952 "A\\u0323\\u0300B", 953 "A\\u0300\\u0323B", 954 "A\\u0301\\u0323B", 955 "A\\u0302\\u0301\\u0323B", 956 "abc", 957 "ab\\u0300c", 958 "ab\\u0300\\u0323c", 959 " \\uD800\\uDC00\\uDC00", 960 "a\\uD800\\uDC00\\uDC00", 961 "A\\u0301\\u0301", 962 "A\\u0301\\u0323", 963 "A\\u0301\\u0323B", 964 "B\\u0301\\u0323C", 965 "A\\u0300\\u0323B", 966 "\\u0301A\\u0301\\u0301", 967 "abcd\\r\\u0301", 968 "p\\u00EAche", 969 "pe\\u0302che", 970 }; 971 972 int32_t testCount = ARRAY_SIZE(test); 973 UErrorCode status = U_ZERO_ERROR; 974 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); 975 if (U_FAILURE(status)) { 976 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status)); 977 return; 978 } 979 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... 980 // We could allocate one that's the right size by (CE_count * 10) + 2 981 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" 982 983 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); 984 985 for(int32_t i = 0; i < testCount; i += 1) { 986 UnicodeString ts = CharsToUnicodeString(test[i]); 987 CollationElementIterator *iter = col->createCollationElementIterator(ts); 988 OrderList forwardList; 989 OrderList backwardList; 990 int32_t order, low, high; 991 992 do { 993 low = iter->getOffset(); 994 order = iter->next(status); 995 high = iter->getOffset(); 996 997 forwardList.add(order, low, high); 998 } while (order != CollationElementIterator::NULLORDER); 999 1000 iter->reset(); 1001 iter->setOffset(ts.length(), status); 1002 1003 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); 1004 1005 do { 1006 high = iter->getOffset(); 1007 order = iter->previous(status); 1008 low = iter->getOffset(); 1009 1010 if (order == CollationElementIterator::NULLORDER) { 1011 break; 1012 } 1013 1014 backwardList.add(order, low, high); 1015 } while (TRUE); 1016 1017 backwardList.reverse(); 1018 1019 if (forwardList.compare(backwardList)) { 1020 logln("Works with \"%s\"", test[i]); 1021 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 1022 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 1023 1024 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 1025 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 1026 1027 logln(); 1028 } else { 1029 errln("Fails with \"%s\"", test[i]); 1030 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 1031 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 1032 1033 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 1034 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 1035 1036 infoln(); 1037 } 1038 delete iter; 1039 } 1040 delete col; 1041 } 1042 1043 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) 1044 { 1045 for(int32_t i = 0; i < string.length(); i += 1) { 1046 UChar32 ch = string.char32At(i); 1047 1048 if (ch >= 0x0020 && ch <= 0x007F) { 1049 if (ch == 0x005C) { 1050 buffer.append("\\\\"); 1051 } else { 1052 buffer.append(ch); 1053 } 1054 } else { 1055 char cbuffer[12]; 1056 1057 if (ch <= 0xFFFFL) { 1058 sprintf(cbuffer, "\\u%4.4X", ch); 1059 } else { 1060 sprintf(cbuffer, "\\U%8.8X", ch); 1061 } 1062 1063 buffer.append(cbuffer); 1064 } 1065 1066 if (ch >= 0x10000L) { 1067 i += 1; 1068 } 1069 } 1070 1071 return buffer; 1072 } 1073 1074 #if 1 1075 1076 struct PCE 1077 { 1078 uint64_t ce; 1079 int32_t lowOffset; 1080 int32_t highOffset; 1081 }; 1082 1083 class PCEList 1084 { 1085 public: 1086 PCEList(UCollator *coll, const UnicodeString &string); 1087 ~PCEList(); 1088 1089 int32_t size() const; 1090 1091 const PCE *get(int32_t index) const; 1092 1093 int32_t getLowOffset(int32_t index) const; 1094 int32_t getHighOffset(int32_t index) const; 1095 uint64_t getOrder(int32_t index) const; 1096 1097 UBool matchesAt(int32_t offset, const PCEList &other) const; 1098 1099 uint64_t operator[](int32_t index) const; 1100 1101 private: 1102 void add(uint64_t ce, int32_t low, int32_t high); 1103 1104 PCE *list; 1105 int32_t listMax; 1106 int32_t listSize; 1107 }; 1108 1109 PCEList::PCEList(UCollator *coll, const UnicodeString &string) 1110 { 1111 UErrorCode status = U_ZERO_ERROR; 1112 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 1113 uint64_t order; 1114 int32_t low, high; 1115 1116 list = new PCE[listMax]; 1117 1118 ucol_setOffset(elems, 0, &status); 1119 1120 do { 1121 order = ucol_nextProcessed(elems, &low, &high, &status); 1122 add(order, low, high); 1123 } while (order != UCOL_PROCESSED_NULLORDER); 1124 1125 ucol_closeElements(elems); 1126 } 1127 1128 PCEList::~PCEList() 1129 { 1130 delete[] list; 1131 } 1132 1133 void PCEList::add(uint64_t order, int32_t low, int32_t high) 1134 { 1135 if (listSize >= listMax) { 1136 listMax *= 2; 1137 1138 PCE *newList = new PCE[listMax]; 1139 1140 uprv_memcpy(newList, list, listSize * sizeof(Order)); 1141 delete[] list; 1142 list = newList; 1143 } 1144 1145 list[listSize].ce = order; 1146 list[listSize].lowOffset = low; 1147 list[listSize].highOffset = high; 1148 1149 listSize += 1; 1150 } 1151 1152 const PCE *PCEList::get(int32_t index) const 1153 { 1154 if (index >= listSize) { 1155 return NULL; 1156 } 1157 1158 return &list[index]; 1159 } 1160 1161 int32_t PCEList::getLowOffset(int32_t index) const 1162 { 1163 const PCE *pce = get(index); 1164 1165 if (pce != NULL) { 1166 return pce->lowOffset; 1167 } 1168 1169 return -1; 1170 } 1171 1172 int32_t PCEList::getHighOffset(int32_t index) const 1173 { 1174 const PCE *pce = get(index); 1175 1176 if (pce != NULL) { 1177 return pce->highOffset; 1178 } 1179 1180 return -1; 1181 } 1182 1183 uint64_t PCEList::getOrder(int32_t index) const 1184 { 1185 const PCE *pce = get(index); 1186 1187 if (pce != NULL) { 1188 return pce->ce; 1189 } 1190 1191 return UCOL_PROCESSED_NULLORDER; 1192 } 1193 1194 int32_t PCEList::size() const 1195 { 1196 return listSize; 1197 } 1198 1199 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const 1200 { 1201 // NOTE: sizes include the NULLORDER, which we don't want to compare. 1202 int32_t otherSize = other.size() - 1; 1203 1204 if (listSize - 1 - offset < otherSize) { 1205 return FALSE; 1206 } 1207 1208 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { 1209 if (getOrder(i) != other.getOrder(j)) { 1210 return FALSE; 1211 } 1212 } 1213 1214 return TRUE; 1215 } 1216 1217 uint64_t PCEList::operator[](int32_t index) const 1218 { 1219 return getOrder(index); 1220 } 1221 1222 void SSearchTest::boyerMooreTest() 1223 { 1224 UErrorCode status = U_ZERO_ERROR; 1225 UCollator *coll = NULL; 1226 CollData *data = NULL; 1227 UnicodeString lp = "fuss"; 1228 UnicodeString sp = "fu\\u00DF"; 1229 BoyerMooreSearch *longPattern = NULL; 1230 BoyerMooreSearch *shortPattern = NULL; 1231 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", 1232 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", 1233 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; 1234 int32_t start = -1, end = -1; 1235 1236 coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 1237 if (U_FAILURE(status)) { 1238 errcheckln(status, "Could not open collator. - %s", u_errorName(status)); 1239 return; 1240 } 1241 1242 data = CollData::open(coll, status); 1243 if (U_FAILURE(status)) { 1244 errln("Could not open CollData object."); 1245 goto close_data; 1246 } 1247 1248 1249 longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status); 1250 shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status); 1251 if (U_FAILURE(status)) { 1252 errln("Could not create pattern objects."); 1253 goto close_patterns; 1254 } 1255 1256 for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { 1257 UnicodeString target = targets[t].unescape(); 1258 1259 longPattern->setTargetString(&target, status); 1260 if (longPattern->search(0, start, end)) { 1261 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 1262 } else { 1263 errln("Test %d: did not find long pattern.", t); 1264 } 1265 1266 shortPattern->setTargetString(&target, status); 1267 if (shortPattern->search(0, start, end)) { 1268 logln("Test %d: found short pattern at [%d, %d].", t, start, end); 1269 } else { 1270 errln("Test %d: did not find short pattern.", t); 1271 } 1272 } 1273 1274 close_patterns: 1275 delete shortPattern; 1276 delete longPattern; 1277 1278 close_data: 1279 CollData::close(data); 1280 ucol_close(coll); 1281 } 1282 1283 void SSearchTest::bmsTest() 1284 { 1285 UErrorCode status = U_ZERO_ERROR; 1286 UCollator *coll = NULL; 1287 UCD *data = NULL; 1288 UnicodeString lp = "fuss"; 1289 UnicodeString lpu = lp.unescape(); 1290 UnicodeString sp = "fu\\u00DF"; 1291 UnicodeString spu = sp.unescape(); 1292 BMS *longPattern = NULL; 1293 BMS *shortPattern = NULL; 1294 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", 1295 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", 1296 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; 1297 int32_t start = -1, end = -1; 1298 1299 coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 1300 if (U_FAILURE(status)) { 1301 errcheckln(status, "Could not open collator. - %s", u_errorName(status)); 1302 return; 1303 } 1304 1305 data = ucd_open(coll, &status); 1306 if (U_FAILURE(status)) { 1307 errln("Could not open CollData object."); 1308 goto close_data; 1309 } 1310 1311 longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status); 1312 shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status); 1313 if (U_FAILURE(status)) { 1314 errln("Couldn't open pattern objects."); 1315 goto close_patterns; 1316 } 1317 1318 for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { 1319 UnicodeString target = targets[t].unescape(); 1320 1321 bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status); 1322 if (bms_search(longPattern, 0, &start, &end)) { 1323 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 1324 } else { 1325 errln("Test %d: did not find long pattern.", t); 1326 } 1327 1328 bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status); 1329 if (bms_search(shortPattern, 0, &start, &end)) { 1330 logln("Test %d: found short pattern at [%d, %d].", t, start, end); 1331 } else { 1332 errln("Test %d: did not find short pattern.", t); 1333 } 1334 } 1335 1336 close_patterns: 1337 bms_close(shortPattern); 1338 bms_close(longPattern); 1339 1340 close_data: 1341 ucd_close(data); 1342 ucol_close(coll); 1343 } 1344 1345 void SSearchTest::goodSuffixTest() 1346 { 1347 UErrorCode status = U_ZERO_ERROR; 1348 UCollator *coll = NULL; 1349 CollData *data = NULL; 1350 UnicodeString pat = /*"gcagagag"*/ "fxeld"; 1351 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; 1352 BoyerMooreSearch *pattern = NULL; 1353 int32_t start = -1, end = -1; 1354 1355 coll = ucol_open(NULL, &status); 1356 if (U_FAILURE(status)) { 1357 errcheckln(status, "Couldn't open collator. - %s", u_errorName(status)); 1358 return; 1359 } 1360 1361 data = CollData::open(coll, status); 1362 if (U_FAILURE(status)) { 1363 errln("Couldn't open CollData object."); 1364 goto close_data; 1365 } 1366 1367 pattern = new BoyerMooreSearch(data, pat, &target, status); 1368 if (U_FAILURE(status)) { 1369 errln("Couldn't open pattern object."); 1370 goto close_pattern; 1371 } 1372 1373 if (pattern->search(0, start, end)) { 1374 logln("Found pattern at [%d, %d].", start, end); 1375 } else { 1376 errln("Did not find pattern."); 1377 } 1378 1379 close_pattern: 1380 delete pattern; 1381 1382 close_data: 1383 CollData::close(data); 1384 ucol_close(coll); 1385 } 1386 1387 // 1388 // searchTime() A quick and dirty performance test for string search. 1389 // Probably doesn't really belong as part of intltest, but it 1390 // does check that the search succeeds, and gets the right result, 1391 // so it serves as a functionality test also. 1392 // 1393 // To run as a perf test, up the loop count, select by commenting 1394 // and uncommenting in the code the operation to be measured, 1395 // rebuild, and measure the running time of this test alone. 1396 // 1397 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime 1398 // 1399 void SSearchTest::searchTime() { 1400 static const char *longishText = 1401 "Whylom, as olde stories tellen us,\n" 1402 "Ther was a duk that highte Theseus:\n" 1403 "Of Athenes he was lord and governour,\n" 1404 "And in his tyme swich a conquerour,\n" 1405 "That gretter was ther noon under the sonne.\n" 1406 "Ful many a riche contree hadde he wonne;\n" 1407 "What with his wisdom and his chivalrye,\n" 1408 "He conquered al the regne of Femenye,\n" 1409 "That whylom was y-cleped Scithia;\n" 1410 "And weddede the quene Ipolita,\n" 1411 "And broghte hir hoom with him in his contree\n" 1412 "With muchel glorie and greet solempnitee,\n" 1413 "And eek hir yonge suster Emelye.\n" 1414 "And thus with victorie and with melodye\n" 1415 "Lete I this noble duk to Athenes ryde,\n" 1416 "And al his hoost, in armes, him bisyde.\n" 1417 "And certes, if it nere to long to here,\n" 1418 "I wolde han told yow fully the manere,\n" 1419 "How wonnen was the regne of Femenye\n" 1420 "By Theseus, and by his chivalrye;\n" 1421 "And of the grete bataille for the nones\n" 1422 "Bitwixen Athen's and Amazones;\n" 1423 "And how asseged was Ipolita,\n" 1424 "The faire hardy quene of Scithia;\n" 1425 "And of the feste that was at hir weddinge,\n" 1426 "And of the tempest at hir hoom-cominge;\n" 1427 "But al that thing I moot as now forbere.\n" 1428 "I have, God woot, a large feeld to ere,\n" 1429 "And wayke been the oxen in my plough.\n" 1430 "The remenant of the tale is long y-nough.\n" 1431 "I wol nat letten eek noon of this route;\n" 1432 "Lat every felawe telle his tale aboute,\n" 1433 "And lat see now who shal the soper winne;\n" 1434 "And ther I lefte, I wol ageyn biginne.\n" 1435 "This duk, of whom I make mencioun,\n" 1436 "When he was come almost unto the toun,\n" 1437 "In al his wele and in his moste pryde,\n" 1438 "He was war, as he caste his eye asyde,\n" 1439 "Wher that ther kneled in the hye weye\n" 1440 "A companye of ladies, tweye and tweye,\n" 1441 "Ech after other, clad in clothes blake; \n" 1442 "But swich a cry and swich a wo they make,\n" 1443 "That in this world nis creature livinge,\n" 1444 "That herde swich another weymentinge;\n" 1445 "And of this cry they nolde never stenten,\n" 1446 "Til they the reynes of his brydel henten.\n" 1447 "'What folk ben ye, that at myn hoomcominge\n" 1448 "Perturben so my feste with cryinge'?\n" 1449 "Quod Theseus, 'have ye so greet envye\n" 1450 "Of myn honour, that thus compleyne and crye? \n" 1451 "Or who hath yow misboden, or offended?\n" 1452 "And telleth me if it may been amended;\n" 1453 "And why that ye ben clothed thus in blak'?\n" 1454 "The eldest lady of hem alle spak,\n" 1455 "When she hadde swowned with a deedly chere,\n" 1456 "That it was routhe for to seen and here,\n" 1457 "And seyde: 'Lord, to whom Fortune hath yiven\n" 1458 "Victorie, and as a conquerour to liven,\n" 1459 "Noght greveth us your glorie and your honour;\n" 1460 "But we biseken mercy and socour.\n" 1461 "Have mercy on our wo and our distresse.\n" 1462 "Som drope of pitee, thurgh thy gentilesse,\n" 1463 "Up-on us wrecched wommen lat thou falle.\n" 1464 "For certes, lord, ther nis noon of us alle,\n" 1465 "That she nath been a duchesse or a quene;\n" 1466 "Now be we caitifs, as it is wel sene:\n" 1467 "Thanked be Fortune, and hir false wheel,\n" 1468 "That noon estat assureth to be weel.\n" 1469 "And certes, lord, t'abyden your presence,\n" 1470 "Here in the temple of the goddesse Clemence\n" 1471 "We han ben waytinge al this fourtenight;\n" 1472 "Now help us, lord, sith it is in thy might.\n" 1473 "I wrecche, which that wepe and waille thus,\n" 1474 "Was whylom wyf to king Capaneus,\n" 1475 "That starf at Thebes, cursed be that day!\n" 1476 "And alle we, that been in this array,\n" 1477 "And maken al this lamentacioun,\n" 1478 "We losten alle our housbondes at that toun,\n" 1479 "Whyl that the sege ther-aboute lay.\n" 1480 "And yet now th'olde Creon, weylaway!\n" 1481 "The lord is now of Thebes the citee, \n" 1482 "Fulfild of ire and of iniquitee,\n" 1483 "He, for despyt, and for his tirannye,\n" 1484 "To do the dede bodyes vileinye,\n" 1485 "Of alle our lordes, whiche that ben slawe,\n" 1486 "Hath alle the bodyes on an heep y-drawe,\n" 1487 "And wol nat suffren hem, by noon assent,\n" 1488 "Neither to been y-buried nor y-brent,\n" 1489 "But maketh houndes ete hem in despyt. zet'\n"; 1490 1491 #define TEST_BOYER_MOORE 1 1492 const char *cPattern = "maketh houndes ete hem"; 1493 //const char *cPattern = "Whylom"; 1494 //const char *cPattern = "zet"; 1495 const char *testId = "searchTime()"; // for error macros. 1496 UnicodeString target = longishText; 1497 UErrorCode status = U_ZERO_ERROR; 1498 1499 1500 UCollator *collator = ucol_open("en", &status); 1501 CollData *data = CollData::open(collator, status); 1502 if (U_FAILURE(status) || collator == NULL || data == NULL) { 1503 errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status)); 1504 return; 1505 } 1506 //ucol_setStrength(collator, collatorStrength); 1507 //ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); 1508 UnicodeString uPattern = cPattern; 1509 #ifndef TEST_BOYER_MOORE 1510 UStringSearch *uss = usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), 1511 target.getBuffer(), target.length(), 1512 collator, 1513 NULL, // the break iterator 1514 &status); 1515 TEST_ASSERT_SUCCESS(status); 1516 #else 1517 BoyerMooreSearch bms(data, uPattern, &target, status); 1518 TEST_ASSERT_SUCCESS(status); 1519 #endif 1520 1521 // int32_t foundStart; 1522 // int32_t foundEnd; 1523 UBool found; 1524 1525 // Find the match position usgin strstr 1526 const char *pm = strstr(longishText, cPattern); 1527 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); 1528 int32_t refMatchPos = (int32_t)(pm - longishText); 1529 int32_t icuMatchPos; 1530 int32_t icuMatchEnd; 1531 #ifndef TEST_BOYER_MOORE 1532 usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status); 1533 TEST_ASSERT_SUCCESS(status); 1534 #else 1535 found = bms.search(0, icuMatchPos, icuMatchEnd); 1536 #endif 1537 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); 1538 1539 int32_t i; 1540 int32_t j=0; 1541 1542 // Try loopcounts around 100000 to some millions, depending on the operation, 1543 // to get runtimes of at least several seconds. 1544 for (i=0; i<10000; i++) { 1545 #ifndef TEST_BOYER_MOORE 1546 found = usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status); 1547 #else 1548 found = bms.search(0, icuMatchPos, icuMatchEnd); 1549 #endif 1550 //TEST_ASSERT_SUCCESS(status); 1551 //TEST_ASSERT(found); 1552 1553 // usearch_setOffset(uss, 0, &status); 1554 // icuMatchPos = usearch_next(uss, &status); 1555 1556 // The i+j stuff is to confuse the optimizer and get it to actually leave the 1557 // call to strstr in place. 1558 //pm = strstr(longishText+j, cPattern); 1559 //j = (j + i)%5; 1560 } 1561 1562 printf("%d\n", pm-longishText, j); 1563 #ifndef TEST_BOYER_MOORE 1564 usearch_close(uss); 1565 #else 1566 CollData::close(data); 1567 #endif 1568 ucol_close(collator); 1569 } 1570 #endif 1571 1572 //---------------------------------------------------------------------------------------- 1573 // 1574 // Random Numbers. Similar to standard lib rand() and srand() 1575 // Not using library to 1576 // 1. Get same results on all platforms. 1577 // 2. Get access to current seed, to more easily reproduce failures. 1578 // 1579 //--------------------------------------------------------------------------------------- 1580 static uint32_t m_seed = 1; 1581 1582 static uint32_t m_rand() 1583 { 1584 m_seed = m_seed * 1103515245 + 12345; 1585 return (uint32_t)(m_seed/65536) % 32768; 1586 } 1587 1588 class Monkey 1589 { 1590 public: 1591 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; 1592 1593 protected: 1594 Monkey(); 1595 virtual ~Monkey(); 1596 }; 1597 1598 Monkey::Monkey() 1599 { 1600 // ook? 1601 } 1602 1603 Monkey::~Monkey() 1604 { 1605 // ook? 1606 } 1607 1608 class SetMonkey : public Monkey 1609 { 1610 public: 1611 SetMonkey(const USet *theSet); 1612 ~SetMonkey(); 1613 1614 virtual void append(UnicodeString &test, UnicodeString &alternate); 1615 1616 private: 1617 const USet *set; 1618 }; 1619 1620 SetMonkey::SetMonkey(const USet *theSet) 1621 : Monkey(), set(theSet) 1622 { 1623 // ook? 1624 } 1625 1626 SetMonkey::~SetMonkey() 1627 { 1628 //ook... 1629 } 1630 1631 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) 1632 { 1633 int32_t size = uset_size(set); 1634 int32_t index = m_rand() % size; 1635 UChar32 ch = uset_charAt(set, index); 1636 UnicodeString str(ch); 1637 1638 test.append(str); 1639 alternate.append(str); // flip case, or some junk? 1640 } 1641 1642 class StringSetMonkey : public Monkey 1643 { 1644 public: 1645 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); 1646 ~StringSetMonkey(); 1647 1648 void append(UnicodeString &testCase, UnicodeString &alternate); 1649 1650 private: 1651 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); 1652 1653 const USet *set; 1654 UCollator *coll; 1655 CollData *collData; 1656 }; 1657 1658 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) 1659 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) 1660 { 1661 // ook. 1662 } 1663 1664 StringSetMonkey::~StringSetMonkey() 1665 { 1666 // ook? 1667 } 1668 1669 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) 1670 { 1671 int32_t itemCount = uset_getItemCount(set), len = 0; 1672 int32_t index = m_rand() % itemCount; 1673 UChar32 rangeStart = 0, rangeEnd = 0; 1674 UChar buffer[16]; 1675 UErrorCode err = U_ZERO_ERROR; 1676 1677 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); 1678 1679 if (len == 0) { 1680 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); 1681 UChar32 ch = rangeStart + offset; 1682 UnicodeString str(ch); 1683 1684 testCase.append(str); 1685 generateAlternative(str, alternate); 1686 } else if (len > 0) { 1687 // should check that len < 16... 1688 UnicodeString str(buffer, len); 1689 1690 testCase.append(str); 1691 generateAlternative(str, alternate); 1692 } else { 1693 // shouldn't happen... 1694 } 1695 } 1696 1697 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) 1698 { 1699 // find out shortest string for the longest sequence of ces. 1700 // needs to be refined to use dynamic programming, but will be roughly right 1701 UErrorCode status = U_ZERO_ERROR; 1702 CEList ceList(coll, testCase, status); 1703 UnicodeString alt; 1704 int32_t offset = 0; 1705 1706 if (ceList.size() == 0) { 1707 return alternate.append(testCase); 1708 } 1709 1710 while (offset < ceList.size()) { 1711 int32_t ce = ceList.get(offset); 1712 const StringList *strings = collData->getStringList(ce); 1713 1714 if (strings == NULL) { 1715 return alternate.append(testCase); 1716 } 1717 1718 int32_t stringCount = strings->size(); 1719 int32_t tries = 0; 1720 1721 // find random string that generates the same CEList 1722 const CEList *ceList2 = NULL; 1723 const UnicodeString *string = NULL; 1724 UBool matches = FALSE; 1725 1726 do { 1727 int32_t s = m_rand() % stringCount; 1728 1729 if (tries++ > stringCount) { 1730 alternate.append(testCase); 1731 return alternate; 1732 } 1733 1734 string = strings->get(s); 1735 ceList2 = collData->getCEList(string); 1736 matches = ceList.matchesAt(offset, ceList2); 1737 1738 if (! matches) { 1739 collData->freeCEList((CEList *) ceList2); 1740 } 1741 } while (! matches); 1742 1743 alt.append(*string); 1744 offset += ceList2->size(); 1745 collData->freeCEList(ceList2); 1746 } 1747 1748 const CEList altCEs(coll, alt, status); 1749 1750 if (ceList.matchesAt(0, &altCEs)) { 1751 return alternate.append(alt); 1752 } 1753 1754 return alternate.append(testCase); 1755 } 1756 1757 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) 1758 { 1759 int32_t pieces = (m_rand() % 4) + 1; 1760 UErrorCode status = U_ZERO_ERROR; 1761 UBool matches; 1762 1763 do { 1764 testCase.remove(); 1765 alternate.remove(); 1766 monkeys[0]->append(testCase, alternate); 1767 1768 for(int32_t piece = 0; piece < pieces; piece += 1) { 1769 int32_t monkey = m_rand() % monkeyCount; 1770 1771 monkeys[monkey]->append(testCase, alternate); 1772 } 1773 1774 const CEList ceTest(coll, testCase, status); 1775 const CEList ceAlt(coll, alternate, status); 1776 1777 matches = ceTest.matchesAt(0, &ceAlt); 1778 } while (! matches); 1779 } 1780 1781 // 1782 // Find the next acceptable boundary following the specified starting index 1783 // in the target text being searched. 1784 // TODO: refine what is an acceptable boundary. For the moment, 1785 // choose the next position not within a combining sequence. 1786 // 1787 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) { 1788 const UChar *text = string.getBuffer(); 1789 int32_t textLen = string.length(); 1790 1791 if (startIndex >= textLen) { 1792 return startIndex; 1793 } 1794 1795 UChar32 c; 1796 int32_t i = startIndex; 1797 1798 U16_NEXT(text, i, textLen, c); 1799 1800 // If we are on a control character, stop without looking for combining marks. 1801 // Control characters do not combine. 1802 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1803 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 1804 return i; 1805 } 1806 1807 // The initial character was not a control, and can thus accept trailing 1808 // combining characters. Advance over however many of them there are. 1809 int32_t indexOfLastCharChecked; 1810 1811 for (;;) { 1812 indexOfLastCharChecked = i; 1813 1814 if (i>=textLen) { 1815 break; 1816 } 1817 1818 U16_NEXT(text, i, textLen, c); 1819 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1820 1821 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1822 break; 1823 } 1824 } 1825 1826 return indexOfLastCharChecked; 1827 } 1828 1829 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) { 1830 const UChar *text = string.getBuffer(); 1831 int32_t textLen = string.length(); 1832 1833 if (index>=textLen || index<=0) { 1834 return FALSE; 1835 } 1836 1837 // If the character at the current index is not a GRAPHEME_EXTEND 1838 // then we can not be within a combining sequence. 1839 UChar32 c; 1840 U16_GET(text, 0, index, textLen, c); 1841 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1842 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1843 return FALSE; 1844 } 1845 1846 // We are at a combining mark. If the preceding character is anything 1847 // except a CONTROL, CR or LF, we are in a combining sequence. 1848 U16_PREV(text, 0, index, c); 1849 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1850 1851 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 1852 } 1853 1854 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) 1855 { 1856 UErrorCode status = U_ZERO_ERROR; 1857 OrderList targetOrders(coll, target, offset); 1858 OrderList patternOrders(coll, pattern); 1859 int32_t targetSize = targetOrders.size() - 1; 1860 int32_t patternSize = patternOrders.size() - 1; 1861 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), 1862 target.getBuffer(), target.length(), &status); 1863 1864 if (patternSize == 0) { 1865 // Searching for an empty pattern always fails 1866 matchStart = matchEnd = -1; 1867 ubrk_close(charBreakIterator); 1868 return FALSE; 1869 } 1870 1871 matchStart = matchEnd = -1; 1872 1873 for(int32_t i = 0; i < targetSize; i += 1) { 1874 if (targetOrders.matchesAt(i, patternOrders)) { 1875 int32_t start = targetOrders.getLowOffset(i); 1876 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); 1877 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); 1878 1879 // if the low and high offsets of the first CE in 1880 // the match are the same, it means that the match 1881 // starts in the middle of an expansion - all but 1882 // the first CE of the expansion will have the offset 1883 // of the following character. 1884 if (start == targetOrders.getHighOffset(i)) { 1885 continue; 1886 } 1887 1888 // Make sure match starts on a grapheme boundary 1889 if (! ubrk_isBoundary(charBreakIterator, start)) { 1890 continue; 1891 } 1892 1893 // If the low and high offsets of the CE after the match 1894 // are the same, it means that the match ends in the middle 1895 // of an expansion sequence. 1896 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && 1897 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { 1898 continue; 1899 } 1900 1901 int32_t mend = maxLimit; 1902 1903 // Find the first grapheme break after the character index 1904 // of the last CE in the match. If it's after character index 1905 // that's after the last CE in the match, use that index 1906 // as the end of the match. 1907 if (minLimit < maxLimit) { 1908 int32_t nba = ubrk_following(charBreakIterator, minLimit); 1909 1910 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { 1911 mend = nba; 1912 } 1913 } 1914 1915 if (mend > maxLimit) { 1916 continue; 1917 } 1918 1919 if (! ubrk_isBoundary(charBreakIterator, mend)) { 1920 continue; 1921 } 1922 1923 matchStart = start; 1924 matchEnd = mend; 1925 1926 ubrk_close(charBreakIterator); 1927 return TRUE; 1928 } 1929 } 1930 1931 ubrk_close(charBreakIterator); 1932 return FALSE; 1933 } 1934 1935 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1936 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 1937 int32_t val = defaultVal; 1938 1939 name.append(" *= *(-?\\d+)"); 1940 1941 UErrorCode status = U_ZERO_ERROR; 1942 RegexMatcher m(name, params, 0, status); 1943 1944 if (m.find()) { 1945 // The param exists. Convert the string to an int. 1946 char valString[100]; 1947 int32_t paramLength = m.end(1, status) - m.start(1, status); 1948 1949 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 1950 paramLength = (int32_t)(sizeof(valString)-2); 1951 } 1952 1953 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 1954 val = strtol(valString, NULL, 10); 1955 1956 // Delete this parameter from the params string. 1957 m.reset(); 1958 params = m.replaceFirst("", status); 1959 } 1960 1961 //U_ASSERT(U_SUCCESS(status)); 1962 if (! U_SUCCESS(status)) { 1963 val = defaultVal; 1964 } 1965 1966 return val; 1967 } 1968 #endif 1969 1970 #if !UCONFIG_NO_COLLATION 1971 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, 1972 const char *name, const char *strength, uint32_t seed) 1973 { 1974 UErrorCode status = U_ZERO_ERROR; 1975 int32_t actualStart = -1, actualEnd = -1; 1976 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); 1977 int32_t expectedStart = -1, expectedEnd = -1; 1978 int32_t notFoundCount = 0; 1979 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 1980 testCase.getBuffer(), testCase.length(), 1981 coll, 1982 NULL, // the break iterator 1983 &status); 1984 1985 // **** TODO: find *all* matches, not just first one **** 1986 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); 1987 1988 usearch_search(uss, 0, &actualStart, &actualEnd, &status); 1989 1990 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 1991 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 1992 " strength=%s seed=%d", 1993 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 1994 } 1995 1996 if (expectedStart == -1 && actualStart == -1) { 1997 notFoundCount += 1; 1998 } 1999 2000 // **** TODO: find *all* matches, not just first one **** 2001 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); 2002 2003 usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status); 2004 2005 usearch_search(uss, 0, &actualStart, &actualEnd, &status); 2006 2007 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2008 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2009 " strength=%s seed=%d", 2010 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2011 } 2012 2013 if (expectedStart == -1 && actualStart == -1) { 2014 notFoundCount += 1; 2015 } 2016 2017 usearch_close(uss); 2018 2019 return notFoundCount; 2020 } 2021 2022 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, 2023 BoyerMooreSearch *bms, BoyerMooreSearch *abms, 2024 const char *name, const char *strength, uint32_t seed) 2025 { 2026 UErrorCode status = U_ZERO_ERROR; 2027 int32_t actualStart = -1, actualEnd = -1; 2028 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); 2029 int32_t expectedStart = -1, expectedEnd = -1; 2030 int32_t notFoundCount = 0; 2031 2032 // **** TODO: find *all* matches, not just first one **** 2033 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); 2034 2035 bms->setTargetString(&testCase, status); 2036 bms->search(0, actualStart, actualEnd); 2037 2038 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2039 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2040 " strength=%s seed=%d", 2041 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2042 } 2043 2044 if (expectedStart == -1 && actualStart == -1) { 2045 notFoundCount += 1; 2046 } 2047 2048 // **** TODO: find *all* matches, not just first one **** 2049 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); 2050 2051 abms->setTargetString(&testCase, status); 2052 abms->search(0, actualStart, actualEnd); 2053 2054 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 2055 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 2056 " strength=%s seed=%d", 2057 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 2058 } 2059 2060 if (expectedStart == -1 && actualStart == -1) { 2061 notFoundCount += 1; 2062 } 2063 2064 2065 return notFoundCount; 2066 } 2067 #endif 2068 2069 void SSearchTest::monkeyTest(char *params) 2070 { 2071 // ook! 2072 UErrorCode status = U_ZERO_ERROR; 2073 //UCollator *coll = ucol_open(NULL, &status); 2074 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 2075 2076 if (U_FAILURE(status)) { 2077 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); 2078 return; 2079 } 2080 2081 CollData *monkeyData = CollData::open(coll, status); 2082 2083 USet *expansions = uset_openEmpty(); 2084 USet *contractions = uset_openEmpty(); 2085 2086 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 2087 2088 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2089 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2090 USet *letters = uset_openPattern(letter_pattern, 39, &status); 2091 SetMonkey letterMonkey(letters); 2092 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); 2093 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); 2094 UnicodeString testCase; 2095 UnicodeString alternate; 2096 UnicodeString pattern, altPattern; 2097 UnicodeString prefix, altPrefix; 2098 UnicodeString suffix, altSuffix; 2099 2100 Monkey *monkeys[] = { 2101 &letterMonkey, 2102 &contractionMonkey, 2103 &expansionMonkey, 2104 &contractionMonkey, 2105 &expansionMonkey, 2106 &contractionMonkey, 2107 &expansionMonkey, 2108 &contractionMonkey, 2109 &expansionMonkey}; 2110 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); 2111 int32_t nonMatchCount = 0; 2112 2113 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; 2114 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; 2115 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); 2116 int32_t loopCount = quick? 1000 : 10000; 2117 int32_t firstStrength = 0; 2118 int32_t lastStrength = strengthCount - 1; //*/ 0; 2119 2120 if (params != NULL) { 2121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2122 UnicodeString p(params); 2123 2124 loopCount = getIntParam("loop", p, loopCount); 2125 m_seed = getIntParam("seed", p, m_seed); 2126 2127 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); 2128 if (m.find()) { 2129 UnicodeString breakType = m.group(1, status); 2130 2131 for (int32_t s = 0; s < strengthCount; s += 1) { 2132 if (breakType == strengthNames[s]) { 2133 firstStrength = lastStrength = s; 2134 break; 2135 } 2136 } 2137 2138 m.reset(); 2139 p = m.replaceFirst("", status); 2140 } 2141 2142 if (RegexMatcher("\\S", p, 0, status).find()) { 2143 // Each option is stripped out of the option string as it is processed. 2144 // All options have been checked. The option string should have been completely emptied.. 2145 char buf[100]; 2146 p.extract(buf, sizeof(buf), NULL, status); 2147 buf[sizeof(buf)-1] = 0; 2148 errln("Unrecognized or extra parameter: %s\n", buf); 2149 return; 2150 } 2151 #else 2152 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); 2153 #endif 2154 } 2155 2156 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { 2157 int32_t notFoundCount = 0; 2158 2159 logln("Setting strength to %s.", strengthNames[s]); 2160 ucol_setStrength(coll, strengths[s]); 2161 2162 // TODO: try alternate prefix and suffix too? 2163 // TODO: alterntaes are only equal at primary strength. Is this OK? 2164 for(int32_t t = 0; t < loopCount; t += 1) { 2165 uint32_t seed = m_seed; 2166 int32_t nmc = 0; 2167 2168 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); 2169 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); 2170 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); 2171 2172 // pattern 2173 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); 2174 2175 testCase.remove(); 2176 testCase.append(prefix); 2177 testCase.append(/*alt*/pattern); 2178 2179 // prefix + pattern 2180 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); 2181 2182 testCase.append(suffix); 2183 2184 // prefix + pattern + suffix 2185 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); 2186 2187 testCase.remove(); 2188 testCase.append(pattern); 2189 testCase.append(suffix); 2190 2191 // pattern + suffix 2192 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); 2193 } 2194 2195 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); 2196 } 2197 2198 uset_close(contractions); 2199 uset_close(expansions); 2200 uset_close(letters); 2201 2202 CollData::close(monkeyData); 2203 2204 ucol_close(coll); 2205 } 2206 2207 void SSearchTest::bmMonkeyTest(char *params) 2208 { 2209 // ook! 2210 UErrorCode status = U_ZERO_ERROR; 2211 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 2212 2213 if (U_FAILURE(status)) { 2214 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); 2215 return; 2216 } 2217 2218 CollData *monkeyData = CollData::open(coll, status); 2219 2220 USet *expansions = uset_openEmpty(); 2221 USet *contractions = uset_openEmpty(); 2222 2223 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 2224 2225 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2226 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 2227 USet *letters = uset_openPattern(letter_pattern, 39, &status); 2228 SetMonkey letterMonkey(letters); 2229 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); 2230 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); 2231 UnicodeString testCase; 2232 UnicodeString alternate; 2233 UnicodeString pattern, altPattern; 2234 UnicodeString prefix, altPrefix; 2235 UnicodeString suffix, altSuffix; 2236 2237 Monkey *monkeys[] = { 2238 &letterMonkey, 2239 &contractionMonkey, 2240 &expansionMonkey, 2241 &contractionMonkey, 2242 &expansionMonkey, 2243 &contractionMonkey, 2244 &expansionMonkey, 2245 &contractionMonkey, 2246 &expansionMonkey}; 2247 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); 2248 int32_t nonMatchCount = 0; 2249 2250 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; 2251 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; 2252 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); 2253 int32_t loopCount = quick? 1000 : 10000; 2254 int32_t firstStrength = 0; 2255 int32_t lastStrength = strengthCount - 1; //*/ 0; 2256 2257 if (params != NULL) { 2258 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 2259 UnicodeString p(params); 2260 2261 loopCount = getIntParam("loop", p, loopCount); 2262 m_seed = getIntParam("seed", p, m_seed); 2263 2264 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); 2265 if (m.find()) { 2266 UnicodeString breakType = m.group(1, status); 2267 2268 for (int32_t s = 0; s < strengthCount; s += 1) { 2269 if (breakType == strengthNames[s]) { 2270 firstStrength = lastStrength = s; 2271 break; 2272 } 2273 } 2274 2275 m.reset(); 2276 p = m.replaceFirst("", status); 2277 } 2278 2279 if (RegexMatcher("\\S", p, 0, status).find()) { 2280 // Each option is stripped out of the option string as it is processed. 2281 // All options have been checked. The option string should have been completely emptied.. 2282 char buf[100]; 2283 p.extract(buf, sizeof(buf), NULL, status); 2284 buf[sizeof(buf)-1] = 0; 2285 errln("Unrecognized or extra parameter: %s\n", buf); 2286 return; 2287 } 2288 #else 2289 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); 2290 #endif 2291 } 2292 2293 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { 2294 int32_t notFoundCount = 0; 2295 2296 logln("Setting strength to %s.", strengthNames[s]); 2297 ucol_setStrength(coll, strengths[s]); 2298 2299 CollData *data = CollData::open(coll, status); 2300 2301 // TODO: try alternate prefix and suffix too? 2302 // TODO: alterntaes are only equal at primary strength. Is this OK? 2303 for(int32_t t = 0; t < loopCount; t += 1) { 2304 uint32_t seed = m_seed; 2305 int32_t nmc = 0; 2306 2307 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); 2308 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); 2309 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); 2310 2311 BoyerMooreSearch pat(data, pattern, NULL, status); 2312 BoyerMooreSearch alt(data, altPattern, NULL, status); 2313 2314 // **** need a better way to deal with this **** 2315 #if 0 2316 if (pat.empty() || 2317 alt.empty()) { 2318 continue; 2319 } 2320 #endif 2321 2322 // pattern 2323 notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed); 2324 2325 testCase.remove(); 2326 testCase.append(prefix); 2327 testCase.append(/*alt*/pattern); 2328 2329 // prefix + pattern 2330 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed); 2331 2332 testCase.append(suffix); 2333 2334 // prefix + pattern + suffix 2335 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed); 2336 2337 testCase.remove(); 2338 testCase.append(pattern); 2339 testCase.append(suffix); 2340 2341 // pattern + suffix 2342 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed); 2343 } 2344 2345 CollData::close(data); 2346 2347 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); 2348 } 2349 2350 uset_close(contractions); 2351 uset_close(expansions); 2352 uset_close(letters); 2353 2354 CollData::close(monkeyData); 2355 2356 ucol_close(coll); 2357 } 2358 2359 #endif 2360 2361 #endif 2362