1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION 13 14 #include "cmemory.h" 15 #include "cstring.h" 16 #include "usrchimp.h" 17 18 #include "unicode/coll.h" 19 #include "unicode/tblcoll.h" 20 #include "unicode/usearch.h" 21 #include "unicode/uset.h" 22 #include "unicode/ustring.h" 23 24 #include "unicode/coleitr.h" 25 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 26 27 #include "colldata.h" 28 #include "ssearch.h" 29 #include "xmlparser.h" 30 31 #include <stdio.h> // for sprintf 32 33 char testId[100]; 34 35 #define TEST_ASSERT(x) {if (!(x)) { \ 36 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}} 37 38 #define TEST_ASSERT_M(x, m) {if (!(x)) { \ 39 dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}} 40 41 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \ 42 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \ 43 __FILE__, __LINE__, testId, u_errorName(errcode));}} 44 45 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 46 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 47 48 //--------------------------------------------------------------------------- 49 // 50 // Test class boilerplate 51 // 52 //--------------------------------------------------------------------------- 53 SSearchTest::SSearchTest() 54 { 55 } 56 57 SSearchTest::~SSearchTest() 58 { 59 } 60 61 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) 62 { 63 if (exec) logln("TestSuite SSearchTest: "); 64 switch (index) { 65 #if !UCONFIG_NO_BREAK_ITERATION 66 case 0: name = "searchTest"; 67 if (exec) searchTest(); 68 break; 69 70 case 1: name = "offsetTest"; 71 if (exec) offsetTest(); 72 break; 73 74 case 2: name = "monkeyTest"; 75 if (exec) monkeyTest(params); 76 break; 77 78 case 3: name = "sharpSTest"; 79 if (exec) sharpSTest(); 80 break; 81 82 case 4: name = "goodSuffixTest"; 83 if (exec) goodSuffixTest(); 84 break; 85 86 case 5: name = "searchTime"; 87 if (exec) searchTime(); 88 break; 89 #endif 90 default: name = ""; 91 break; //needed to end loop 92 } 93 } 94 95 96 #if !UCONFIG_NO_BREAK_ITERATION 97 98 #define PATH_BUFFER_SIZE 2048 99 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { 100 UErrorCode status = U_ZERO_ERROR; 101 const char *testDataDirectory = IntlTest::getSourceTestData(status); 102 103 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { 104 errln("ERROR: getPath() failed - %s", u_errorName(status)); 105 return NULL; 106 } 107 108 strcpy(buffer, testDataDirectory); 109 strcat(buffer, filename); 110 return buffer; 111 } 112 113 114 void SSearchTest::searchTest() 115 { 116 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 117 UErrorCode status = U_ZERO_ERROR; 118 char path[PATH_BUFFER_SIZE]; 119 const char *testFilePath = getPath(path, "ssearch.xml"); 120 121 if (testFilePath == NULL) { 122 return; /* Couldn't get path: error message already output. */ 123 } 124 125 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); 126 TEST_ASSERT_SUCCESS(status); 127 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); 128 TEST_ASSERT_SUCCESS(status); 129 if (U_FAILURE(status)) { 130 return; 131 } 132 133 const UnicodeString *debugTestCase = root->getAttribute("debug"); 134 if (debugTestCase != NULL) { 135 // setenv("USEARCH_DEBUG", "1", 1); 136 } 137 138 139 const UXMLElement *testCase; 140 int32_t tc = 0; 141 142 while((testCase = root->nextChildElement(tc)) != NULL) { 143 144 if (testCase->getTagName().compare("test-case") != 0) { 145 errln("ssearch, unrecognized XML Element in test file"); 146 continue; 147 } 148 const UnicodeString *id = testCase->getAttribute("id"); 149 *testId = 0; 150 if (id != NULL) { 151 id->extract(0, id->length(), testId, sizeof(testId), US_INV); 152 } 153 154 // If debugging test case has been specified and this is not it, skip to next. 155 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { 156 continue; 157 } 158 // 159 // Get the requested collation strength. 160 // Default is tertiary if the XML attribute is missing from the test case. 161 // 162 const UnicodeString *strength = testCase->getAttribute("strength"); 163 UColAttributeValue collatorStrength = UCOL_PRIMARY; 164 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} 165 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} 166 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} 167 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} 168 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} 169 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} 170 else { 171 // Bogus value supplied for strength. Shouldn't happen, even from 172 // typos, if the XML source has been validated. 173 // This assert is a little deceiving in that strength can be 174 // any of the allowed values, not just TERTIARY, but it will 175 // do the job of getting the error output. 176 TEST_ASSERT(*strength=="TERTIARY") 177 } 178 179 // 180 // Get the collator normalization flag. Default is UCOL_OFF. 181 // 182 UColAttributeValue normalize = UCOL_OFF; 183 const UnicodeString *norm = testCase->getAttribute("norm"); 184 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); 185 if (norm!=NULL && *norm=="ON") { 186 normalize = UCOL_ON; 187 } 188 189 // 190 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. 191 // 192 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; 193 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); 194 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); 195 if (alt != NULL && *alt == "SHIFTED") { 196 alternateHandling = UCOL_SHIFTED; 197 } 198 199 const UnicodeString defLocale("en"); 200 char clocale[100]; 201 const UnicodeString *locale = testCase->getAttribute("locale"); 202 if (locale == NULL || locale->length()==0) { 203 locale = &defLocale; 204 }; 205 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); 206 207 208 UnicodeString text; 209 UnicodeString target; 210 UnicodeString pattern; 211 int32_t expectedMatchStart = -1; 212 int32_t expectedMatchLimit = -1; 213 const UXMLElement *n; 214 int32_t nodeCount = 0; 215 216 n = testCase->getChildElement("pattern"); 217 TEST_ASSERT(n != NULL); 218 if (n==NULL) { 219 continue; 220 } 221 text = n->getText(FALSE); 222 text = text.unescape(); 223 pattern.append(text); 224 nodeCount++; 225 226 n = testCase->getChildElement("pre"); 227 if (n!=NULL) { 228 text = n->getText(FALSE); 229 text = text.unescape(); 230 target.append(text); 231 nodeCount++; 232 } 233 234 n = testCase->getChildElement("m"); 235 if (n!=NULL) { 236 expectedMatchStart = target.length(); 237 text = n->getText(FALSE); 238 text = text.unescape(); 239 target.append(text); 240 expectedMatchLimit = target.length(); 241 nodeCount++; 242 } 243 244 n = testCase->getChildElement("post"); 245 if (n!=NULL) { 246 text = n->getText(FALSE); 247 text = text.unescape(); 248 target.append(text); 249 nodeCount++; 250 } 251 252 // Check that there weren't extra things in the XML 253 TEST_ASSERT(nodeCount == testCase->countChildren()); 254 255 // Open a collator and StringSearch based on the parameters 256 // obtained from the XML. 257 // 258 status = U_ZERO_ERROR; 259 LocalUCollatorPointer collator(ucol_open(clocale, &status)); 260 ucol_setStrength(collator.getAlias(), collatorStrength); 261 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); 262 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status); 263 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 264 target.getBuffer(), target.length(), 265 collator.getAlias(), 266 NULL, // the break iterator 267 &status)); 268 269 TEST_ASSERT_SUCCESS(status); 270 if (U_FAILURE(status)) { 271 continue; 272 } 273 274 int32_t foundStart = 0; 275 int32_t foundLimit = 0; 276 UBool foundMatch; 277 278 // 279 // Do the search, check the match result against the expected results. 280 // 281 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status); 282 TEST_ASSERT_SUCCESS(status); 283 if ((foundMatch && expectedMatchStart<0) || 284 (foundStart != expectedMatchStart) || 285 (foundLimit != expectedMatchLimit)) { 286 TEST_ASSERT(FALSE); // ouput generic error position 287 infoln("Found, expected match start = %d, %d \n" 288 "Found, expected match limit = %d, %d", 289 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 290 } 291 292 // In case there are other matches... 293 // (should we only do this if the test case passed?) 294 while (foundMatch) { 295 expectedMatchStart = foundStart; 296 expectedMatchLimit = foundLimit; 297 298 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status); 299 } 300 301 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 302 target.getBuffer(), target.length(), 303 collator.getAlias(), 304 NULL, 305 &status)); 306 307 // 308 // Do the backwards search, check the match result against the expected results. 309 // 310 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status); 311 TEST_ASSERT_SUCCESS(status); 312 if ((foundMatch && expectedMatchStart<0) || 313 (foundStart != expectedMatchStart) || 314 (foundLimit != expectedMatchLimit)) { 315 TEST_ASSERT(FALSE); // ouput generic error position 316 infoln("Found, expected backwards match start = %d, %d \n" 317 "Found, expected backwards match limit = %d, %d", 318 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); 319 } 320 } 321 #endif 322 } 323 324 struct Order 325 { 326 int32_t order; 327 int32_t lowOffset; 328 int32_t highOffset; 329 }; 330 331 class OrderList 332 { 333 public: 334 OrderList(); 335 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); 336 ~OrderList(); 337 338 int32_t size(void) const; 339 void add(int32_t order, int32_t low, int32_t high); 340 const Order *get(int32_t index) const; 341 int32_t getLowOffset(int32_t index) const; 342 int32_t getHighOffset(int32_t index) const; 343 int32_t getOrder(int32_t index) const; 344 void reverse(void); 345 UBool compare(const OrderList &other) const; 346 UBool matchesAt(int32_t offset, const OrderList &other) const; 347 348 private: 349 Order *list; 350 int32_t listMax; 351 int32_t listSize; 352 }; 353 354 OrderList::OrderList() 355 : list(NULL), listMax(16), listSize(0) 356 { 357 list = new Order[listMax]; 358 } 359 360 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) 361 : list(NULL), listMax(16), listSize(0) 362 { 363 UErrorCode status = U_ZERO_ERROR; 364 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 365 uint32_t strengthMask = 0; 366 int32_t order, low, high; 367 368 switch (ucol_getStrength(coll)) 369 { 370 default: 371 strengthMask |= UCOL_TERTIARYORDERMASK; 372 U_FALLTHROUGH; 373 case UCOL_SECONDARY: 374 strengthMask |= UCOL_SECONDARYORDERMASK; 375 U_FALLTHROUGH; 376 case UCOL_PRIMARY: 377 strengthMask |= UCOL_PRIMARYORDERMASK; 378 } 379 380 list = new Order[listMax]; 381 382 ucol_setOffset(elems, stringOffset, &status); 383 384 do { 385 low = ucol_getOffset(elems); 386 order = ucol_next(elems, &status); 387 high = ucol_getOffset(elems); 388 389 if (order != UCOL_NULLORDER) { 390 order &= strengthMask; 391 } 392 393 if (order != UCOL_IGNORABLE) { 394 add(order, low, high); 395 } 396 } while (order != UCOL_NULLORDER); 397 398 ucol_closeElements(elems); 399 } 400 401 OrderList::~OrderList() 402 { 403 delete[] list; 404 } 405 406 void OrderList::add(int32_t order, int32_t low, int32_t high) 407 { 408 if (listSize >= listMax) { 409 listMax *= 2; 410 411 Order *newList = new Order[listMax]; 412 413 uprv_memcpy(newList, list, listSize * sizeof(Order)); 414 delete[] list; 415 list = newList; 416 } 417 418 list[listSize].order = order; 419 list[listSize].lowOffset = low; 420 list[listSize].highOffset = high; 421 422 listSize += 1; 423 } 424 425 const Order *OrderList::get(int32_t index) const 426 { 427 if (index >= listSize) { 428 return NULL; 429 } 430 431 return &list[index]; 432 } 433 434 int32_t OrderList::getLowOffset(int32_t index) const 435 { 436 const Order *order = get(index); 437 438 if (order != NULL) { 439 return order->lowOffset; 440 } 441 442 return -1; 443 } 444 445 int32_t OrderList::getHighOffset(int32_t index) const 446 { 447 const Order *order = get(index); 448 449 if (order != NULL) { 450 return order->highOffset; 451 } 452 453 return -1; 454 } 455 456 int32_t OrderList::getOrder(int32_t index) const 457 { 458 const Order *order = get(index); 459 460 if (order != NULL) { 461 return order->order; 462 } 463 464 return UCOL_NULLORDER; 465 } 466 467 int32_t OrderList::size() const 468 { 469 return listSize; 470 } 471 472 void OrderList::reverse() 473 { 474 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { 475 Order swap = list[b]; 476 477 list[b] = list[f]; 478 list[f] = swap; 479 } 480 } 481 482 UBool OrderList::compare(const OrderList &other) const 483 { 484 if (listSize != other.listSize) { 485 return FALSE; 486 } 487 488 for(int32_t i = 0; i < listSize; i += 1) { 489 if (list[i].order != other.list[i].order || 490 list[i].lowOffset != other.list[i].lowOffset || 491 list[i].highOffset != other.list[i].highOffset) { 492 return FALSE; 493 } 494 } 495 496 return TRUE; 497 } 498 499 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const 500 { 501 // NOTE: sizes include the NULLORDER, which we don't want to compare. 502 int32_t otherSize = other.size() - 1; 503 504 if (listSize - 1 - offset < otherSize) { 505 return FALSE; 506 } 507 508 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { 509 if (getOrder(i) != other.getOrder(j)) { 510 return FALSE; 511 } 512 } 513 514 return TRUE; 515 } 516 517 static char *printOffsets(char *buffer, OrderList &list) 518 { 519 int32_t size = list.size(); 520 char *s = buffer; 521 522 for(int32_t i = 0; i < size; i += 1) { 523 const Order *order = list.get(i); 524 525 if (i != 0) { 526 s += sprintf(s, ", "); 527 } 528 529 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); 530 } 531 532 return buffer; 533 } 534 535 static char *printOrders(char *buffer, OrderList &list) 536 { 537 int32_t size = list.size(); 538 char *s = buffer; 539 540 for(int32_t i = 0; i < size; i += 1) { 541 const Order *order = list.get(i); 542 543 if (i != 0) { 544 s += sprintf(s, ", "); 545 } 546 547 s += sprintf(s, "%8.8X", order->order); 548 } 549 550 return buffer; 551 } 552 553 void SSearchTest::offsetTest() 554 { 555 const char *test[] = { 556 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous 557 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. 558 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", 559 560 "\\ua191\\u16ef\\u2036\\u017a", 561 562 #if 0 563 // This results in a complex interaction between contraction, 564 // expansion and normalization that confuses the backwards offset fixups. 565 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 566 #endif 567 568 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", 569 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", 570 571 "\\u02FE\\u02FF" 572 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" 573 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" 574 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" 575 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" 576 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 577 578 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081 579 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 580 "a\\u02FF\\u0316\\u0301", 581 "a\\u0430\\u0301\\u0316", 582 "a\\u0430\\u0316\\u0301", 583 "abc\\u0E41\\u0301\\u0316", 584 "abc\\u0E41\\u0316\\u0301", 585 "\\u0E41\\u0301\\u0316", 586 "\\u0E41\\u0316\\u0301", 587 "a\\u0301\\u0316", 588 "a\\u0316\\u0301", 589 "\\uAC52\\uAC53", 590 "\\u34CA\\u34CB", 591 "\\u11ED\\u11EE", 592 "\\u30C3\\u30D0", 593 "p\\u00E9ch\\u00E9", 594 "a\\u0301\\u0325", 595 "a\\u0300\\u0325", 596 "a\\u0325\\u0300", 597 "A\\u0323\\u0300B", 598 "A\\u0300\\u0323B", 599 "A\\u0301\\u0323B", 600 "A\\u0302\\u0301\\u0323B", 601 "abc", 602 "ab\\u0300c", 603 "ab\\u0300\\u0323c", 604 " \\uD800\\uDC00\\uDC00", 605 "a\\uD800\\uDC00\\uDC00", 606 "A\\u0301\\u0301", 607 "A\\u0301\\u0323", 608 "A\\u0301\\u0323B", 609 "B\\u0301\\u0323C", 610 "A\\u0300\\u0323B", 611 "\\u0301A\\u0301\\u0301", 612 "abcd\\r\\u0301", 613 "p\\u00EAche", 614 "pe\\u0302che", 615 }; 616 617 int32_t testCount = UPRV_LENGTHOF(test); 618 UErrorCode status = U_ZERO_ERROR; 619 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); 620 if (U_FAILURE(status)) { 621 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status)); 622 return; 623 } 624 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... 625 // We could allocate one that's the right size by (CE_count * 10) + 2 626 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" 627 628 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); 629 630 for(int32_t i = 0; i < testCount; i += 1) { 631 UnicodeString ts = CharsToUnicodeString(test[i]); 632 CollationElementIterator *iter = col->createCollationElementIterator(ts); 633 OrderList forwardList; 634 OrderList backwardList; 635 int32_t order, low, high; 636 637 do { 638 low = iter->getOffset(); 639 order = iter->next(status); 640 high = iter->getOffset(); 641 642 forwardList.add(order, low, high); 643 } while (order != CollationElementIterator::NULLORDER); 644 645 iter->reset(); 646 iter->setOffset(ts.length(), status); 647 648 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); 649 650 do { 651 high = iter->getOffset(); 652 order = iter->previous(status); 653 low = iter->getOffset(); 654 655 if (order == CollationElementIterator::NULLORDER) { 656 break; 657 } 658 659 backwardList.add(order, low, high); 660 } while (TRUE); 661 662 backwardList.reverse(); 663 664 if (forwardList.compare(backwardList)) { 665 logln("Works with \"%s\"", test[i]); 666 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 667 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 668 669 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 670 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 671 672 logln(); 673 } else { 674 errln("Fails with \"%s\"", test[i]); 675 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); 676 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); 677 678 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); 679 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); 680 681 infoln(); 682 } 683 delete iter; 684 } 685 delete col; 686 } 687 688 #if 0 689 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) 690 { 691 for(int32_t i = 0; i < string.length(); i += 1) { 692 UChar32 ch = string.char32At(i); 693 694 if (ch >= 0x0020 && ch <= 0x007F) { 695 if (ch == 0x005C) { 696 buffer.append("\\\\"); 697 } else { 698 buffer.append(ch); 699 } 700 } else { 701 char cbuffer[12]; 702 703 if (ch <= 0xFFFFL) { 704 sprintf(cbuffer, "\\u%4.4X", ch); 705 } else { 706 sprintf(cbuffer, "\\U%8.8X", ch); 707 } 708 709 buffer.append(cbuffer); 710 } 711 712 if (ch >= 0x10000L) { 713 i += 1; 714 } 715 } 716 717 return buffer; 718 } 719 #endif 720 721 void SSearchTest::sharpSTest() 722 { 723 UErrorCode status = U_ZERO_ERROR; 724 UCollator *coll = NULL; 725 UnicodeString lp = "fuss"; 726 UnicodeString sp = "fu\\u00DF"; 727 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", 728 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", 729 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; 730 int32_t start = -1, end = -1; 731 732 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); 733 TEST_ASSERT_SUCCESS(status); 734 735 UnicodeString lpUnescaped = lp.unescape(); 736 UnicodeString spUnescaped = sp.unescape(); 737 738 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(), 739 lpUnescaped.getBuffer(), lpUnescaped.length(), // actual test data will be set later 740 coll, 741 NULL, // the break iterator 742 &status)); 743 744 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(), 745 spUnescaped.getBuffer(), spUnescaped.length(), // actual test data will be set later 746 coll, 747 NULL, // the break iterator 748 &status)); 749 TEST_ASSERT_SUCCESS(status); 750 751 for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) { 752 UBool bFound; 753 UnicodeString target = targets[t].unescape(); 754 755 start = end = -1; 756 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status); 757 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status); 758 TEST_ASSERT_SUCCESS(status); 759 if (bFound) { 760 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 761 } else { 762 dataerrln("Test %d: did not find long pattern.", t); 763 } 764 765 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status); 766 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status); 767 TEST_ASSERT_SUCCESS(status); 768 if (bFound) { 769 logln("Test %d: found long pattern at [%d, %d].", t, start, end); 770 } else { 771 dataerrln("Test %d: did not find long pattern.", t); 772 } 773 } 774 775 ucol_close(coll); 776 } 777 778 void SSearchTest::goodSuffixTest() 779 { 780 UErrorCode status = U_ZERO_ERROR; 781 UCollator *coll = NULL; 782 UnicodeString pat = /*"gcagagag"*/ "fxeld"; 783 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; 784 int32_t start = -1, end = -1; 785 UBool bFound; 786 787 coll = ucol_open(NULL, &status); 788 TEST_ASSERT_SUCCESS(status); 789 790 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(), 791 target.getBuffer(), target.length(), 792 coll, 793 NULL, // the break iterator 794 &status)); 795 TEST_ASSERT_SUCCESS(status); 796 797 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status); 798 TEST_ASSERT_SUCCESS(status); 799 if (bFound) { 800 logln("Found pattern at [%d, %d].", start, end); 801 } else { 802 dataerrln("Did not find pattern."); 803 } 804 805 ucol_close(coll); 806 } 807 808 // 809 // searchTime() A quick and dirty performance test for string search. 810 // Probably doesn't really belong as part of intltest, but it 811 // does check that the search succeeds, and gets the right result, 812 // so it serves as a functionality test also. 813 // 814 // To run as a perf test, up the loop count, select by commenting 815 // and uncommenting in the code the operation to be measured, 816 // rebuild, and measure the running time of this test alone. 817 // 818 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime 819 // 820 void SSearchTest::searchTime() { 821 static const char *longishText = 822 "Whylom, as olde stories tellen us,\n" 823 "Ther was a duk that highte Theseus:\n" 824 "Of Athenes he was lord and governour,\n" 825 "And in his tyme swich a conquerour,\n" 826 "That gretter was ther noon under the sonne.\n" 827 "Ful many a riche contree hadde he wonne;\n" 828 "What with his wisdom and his chivalrye,\n" 829 "He conquered al the regne of Femenye,\n" 830 "That whylom was y-cleped Scithia;\n" 831 "And weddede the quene Ipolita,\n" 832 "And broghte hir hoom with him in his contree\n" 833 "With muchel glorie and greet solempnitee,\n" 834 "And eek hir yonge suster Emelye.\n" 835 "And thus with victorie and with melodye\n" 836 "Lete I this noble duk to Athenes ryde,\n" 837 "And al his hoost, in armes, him bisyde.\n" 838 "And certes, if it nere to long to here,\n" 839 "I wolde han told yow fully the manere,\n" 840 "How wonnen was the regne of Femenye\n" 841 "By Theseus, and by his chivalrye;\n" 842 "And of the grete bataille for the nones\n" 843 "Bitwixen Athen's and Amazones;\n" 844 "And how asseged was Ipolita,\n" 845 "The faire hardy quene of Scithia;\n" 846 "And of the feste that was at hir weddinge,\n" 847 "And of the tempest at hir hoom-cominge;\n" 848 "But al that thing I moot as now forbere.\n" 849 "I have, God woot, a large feeld to ere,\n" 850 "And wayke been the oxen in my plough.\n" 851 "The remenant of the tale is long y-nough.\n" 852 "I wol nat letten eek noon of this route;\n" 853 "Lat every felawe telle his tale aboute,\n" 854 "And lat see now who shal the soper winne;\n" 855 "And ther I lefte, I wol ageyn biginne.\n" 856 "This duk, of whom I make mencioun,\n" 857 "When he was come almost unto the toun,\n" 858 "In al his wele and in his moste pryde,\n" 859 "He was war, as he caste his eye asyde,\n" 860 "Wher that ther kneled in the hye weye\n" 861 "A companye of ladies, tweye and tweye,\n" 862 "Ech after other, clad in clothes blake; \n" 863 "But swich a cry and swich a wo they make,\n" 864 "That in this world nis creature livinge,\n" 865 "That herde swich another weymentinge;\n" 866 "And of this cry they nolde never stenten,\n" 867 "Til they the reynes of his brydel henten.\n" 868 "'What folk ben ye, that at myn hoomcominge\n" 869 "Perturben so my feste with cryinge'?\n" 870 "Quod Theseus, 'have ye so greet envye\n" 871 "Of myn honour, that thus compleyne and crye? \n" 872 "Or who hath yow misboden, or offended?\n" 873 "And telleth me if it may been amended;\n" 874 "And why that ye ben clothed thus in blak'?\n" 875 "The eldest lady of hem alle spak,\n" 876 "When she hadde swowned with a deedly chere,\n" 877 "That it was routhe for to seen and here,\n" 878 "And seyde: 'Lord, to whom Fortune hath yiven\n" 879 "Victorie, and as a conquerour to liven,\n" 880 "Noght greveth us your glorie and your honour;\n" 881 "But we biseken mercy and socour.\n" 882 "Have mercy on our wo and our distresse.\n" 883 "Som drope of pitee, thurgh thy gentilesse,\n" 884 "Up-on us wrecched wommen lat thou falle.\n" 885 "For certes, lord, ther nis noon of us alle,\n" 886 "That she nath been a duchesse or a quene;\n" 887 "Now be we caitifs, as it is wel sene:\n" 888 "Thanked be Fortune, and hir false wheel,\n" 889 "That noon estat assureth to be weel.\n" 890 "And certes, lord, t'abyden your presence,\n" 891 "Here in the temple of the goddesse Clemence\n" 892 "We han ben waytinge al this fourtenight;\n" 893 "Now help us, lord, sith it is in thy might.\n" 894 "I wrecche, which that wepe and waille thus,\n" 895 "Was whylom wyf to king Capaneus,\n" 896 "That starf at Thebes, cursed be that day!\n" 897 "And alle we, that been in this array,\n" 898 "And maken al this lamentacioun,\n" 899 "We losten alle our housbondes at that toun,\n" 900 "Whyl that the sege ther-aboute lay.\n" 901 "And yet now th'olde Creon, weylaway!\n" 902 "The lord is now of Thebes the citee, \n" 903 "Fulfild of ire and of iniquitee,\n" 904 "He, for despyt, and for his tirannye,\n" 905 "To do the dede bodyes vileinye,\n" 906 "Of alle our lordes, whiche that ben slawe,\n" 907 "Hath alle the bodyes on an heep y-drawe,\n" 908 "And wol nat suffren hem, by noon assent,\n" 909 "Neither to been y-buried nor y-brent,\n" 910 "But maketh houndes ete hem in despyt. zet'\n"; 911 912 const char *cPattern = "maketh houndes ete hem"; 913 //const char *cPattern = "Whylom"; 914 //const char *cPattern = "zet"; 915 const char *testId = "searchTime()"; // for error macros. 916 UnicodeString target = longishText; 917 UErrorCode status = U_ZERO_ERROR; 918 919 920 LocalUCollatorPointer collator(ucol_open("en", &status)); 921 //ucol_setStrength(collator.getAlias(), collatorStrength); 922 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); 923 UnicodeString uPattern = cPattern; 924 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), 925 target.getBuffer(), target.length(), 926 collator.getAlias(), 927 NULL, // the break iterator 928 &status)); 929 TEST_ASSERT_SUCCESS(status); 930 931 // int32_t foundStart; 932 // int32_t foundEnd; 933 UBool found; 934 935 // Find the match position usgin strstr 936 const char *pm = strstr(longishText, cPattern); 937 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); 938 int32_t refMatchPos = (int32_t)(pm - longishText); 939 int32_t icuMatchPos; 940 int32_t icuMatchEnd; 941 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); 942 TEST_ASSERT_SUCCESS(status); 943 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); 944 945 int32_t i; 946 // int32_t j=0; 947 948 // Try loopcounts around 100000 to some millions, depending on the operation, 949 // to get runtimes of at least several seconds. 950 for (i=0; i<10000; i++) { 951 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); 952 (void)found; // Suppress set but not used warning. 953 //TEST_ASSERT_SUCCESS(status); 954 //TEST_ASSERT(found); 955 956 // usearch_setOffset(uss.getAlias(), 0, &status); 957 // icuMatchPos = usearch_next(uss.getAlias(), &status); 958 959 // The i+j stuff is to confuse the optimizer and get it to actually leave the 960 // call to strstr in place. 961 //pm = strstr(longishText+j, cPattern); 962 //j = (j + i)%5; 963 } 964 965 //printf("%ld, %d\n", pm-longishText, j); 966 } 967 968 //---------------------------------------------------------------------------------------- 969 // 970 // Random Numbers. Similar to standard lib rand() and srand() 971 // Not using library to 972 // 1. Get same results on all platforms. 973 // 2. Get access to current seed, to more easily reproduce failures. 974 // 975 //--------------------------------------------------------------------------------------- 976 static uint32_t m_seed = 1; 977 978 static uint32_t m_rand() 979 { 980 m_seed = m_seed * 1103515245 + 12345; 981 return (uint32_t)(m_seed/65536) % 32768; 982 } 983 984 class Monkey 985 { 986 public: 987 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; 988 989 protected: 990 Monkey(); 991 virtual ~Monkey(); 992 }; 993 994 Monkey::Monkey() 995 { 996 // ook? 997 } 998 999 Monkey::~Monkey() 1000 { 1001 // ook? 1002 } 1003 1004 class SetMonkey : public Monkey 1005 { 1006 public: 1007 SetMonkey(const USet *theSet); 1008 ~SetMonkey(); 1009 1010 virtual void append(UnicodeString &test, UnicodeString &alternate); 1011 1012 private: 1013 const USet *set; 1014 }; 1015 1016 SetMonkey::SetMonkey(const USet *theSet) 1017 : Monkey(), set(theSet) 1018 { 1019 // ook? 1020 } 1021 1022 SetMonkey::~SetMonkey() 1023 { 1024 //ook... 1025 } 1026 1027 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) 1028 { 1029 int32_t size = uset_size(set); 1030 int32_t index = m_rand() % size; 1031 UChar32 ch = uset_charAt(set, index); 1032 UnicodeString str(ch); 1033 1034 test.append(str); 1035 alternate.append(str); // flip case, or some junk? 1036 } 1037 1038 class StringSetMonkey : public Monkey 1039 { 1040 public: 1041 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); 1042 ~StringSetMonkey(); 1043 1044 void append(UnicodeString &testCase, UnicodeString &alternate); 1045 1046 private: 1047 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); 1048 1049 const USet *set; 1050 UCollator *coll; 1051 CollData *collData; 1052 }; 1053 1054 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) 1055 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) 1056 { 1057 // ook. 1058 } 1059 1060 StringSetMonkey::~StringSetMonkey() 1061 { 1062 // ook? 1063 } 1064 1065 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) 1066 { 1067 int32_t itemCount = uset_getItemCount(set), len = 0; 1068 int32_t index = m_rand() % itemCount; 1069 UChar32 rangeStart = 0, rangeEnd = 0; 1070 UChar buffer[16]; 1071 UErrorCode err = U_ZERO_ERROR; 1072 1073 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); 1074 1075 if (len == 0) { 1076 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); 1077 UChar32 ch = rangeStart + offset; 1078 UnicodeString str(ch); 1079 1080 testCase.append(str); 1081 generateAlternative(str, alternate); 1082 } else if (len > 0) { 1083 // should check that len < 16... 1084 UnicodeString str(buffer, len); 1085 1086 testCase.append(str); 1087 generateAlternative(str, alternate); 1088 } else { 1089 // shouldn't happen... 1090 } 1091 } 1092 1093 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) 1094 { 1095 // find out shortest string for the longest sequence of ces. 1096 // needs to be refined to use dynamic programming, but will be roughly right 1097 UErrorCode status = U_ZERO_ERROR; 1098 CEList ceList(coll, testCase, status); 1099 UnicodeString alt; 1100 int32_t offset = 0; 1101 1102 if (ceList.size() == 0) { 1103 return alternate.append(testCase); 1104 } 1105 1106 while (offset < ceList.size()) { 1107 int32_t ce = ceList.get(offset); 1108 const StringList *strings = collData->getStringList(ce); 1109 1110 if (strings == NULL) { 1111 return alternate.append(testCase); 1112 } 1113 1114 int32_t stringCount = strings->size(); 1115 int32_t tries = 0; 1116 1117 // find random string that generates the same CEList 1118 const CEList *ceList2 = NULL; 1119 const UnicodeString *string = NULL; 1120 UBool matches = FALSE; 1121 1122 do { 1123 int32_t s = m_rand() % stringCount; 1124 1125 if (tries++ > stringCount) { 1126 alternate.append(testCase); 1127 return alternate; 1128 } 1129 1130 string = strings->get(s); 1131 ceList2 = collData->getCEList(string); 1132 matches = ceList.matchesAt(offset, ceList2); 1133 1134 if (! matches) { 1135 collData->freeCEList((CEList *) ceList2); 1136 } 1137 } while (! matches); 1138 1139 alt.append(*string); 1140 offset += ceList2->size(); 1141 collData->freeCEList(ceList2); 1142 } 1143 1144 const CEList altCEs(coll, alt, status); 1145 1146 if (ceList.matchesAt(0, &altCEs)) { 1147 return alternate.append(alt); 1148 } 1149 1150 return alternate.append(testCase); 1151 } 1152 1153 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) 1154 { 1155 int32_t pieces = (m_rand() % 4) + 1; 1156 UErrorCode status = U_ZERO_ERROR; 1157 UBool matches; 1158 1159 do { 1160 testCase.remove(); 1161 alternate.remove(); 1162 monkeys[0]->append(testCase, alternate); 1163 1164 for(int32_t piece = 0; piece < pieces; piece += 1) { 1165 int32_t monkey = m_rand() % monkeyCount; 1166 1167 monkeys[monkey]->append(testCase, alternate); 1168 } 1169 1170 const CEList ceTest(coll, testCase, status); 1171 const CEList ceAlt(coll, alternate, status); 1172 1173 matches = ceTest.matchesAt(0, &ceAlt); 1174 } while (! matches); 1175 } 1176 1177 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) 1178 { 1179 UErrorCode status = U_ZERO_ERROR; 1180 OrderList targetOrders(coll, target, offset); 1181 OrderList patternOrders(coll, pattern); 1182 int32_t targetSize = targetOrders.size() - 1; 1183 int32_t patternSize = patternOrders.size() - 1; 1184 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), 1185 target.getBuffer(), target.length(), &status); 1186 1187 if (patternSize == 0) { 1188 // Searching for an empty pattern always fails 1189 matchStart = matchEnd = -1; 1190 ubrk_close(charBreakIterator); 1191 return FALSE; 1192 } 1193 1194 matchStart = matchEnd = -1; 1195 1196 for(int32_t i = 0; i < targetSize; i += 1) { 1197 if (targetOrders.matchesAt(i, patternOrders)) { 1198 int32_t start = targetOrders.getLowOffset(i); 1199 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); 1200 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); 1201 1202 // if the low and high offsets of the first CE in 1203 // the match are the same, it means that the match 1204 // starts in the middle of an expansion - all but 1205 // the first CE of the expansion will have the offset 1206 // of the following character. 1207 if (start == targetOrders.getHighOffset(i)) { 1208 continue; 1209 } 1210 1211 // Make sure match starts on a grapheme boundary 1212 if (! ubrk_isBoundary(charBreakIterator, start)) { 1213 continue; 1214 } 1215 1216 // If the low and high offsets of the CE after the match 1217 // are the same, it means that the match ends in the middle 1218 // of an expansion sequence. 1219 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && 1220 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { 1221 continue; 1222 } 1223 1224 int32_t mend = maxLimit; 1225 1226 // Find the first grapheme break after the character index 1227 // of the last CE in the match. If it's after character index 1228 // that's after the last CE in the match, use that index 1229 // as the end of the match. 1230 if (minLimit < maxLimit) { 1231 // When the last CE's low index is same with its high index, the CE is likely 1232 // a part of expansion. In this case, the index is located just after the 1233 // character corresponding to the CEs compared above. If the index is right 1234 // at the break boundary, move the position to the next boundary will result 1235 // incorrect match length when there are ignorable characters exist between 1236 // the position and the next character produces CE(s). See ticket#8482. 1237 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) { 1238 mend = minLimit; 1239 } else { 1240 int32_t nba = ubrk_following(charBreakIterator, minLimit); 1241 1242 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { 1243 mend = nba; 1244 } 1245 } 1246 } 1247 1248 if (mend > maxLimit) { 1249 continue; 1250 } 1251 1252 if (! ubrk_isBoundary(charBreakIterator, mend)) { 1253 continue; 1254 } 1255 1256 matchStart = start; 1257 matchEnd = mend; 1258 1259 ubrk_close(charBreakIterator); 1260 return TRUE; 1261 } 1262 } 1263 1264 ubrk_close(charBreakIterator); 1265 return FALSE; 1266 } 1267 1268 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1269 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 1270 int32_t val = defaultVal; 1271 1272 name.append(" *= *(-?\\d+)"); 1273 1274 UErrorCode status = U_ZERO_ERROR; 1275 RegexMatcher m(name, params, 0, status); 1276 1277 if (m.find()) { 1278 // The param exists. Convert the string to an int. 1279 char valString[100]; 1280 int32_t paramLength = m.end(1, status) - m.start(1, status); 1281 1282 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 1283 paramLength = (int32_t)(sizeof(valString)-2); 1284 } 1285 1286 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 1287 val = uprv_strtol(valString, NULL, 10); 1288 1289 // Delete this parameter from the params string. 1290 m.reset(); 1291 params = m.replaceFirst("", status); 1292 } 1293 1294 //U_ASSERT(U_SUCCESS(status)); 1295 if (! U_SUCCESS(status)) { 1296 val = defaultVal; 1297 } 1298 1299 return val; 1300 } 1301 #endif 1302 1303 #if !UCONFIG_NO_COLLATION 1304 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, 1305 const char *name, const char *strength, uint32_t seed) 1306 { 1307 UErrorCode status = U_ZERO_ERROR; 1308 int32_t actualStart = -1, actualEnd = -1; 1309 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); 1310 int32_t expectedStart = -1, expectedEnd = -1; 1311 int32_t notFoundCount = 0; 1312 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), 1313 testCase.getBuffer(), testCase.length(), 1314 coll, 1315 NULL, // the break iterator 1316 &status)); 1317 1318 // **** TODO: find *all* matches, not just first one **** 1319 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); 1320 1321 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); 1322 1323 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 1324 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 1325 " strength=%s seed=%d", 1326 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 1327 } 1328 1329 if (expectedStart == -1 && actualStart == -1) { 1330 notFoundCount += 1; 1331 } 1332 1333 // **** TODO: find *all* matches, not just first one **** 1334 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); 1335 1336 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status); 1337 1338 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); 1339 1340 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { 1341 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" 1342 " strength=%s seed=%d", 1343 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); 1344 } 1345 1346 if (expectedStart == -1 && actualStart == -1) { 1347 notFoundCount += 1; 1348 } 1349 1350 return notFoundCount; 1351 } 1352 #endif 1353 1354 void SSearchTest::monkeyTest(char *params) 1355 { 1356 // ook! 1357 UErrorCode status = U_ZERO_ERROR; 1358 //UCollator *coll = ucol_open(NULL, &status); 1359 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); 1360 1361 if (U_FAILURE(status)) { 1362 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); 1363 return; 1364 } 1365 1366 CollData *monkeyData = new CollData(coll, status); 1367 1368 USet *expansions = uset_openEmpty(); 1369 USet *contractions = uset_openEmpty(); 1370 1371 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 1372 1373 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 1374 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); 1375 USet *letters = uset_openPattern(letter_pattern, 39, &status); 1376 SetMonkey letterMonkey(letters); 1377 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); 1378 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); 1379 UnicodeString testCase; 1380 UnicodeString alternate; 1381 UnicodeString pattern, altPattern; 1382 UnicodeString prefix, altPrefix; 1383 UnicodeString suffix, altSuffix; 1384 1385 Monkey *monkeys[] = { 1386 &letterMonkey, 1387 &contractionMonkey, 1388 &expansionMonkey, 1389 &contractionMonkey, 1390 &expansionMonkey, 1391 &contractionMonkey, 1392 &expansionMonkey, 1393 &contractionMonkey, 1394 &expansionMonkey}; 1395 int32_t monkeyCount = UPRV_LENGTHOF(monkeys); 1396 // int32_t nonMatchCount = 0; 1397 1398 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; 1399 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; 1400 int32_t strengthCount = UPRV_LENGTHOF(strengths); 1401 int32_t loopCount = quick? 1000 : 10000; 1402 int32_t firstStrength = 0; 1403 int32_t lastStrength = strengthCount - 1; //*/ 0; 1404 1405 if (params != NULL) { 1406 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1407 UnicodeString p(params); 1408 1409 loopCount = getIntParam("loop", p, loopCount); 1410 m_seed = getIntParam("seed", p, m_seed); 1411 1412 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); 1413 if (m.find()) { 1414 UnicodeString breakType = m.group(1, status); 1415 1416 for (int32_t s = 0; s < strengthCount; s += 1) { 1417 if (breakType == strengthNames[s]) { 1418 firstStrength = lastStrength = s; 1419 break; 1420 } 1421 } 1422 1423 m.reset(); 1424 p = m.replaceFirst("", status); 1425 } 1426 1427 if (RegexMatcher("\\S", p, 0, status).find()) { 1428 // Each option is stripped out of the option string as it is processed. 1429 // All options have been checked. The option string should have been completely emptied.. 1430 char buf[100]; 1431 p.extract(buf, sizeof(buf), NULL, status); 1432 buf[sizeof(buf)-1] = 0; 1433 errln("Unrecognized or extra parameter: %s\n", buf); 1434 return; 1435 } 1436 #else 1437 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); 1438 #endif 1439 } 1440 1441 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { 1442 int32_t notFoundCount = 0; 1443 1444 logln("Setting strength to %s.", strengthNames[s]); 1445 ucol_setStrength(coll, strengths[s]); 1446 1447 // TODO: try alternate prefix and suffix too? 1448 // TODO: alternates are only equal at primary strength. Is this OK? 1449 for(int32_t t = 0; t < loopCount; t += 1) { 1450 uint32_t seed = m_seed; 1451 // int32_t nmc = 0; 1452 1453 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); 1454 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); 1455 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); 1456 1457 // pattern 1458 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); 1459 1460 testCase.remove(); 1461 testCase.append(prefix); 1462 testCase.append(/*alt*/pattern); 1463 1464 // prefix + pattern 1465 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); 1466 1467 testCase.append(suffix); 1468 1469 // prefix + pattern + suffix 1470 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); 1471 1472 testCase.remove(); 1473 testCase.append(pattern); 1474 testCase.append(suffix); 1475 1476 // pattern + suffix 1477 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); 1478 } 1479 1480 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); 1481 } 1482 1483 uset_close(contractions); 1484 uset_close(expansions); 1485 uset_close(letters); 1486 delete monkeyData; 1487 1488 ucol_close(coll); 1489 } 1490 1491 #endif 1492 1493 #endif 1494