Home | History | Annotate | Download | only in intltest
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2009, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_COLLATION
     12 
     13 #include "unicode/unistr.h"
     14 #include "unicode/putil.h"
     15 #include "unicode/usearch.h"
     16 
     17 #include "cmemory.h"
     18 #include "unicode/coll.h"
     19 #include "unicode/tblcoll.h"
     20 #include "unicode/coleitr.h"
     21 #include "unicode/ucoleitr.h"
     22 
     23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     24 
     25 #include "unicode/uniset.h"
     26 #include "unicode/uset.h"
     27 #include "unicode/ustring.h"
     28 #include "hash.h"
     29 #include "uhash.h"
     30 #include "ucol_imp.h"
     31 
     32 #include "intltest.h"
     33 #include "ssearch.h"
     34 
     35 #include "unicode/colldata.h"
     36 #include "unicode/bmsearch.h"
     37 #include "unicode/bms.h"
     38 
     39 #include "xmlparser.h"
     40 #include "ucbuf.h"
     41 
     42 #include <stdlib.h>
     43 #include <string.h>
     44 #include <stdio.h>
     45 
     46 char testId[100];
     47 
     48 #define TEST_ASSERT(x) {if (!(x)) { \
     49     errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
     50 
     51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
     52     errln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
     53 
     54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
     55     dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
     56           __FILE__, __LINE__, testId, u_errorName(errcode));}}
     57 
     58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     61 
     62 //---------------------------------------------------------------------------
     63 //
     64 //  Test class boilerplate
     65 //
     66 //---------------------------------------------------------------------------
     67 SSearchTest::SSearchTest()
     68 {
     69 }
     70 
     71 SSearchTest::~SSearchTest()
     72 {
     73 }
     74 
     75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
     76 {
     77     if (exec) logln("TestSuite SSearchTest: ");
     78     switch (index) {
     79 #if !UCONFIG_NO_BREAK_ITERATION
     80        case 0: name = "searchTest";
     81             if (exec) searchTest();
     82             break;
     83 
     84         case 1: name = "offsetTest";
     85             if (exec) offsetTest();
     86             break;
     87 
     88         case 2: name = "monkeyTest";
     89             if (exec) monkeyTest(params);
     90             break;
     91 
     92         case 3: name = "bmMonkeyTest";
     93             if (exec) bmMonkeyTest(params);
     94             break;
     95 
     96         case 4: name = "boyerMooreTest";
     97             if (exec) boyerMooreTest();
     98             break;
     99 
    100         case 5: name = "goodSuffixTest";
    101             if (exec) goodSuffixTest();
    102             break;
    103 
    104         case 6: name = "searchTime";
    105             if (exec) searchTime();
    106             break;
    107 
    108         case 7: name = "bmsTest";
    109             if (exec) bmsTest();
    110             break;
    111 
    112         case 8: name = "bmSearchTest";
    113             if (exec) bmSearchTest();
    114             break;
    115 
    116         case 9: name = "udhrTest";
    117             if (exec) udhrTest();
    118             break;
    119 #endif
    120         default: name = "";
    121             break; //needed to end loop
    122     }
    123 }
    124 
    125 
    126 #if !UCONFIG_NO_BREAK_ITERATION
    127 
    128 #define PATH_BUFFER_SIZE 2048
    129 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
    130     UErrorCode status = U_ZERO_ERROR;
    131     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    132 
    133     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
    134         errln("ERROR: getPath() failed - %s", u_errorName(status));
    135         return NULL;
    136     }
    137 
    138     strcpy(buffer, testDataDirectory);
    139     strcat(buffer, filename);
    140     return buffer;
    141 }
    142 
    143 
    144 void SSearchTest::searchTest()
    145 {
    146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    147     UErrorCode status = U_ZERO_ERROR;
    148     char path[PATH_BUFFER_SIZE];
    149     const char *testFilePath = getPath(path, "ssearch.xml");
    150 
    151     if (testFilePath == NULL) {
    152         return; /* Couldn't get path: error message already output. */
    153     }
    154 
    155     UXMLParser  *parser = UXMLParser::createParser(status);
    156     TEST_ASSERT_SUCCESS(status);
    157     UXMLElement *root   = parser->parseFile(testFilePath, status);
    158     TEST_ASSERT_SUCCESS(status);
    159     if (U_FAILURE(status)) {
    160         return;
    161     }
    162 
    163     const UnicodeString *debugTestCase = root->getAttribute("debug");
    164     if (debugTestCase != NULL) {
    165 //       setenv("USEARCH_DEBUG", "1", 1);
    166     }
    167 
    168 
    169     const UXMLElement *testCase;
    170     int32_t tc = 0;
    171 
    172     while((testCase = root->nextChildElement(tc)) != NULL) {
    173 
    174         if (testCase->getTagName().compare("test-case") != 0) {
    175             errln("ssearch, unrecognized XML Element in test file");
    176             continue;
    177         }
    178         const UnicodeString *id       = testCase->getAttribute("id");
    179         *testId = 0;
    180         if (id != NULL) {
    181             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
    182         }
    183 
    184         // If debugging test case has been specified and this is not it, skip to next.
    185         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
    186             continue;
    187         }
    188         //
    189         //  Get the requested collation strength.
    190         //    Default is tertiary if the XML attribute is missing from the test case.
    191         //
    192         const UnicodeString *strength = testCase->getAttribute("strength");
    193         UColAttributeValue collatorStrength;
    194         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
    195         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
    196         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
    197         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
    198         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
    199         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
    200         else {
    201             // Bogus value supplied for strength.  Shouldn't happen, even from
    202             //  typos, if the  XML source has been validated.
    203             //  This assert is a little deceiving in that strength can be
    204             //   any of the allowed values, not just TERTIARY, but it will
    205             //   do the job of getting the error output.
    206             TEST_ASSERT(*strength=="TERTIARY")
    207         }
    208 
    209         //
    210         // Get the collator normalization flag.  Default is UCOL_OFF.
    211         //
    212         UColAttributeValue normalize = UCOL_OFF;
    213         const UnicodeString *norm = testCase->getAttribute("norm");
    214         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
    215         if (norm!=NULL && *norm=="ON") {
    216             normalize = UCOL_ON;
    217         }
    218 
    219         //
    220         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
    221         //
    222         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
    223         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
    224         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
    225         if (alt != NULL && *alt == "SHIFTED") {
    226             alternateHandling = UCOL_SHIFTED;
    227         }
    228 
    229         const UnicodeString defLocale("en");
    230         char  clocale[100];
    231         const UnicodeString *locale   = testCase->getAttribute("locale");
    232         if (locale == NULL || locale->length()==0) {
    233             locale = &defLocale;
    234         };
    235         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
    236 
    237 
    238         UnicodeString  text;
    239         UnicodeString  target;
    240         UnicodeString  pattern;
    241         int32_t        expectedMatchStart = -1;
    242         int32_t        expectedMatchLimit = -1;
    243         const UXMLElement  *n;
    244         int32_t                nodeCount = 0;
    245 
    246         n = testCase->getChildElement("pattern");
    247         TEST_ASSERT(n != NULL);
    248         if (n==NULL) {
    249             continue;
    250         }
    251         text = n->getText(FALSE);
    252         text = text.unescape();
    253         pattern.append(text);
    254         nodeCount++;
    255 
    256         n = testCase->getChildElement("pre");
    257         if (n!=NULL) {
    258             text = n->getText(FALSE);
    259             text = text.unescape();
    260             target.append(text);
    261             nodeCount++;
    262         }
    263 
    264         n = testCase->getChildElement("m");
    265         if (n!=NULL) {
    266             expectedMatchStart = target.length();
    267             text = n->getText(FALSE);
    268             text = text.unescape();
    269             target.append(text);
    270             expectedMatchLimit = target.length();
    271             nodeCount++;
    272         }
    273 
    274         n = testCase->getChildElement("post");
    275         if (n!=NULL) {
    276             text = n->getText(FALSE);
    277             text = text.unescape();
    278             target.append(text);
    279             nodeCount++;
    280         }
    281 
    282         //  Check that there weren't extra things in the XML
    283         TEST_ASSERT(nodeCount == testCase->countChildren());
    284 
    285         // Open a collator and StringSearch based on the parameters
    286         //   obtained from the XML.
    287         //
    288         status = U_ZERO_ERROR;
    289         UCollator *collator = ucol_open(clocale, &status);
    290         ucol_setStrength(collator, collatorStrength);
    291         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
    292         ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
    293         UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
    294                                          target.getBuffer(), target.length(),
    295                                          collator,
    296                                          NULL,     // the break iterator
    297                                          &status);
    298 
    299         TEST_ASSERT_SUCCESS(status);
    300         if (U_FAILURE(status)) {
    301             usearch_close(uss);
    302             ucol_close(collator);
    303             continue;
    304         }
    305 
    306         int32_t foundStart = 0;
    307         int32_t foundLimit = 0;
    308         UBool   foundMatch;
    309 
    310         //
    311         // Do the search, check the match result against the expected results.
    312         //
    313         foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
    314         TEST_ASSERT_SUCCESS(status);
    315         if (foundMatch && expectedMatchStart<0 ||
    316             foundStart != expectedMatchStart   ||
    317             foundLimit != expectedMatchLimit) {
    318                 TEST_ASSERT(FALSE);   //  ouput generic error position
    319                 infoln("Found, expected match start = %d, %d \n"
    320                        "Found, expected match limit = %d, %d",
    321                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
    322         }
    323 
    324         // In case there are other matches...
    325         // (should we only do this if the test case passed?)
    326         while (foundMatch) {
    327             expectedMatchStart = foundStart;
    328             expectedMatchLimit = foundLimit;
    329 
    330             foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
    331         }
    332 
    333         usearch_close(uss);
    334 
    335         uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
    336             target.getBuffer(), target.length(),
    337             collator,
    338             NULL,
    339             &status);
    340 
    341         //
    342         // Do the backwards search, check the match result against the expected results.
    343         //
    344         foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
    345         TEST_ASSERT_SUCCESS(status);
    346         if (foundMatch && expectedMatchStart<0 ||
    347             foundStart != expectedMatchStart   ||
    348             foundLimit != expectedMatchLimit) {
    349                 TEST_ASSERT(FALSE);   //  ouput generic error position
    350                 infoln("Found, expected backwards match start = %d, %d \n"
    351                        "Found, expected backwards match limit = %d, %d",
    352                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
    353         }
    354 
    355         usearch_close(uss);
    356         ucol_close(collator);
    357     }
    358 
    359     delete root;
    360     delete parser;
    361 #endif
    362 }
    363 
    364 struct UdhrTestCase
    365 {
    366     char *locale;
    367     char *file;
    368 };
    369 
    370 void SSearchTest::udhrTest()
    371 {
    372     UErrorCode status = U_ZERO_ERROR;
    373     char path[PATH_BUFFER_SIZE];
    374     const char *udhrPath = getPath(path, "udhr");
    375 
    376     if (udhrPath == NULL) {
    377         // couldn't get path: error message already output...
    378         return;
    379     }
    380 
    381     UdhrTestCase testCases[] = {
    382         {"en", "udhr_eng.txt"},
    383         {"de", "udhr_deu_1996.txt"},
    384         {"fr", "udhr_fra.txt"},
    385         {"ru", "udhr_rus.txt"},
    386         {"th", "udhr_tha.txt"},
    387         {"ja", "udhr_jpn.txt"},
    388         {"ko", "udhr_kor.txt"},
    389         {"zh", "udhr_cmn_hans.txt"},
    390         {"zh_Hant", "udhr_cmn_hant.txt"}
    391     };
    392 
    393     int32_t testCount = ARRAY_SIZE(testCases);
    394 
    395     for (int32_t t = 0; t < testCount; t += 1) {
    396         int32_t len = 0;
    397         char *resolvedFileName = NULL;
    398         const char *encoding = NULL;
    399         UCHARBUF *ucharBuf = NULL;
    400 
    401         ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
    402         resolvedFileName = NEW_ARRAY(char, len);
    403 
    404         if(resolvedFileName == NULL){
    405             continue;
    406         }
    407 
    408         if(status == U_BUFFER_OVERFLOW_ERROR){
    409             status = U_ZERO_ERROR;
    410         }
    411 
    412         ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
    413         ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
    414 
    415         DELETE_ARRAY(resolvedFileName);
    416 
    417         if(U_FAILURE(status)){
    418             infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
    419             continue;
    420         }
    421 
    422         int32_t targetLen = 0;
    423         const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
    424 
    425         /* The first line of the file contains the pattern */
    426         int32_t start = 0, end = 0, plen = 0;
    427 
    428         for(end = start; ; end += 1) {
    429             UChar ch = target[end];
    430 
    431             if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
    432                 break;
    433             }
    434         }
    435 
    436         plen = end - start;
    437 
    438         UChar *pattern = NEW_ARRAY(UChar, plen);
    439         for (int32_t i = 0; i < plen; i += 1) {
    440             pattern[i] =  target[start++];
    441         }
    442 
    443         int32_t offset = 0;
    444         UCollator *coll = ucol_open(testCases[t].locale, &status);
    445         UCD *ucd = NULL;
    446         BMS *bms = NULL;
    447 
    448         if (U_FAILURE(status)) {
    449             errln("Could not open collator for %s", testCases[t].locale);
    450             goto delete_collator;
    451         }
    452 
    453         ucd = ucd_open(coll, &status);
    454 
    455         if (U_FAILURE(status)) {
    456             errln("Could not open CollData object for %s", testCases[t].locale);
    457             goto delete_ucd;
    458         }
    459 
    460         bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
    461 
    462         if (U_FAILURE(status)) {
    463             errln("Could not open search object for %s", testCases[t].locale);
    464             goto delete_bms;
    465         }
    466 
    467         start = end = -1;
    468         while (bms_search(bms, offset, &start, &end)) {
    469             offset = end;
    470         }
    471 
    472         if (offset == 0) {
    473             errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
    474         }
    475 
    476 delete_bms:
    477         bms_close(bms);
    478 
    479 delete_ucd:
    480         ucd_close(ucd);
    481 
    482 delete_collator:
    483         ucol_close(coll);
    484 
    485         DELETE_ARRAY(pattern);
    486         ucbuf_close(ucharBuf);
    487     }
    488 
    489     ucd_flushCache();
    490 }
    491 
    492 void SSearchTest::bmSearchTest()
    493 {
    494 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    495     UErrorCode status = U_ZERO_ERROR;
    496     char path[PATH_BUFFER_SIZE];
    497     const char *testFilePath = getPath(path, "ssearch.xml");
    498 
    499     if (testFilePath == NULL) {
    500         return; /* Couldn't get path: error message already output. */
    501     }
    502 
    503     UXMLParser  *parser = UXMLParser::createParser(status);
    504     TEST_ASSERT_SUCCESS(status);
    505     UXMLElement *root   = parser->parseFile(testFilePath, status);
    506     TEST_ASSERT_SUCCESS(status);
    507     if (U_FAILURE(status)) {
    508         return;
    509     }
    510 
    511     const UnicodeString *debugTestCase = root->getAttribute("debug");
    512     if (debugTestCase != NULL) {
    513 //       setenv("USEARCH_DEBUG", "1", 1);
    514     }
    515 
    516 
    517     const UXMLElement *testCase;
    518     int32_t tc = 0;
    519 
    520     while((testCase = root->nextChildElement(tc)) != NULL) {
    521 
    522         if (testCase->getTagName().compare("test-case") != 0) {
    523             errln("ssearch, unrecognized XML Element in test file");
    524             continue;
    525         }
    526         const UnicodeString *id       = testCase->getAttribute("id");
    527         *testId = 0;
    528         if (id != NULL) {
    529             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
    530         }
    531 
    532         // If debugging test case has been specified and this is not it, skip to next.
    533         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
    534             continue;
    535         }
    536         //
    537         //  Get the requested collation strength.
    538         //    Default is tertiary if the XML attribute is missing from the test case.
    539         //
    540         const UnicodeString *strength = testCase->getAttribute("strength");
    541         UColAttributeValue collatorStrength;
    542         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
    543         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
    544         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
    545         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
    546         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
    547         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
    548         else {
    549             // Bogus value supplied for strength.  Shouldn't happen, even from
    550             //  typos, if the  XML source has been validated.
    551             //  This assert is a little deceiving in that strength can be
    552             //   any of the allowed values, not just TERTIARY, but it will
    553             //   do the job of getting the error output.
    554             TEST_ASSERT(*strength=="TERTIARY")
    555         }
    556 
    557         //
    558         // Get the collator normalization flag.  Default is UCOL_OFF.
    559         //
    560         UColAttributeValue normalize = UCOL_OFF;
    561         const UnicodeString *norm = testCase->getAttribute("norm");
    562         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
    563         if (norm!=NULL && *norm=="ON") {
    564             normalize = UCOL_ON;
    565         }
    566 
    567         //
    568         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
    569         //
    570         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
    571         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
    572         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
    573         if (alt != NULL && *alt == "SHIFTED") {
    574             alternateHandling = UCOL_SHIFTED;
    575         }
    576 
    577         const UnicodeString defLocale("en");
    578         char  clocale[100];
    579         const UnicodeString *locale   = testCase->getAttribute("locale");
    580         if (locale == NULL || locale->length()==0) {
    581             locale = &defLocale;
    582         };
    583         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
    584 
    585 
    586         UnicodeString  text;
    587         UnicodeString  target;
    588         UnicodeString  pattern;
    589         int32_t        expectedMatchStart = -1;
    590         int32_t        expectedMatchLimit = -1;
    591         const UXMLElement  *n;
    592         int32_t                nodeCount = 0;
    593 
    594         n = testCase->getChildElement("pattern");
    595         TEST_ASSERT(n != NULL);
    596         if (n==NULL) {
    597             continue;
    598         }
    599         text = n->getText(FALSE);
    600         text = text.unescape();
    601         pattern.append(text);
    602         nodeCount++;
    603 
    604         n = testCase->getChildElement("pre");
    605         if (n!=NULL) {
    606             text = n->getText(FALSE);
    607             text = text.unescape();
    608             target.append(text);
    609             nodeCount++;
    610         }
    611 
    612         n = testCase->getChildElement("m");
    613         if (n!=NULL) {
    614             expectedMatchStart = target.length();
    615             text = n->getText(FALSE);
    616             text = text.unescape();
    617             target.append(text);
    618             expectedMatchLimit = target.length();
    619             nodeCount++;
    620         }
    621 
    622         n = testCase->getChildElement("post");
    623         if (n!=NULL) {
    624             text = n->getText(FALSE);
    625             text = text.unescape();
    626             target.append(text);
    627             nodeCount++;
    628         }
    629 
    630         //  Check that there weren't extra things in the XML
    631         TEST_ASSERT(nodeCount == testCase->countChildren());
    632 
    633         // Open a collator and StringSearch based on the parameters
    634         //   obtained from the XML.
    635         //
    636         status = U_ZERO_ERROR;
    637         UCollator *collator = ucol_open(clocale, &status);
    638         ucol_setStrength(collator, collatorStrength);
    639         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
    640         ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
    641         UCD *ucd = ucd_open(collator, &status);
    642         BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
    643 
    644         TEST_ASSERT_SUCCESS(status);
    645         if (U_FAILURE(status)) {
    646             bms_close(bms);
    647             ucd_close(ucd);
    648             ucol_close(collator);
    649             continue;
    650         }
    651 
    652         int32_t foundStart = 0;
    653         int32_t foundLimit = 0;
    654         UBool   foundMatch;
    655 
    656         //
    657         // Do the search, check the match result against the expected results.
    658         //
    659         foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
    660       //TEST_ASSERT_SUCCESS(status);
    661         if (foundMatch && expectedMatchStart < 0 ||
    662             foundStart != expectedMatchStart   ||
    663             foundLimit != expectedMatchLimit) {
    664                 TEST_ASSERT(FALSE);   //  ouput generic error position
    665                 infoln("Found, expected match start = %d, %d \n"
    666                        "Found, expected match limit = %d, %d",
    667                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
    668         }
    669 
    670         bms_close(bms);
    671         ucd_close(ucd);
    672         ucol_close(collator);
    673     }
    674 
    675     ucd_flushCache();
    676     delete root;
    677     delete parser;
    678 #endif
    679 }
    680 
    681 struct Order
    682 {
    683     int32_t order;
    684     int32_t lowOffset;
    685     int32_t highOffset;
    686 };
    687 
    688 class OrderList
    689 {
    690 public:
    691     OrderList();
    692     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
    693     ~OrderList();
    694 
    695     int32_t size(void) const;
    696     void add(int32_t order, int32_t low, int32_t high);
    697     const Order *get(int32_t index) const;
    698     int32_t getLowOffset(int32_t index) const;
    699     int32_t getHighOffset(int32_t index) const;
    700     int32_t getOrder(int32_t index) const;
    701     void reverse(void);
    702     UBool compare(const OrderList &other) const;
    703     UBool matchesAt(int32_t offset, const OrderList &other) const;
    704 
    705 private:
    706     Order *list;
    707     int32_t listMax;
    708     int32_t listSize;
    709 };
    710 
    711 OrderList::OrderList()
    712   : list(NULL), listSize(0), listMax(16)
    713 {
    714     list = new Order[listMax];
    715 }
    716 
    717 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
    718     : list(NULL), listMax(16), listSize(0)
    719 {
    720     UErrorCode status = U_ZERO_ERROR;
    721     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    722     uint32_t strengthMask = 0;
    723     int32_t order, low, high;
    724 
    725     switch (ucol_getStrength(coll))
    726     {
    727     default:
    728         strengthMask |= UCOL_TERTIARYORDERMASK;
    729         /* fall through */
    730 
    731     case UCOL_SECONDARY:
    732         strengthMask |= UCOL_SECONDARYORDERMASK;
    733         /* fall through */
    734 
    735     case UCOL_PRIMARY:
    736         strengthMask |= UCOL_PRIMARYORDERMASK;
    737     }
    738 
    739     list = new Order[listMax];
    740 
    741     ucol_setOffset(elems, stringOffset, &status);
    742 
    743     do {
    744         low   = ucol_getOffset(elems);
    745         order = ucol_next(elems, &status);
    746         high  = ucol_getOffset(elems);
    747 
    748         if (order != UCOL_NULLORDER) {
    749             order &= strengthMask;
    750         }
    751 
    752         if (order != UCOL_IGNORABLE) {
    753             add(order, low, high);
    754         }
    755     } while (order != UCOL_NULLORDER);
    756 
    757     ucol_closeElements(elems);
    758 }
    759 
    760 OrderList::~OrderList()
    761 {
    762     delete[] list;
    763 }
    764 
    765 void OrderList::add(int32_t order, int32_t low, int32_t high)
    766 {
    767     if (listSize >= listMax) {
    768         listMax *= 2;
    769 
    770         Order *newList = new Order[listMax];
    771 
    772         uprv_memcpy(newList, list, listSize * sizeof(Order));
    773         delete[] list;
    774         list = newList;
    775     }
    776 
    777     list[listSize].order      = order;
    778     list[listSize].lowOffset  = low;
    779     list[listSize].highOffset = high;
    780 
    781     listSize += 1;
    782 }
    783 
    784 const Order *OrderList::get(int32_t index) const
    785 {
    786     if (index >= listSize) {
    787         return NULL;
    788     }
    789 
    790     return &list[index];
    791 }
    792 
    793 int32_t OrderList::getLowOffset(int32_t index) const
    794 {
    795     const Order *order = get(index);
    796 
    797     if (order != NULL) {
    798         return order->lowOffset;
    799     }
    800 
    801     return -1;
    802 }
    803 
    804 int32_t OrderList::getHighOffset(int32_t index) const
    805 {
    806     const Order *order = get(index);
    807 
    808     if (order != NULL) {
    809         return order->highOffset;
    810     }
    811 
    812     return -1;
    813 }
    814 
    815 int32_t OrderList::getOrder(int32_t index) const
    816 {
    817     const Order *order = get(index);
    818 
    819     if (order != NULL) {
    820         return order->order;
    821     }
    822 
    823     return UCOL_NULLORDER;
    824 }
    825 
    826 int32_t OrderList::size() const
    827 {
    828     return listSize;
    829 }
    830 
    831 void OrderList::reverse()
    832 {
    833     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
    834         Order swap = list[b];
    835 
    836         list[b] = list[f];
    837         list[f] = swap;
    838     }
    839 }
    840 
    841 UBool OrderList::compare(const OrderList &other) const
    842 {
    843     if (listSize != other.listSize) {
    844         return FALSE;
    845     }
    846 
    847     for(int32_t i = 0; i < listSize; i += 1) {
    848         if (list[i].order  != other.list[i].order ||
    849             list[i].lowOffset != other.list[i].lowOffset ||
    850             list[i].highOffset != other.list[i].highOffset) {
    851                 return FALSE;
    852         }
    853     }
    854 
    855     return TRUE;
    856 }
    857 
    858 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
    859 {
    860     // NOTE: sizes include the NULLORDER, which we don't want to compare.
    861     int32_t otherSize = other.size() - 1;
    862 
    863     if (listSize - 1 - offset < otherSize) {
    864         return FALSE;
    865     }
    866 
    867     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
    868         if (getOrder(i) != other.getOrder(j)) {
    869             return FALSE;
    870         }
    871     }
    872 
    873     return TRUE;
    874 }
    875 
    876 static char *printOffsets(char *buffer, OrderList &list)
    877 {
    878     int32_t size = list.size();
    879     char *s = buffer;
    880 
    881     for(int32_t i = 0; i < size; i += 1) {
    882         const Order *order = list.get(i);
    883 
    884         if (i != 0) {
    885             s += sprintf(s, ", ");
    886         }
    887 
    888         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
    889     }
    890 
    891     return buffer;
    892 }
    893 
    894 static char *printOrders(char *buffer, OrderList &list)
    895 {
    896     int32_t size = list.size();
    897     char *s = buffer;
    898 
    899     for(int32_t i = 0; i < size; i += 1) {
    900         const Order *order = list.get(i);
    901 
    902         if (i != 0) {
    903             s += sprintf(s, ", ");
    904         }
    905 
    906         s += sprintf(s, "%8.8X", order->order);
    907     }
    908 
    909     return buffer;
    910 }
    911 
    912 void SSearchTest::offsetTest()
    913 {
    914     const char *test[] = {
    915         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
    916         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
    917         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
    918 
    919         "\\ua191\\u16ef\\u2036\\u017a",
    920 
    921 #if 0
    922         // This results in a complex interaction between contraction,
    923         // expansion and normalization that confuses the backwards offset fixups.
    924         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
    925 #endif
    926 
    927         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
    928         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
    929 
    930         "\\u02FE\\u02FF"
    931         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
    932         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
    933         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
    934         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
    935         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
    936 
    937         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
    938         "abc\\u0E41\\u0301\\u0316",
    939 		"abc\\u0E41\\u0316\\u0301",
    940 		"\\u0E41\\u0301\\u0316",
    941 		"\\u0E41\\u0316\\u0301",
    942 		"a\\u0301\\u0316",
    943 		"a\\u0316\\u0301",
    944 		"\\uAC52\\uAC53",
    945 		"\\u34CA\\u34CB",
    946 		"\\u11ED\\u11EE",
    947 		"\\u30C3\\u30D0",
    948 		"p\\u00E9ch\\u00E9",
    949         "a\\u0301\\u0325",
    950         "a\\u0300\\u0325",
    951         "a\\u0325\\u0300",
    952         "A\\u0323\\u0300B",
    953         "A\\u0300\\u0323B",
    954         "A\\u0301\\u0323B",
    955         "A\\u0302\\u0301\\u0323B",
    956         "abc",
    957         "ab\\u0300c",
    958         "ab\\u0300\\u0323c",
    959         " \\uD800\\uDC00\\uDC00",
    960         "a\\uD800\\uDC00\\uDC00",
    961         "A\\u0301\\u0301",
    962         "A\\u0301\\u0323",
    963         "A\\u0301\\u0323B",
    964         "B\\u0301\\u0323C",
    965         "A\\u0300\\u0323B",
    966         "\\u0301A\\u0301\\u0301",
    967         "abcd\\r\\u0301",
    968         "p\\u00EAche",
    969         "pe\\u0302che",
    970     };
    971 
    972     int32_t testCount = ARRAY_SIZE(test);
    973     UErrorCode status = U_ZERO_ERROR;
    974     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
    975     if (U_FAILURE(status)) {
    976         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
    977         return;
    978     }
    979     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
    980                         // We could allocate one that's the right size by (CE_count * 10) + 2
    981                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
    982 
    983     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
    984 
    985     for(int32_t i = 0; i < testCount; i += 1) {
    986         UnicodeString ts = CharsToUnicodeString(test[i]);
    987         CollationElementIterator *iter = col->createCollationElementIterator(ts);
    988         OrderList forwardList;
    989         OrderList backwardList;
    990         int32_t order, low, high;
    991 
    992         do {
    993             low   = iter->getOffset();
    994             order = iter->next(status);
    995             high  = iter->getOffset();
    996 
    997             forwardList.add(order, low, high);
    998         } while (order != CollationElementIterator::NULLORDER);
    999 
   1000         iter->reset();
   1001         iter->setOffset(ts.length(), status);
   1002 
   1003         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
   1004 
   1005         do {
   1006             high  = iter->getOffset();
   1007             order = iter->previous(status);
   1008             low   = iter->getOffset();
   1009 
   1010             if (order == CollationElementIterator::NULLORDER) {
   1011                 break;
   1012             }
   1013 
   1014             backwardList.add(order, low, high);
   1015         } while (TRUE);
   1016 
   1017         backwardList.reverse();
   1018 
   1019         if (forwardList.compare(backwardList)) {
   1020             logln("Works with \"%s\"", test[i]);
   1021             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
   1022 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
   1023 
   1024             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
   1025 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
   1026 
   1027             logln();
   1028         } else {
   1029             errln("Fails with \"%s\"", test[i]);
   1030             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
   1031             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
   1032 
   1033             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
   1034             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
   1035 
   1036             infoln();
   1037         }
   1038         delete iter;
   1039     }
   1040     delete col;
   1041 }
   1042 
   1043 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
   1044 {
   1045     for(int32_t i = 0; i < string.length(); i += 1) {
   1046         UChar32 ch = string.char32At(i);
   1047 
   1048         if (ch >= 0x0020 && ch <= 0x007F) {
   1049             if (ch == 0x005C) {
   1050                 buffer.append("\\\\");
   1051             } else {
   1052                 buffer.append(ch);
   1053             }
   1054         } else {
   1055             char cbuffer[12];
   1056 
   1057             if (ch <= 0xFFFFL) {
   1058                 sprintf(cbuffer, "\\u%4.4X", ch);
   1059             } else {
   1060                 sprintf(cbuffer, "\\U%8.8X", ch);
   1061             }
   1062 
   1063             buffer.append(cbuffer);
   1064         }
   1065 
   1066         if (ch >= 0x10000L) {
   1067             i += 1;
   1068         }
   1069     }
   1070 
   1071     return buffer;
   1072 }
   1073 
   1074 #if 1
   1075 
   1076 struct PCE
   1077 {
   1078     uint64_t ce;
   1079     int32_t  lowOffset;
   1080     int32_t  highOffset;
   1081 };
   1082 
   1083 class PCEList
   1084 {
   1085 public:
   1086     PCEList(UCollator *coll, const UnicodeString &string);
   1087     ~PCEList();
   1088 
   1089     int32_t size() const;
   1090 
   1091     const PCE *get(int32_t index) const;
   1092 
   1093     int32_t getLowOffset(int32_t index) const;
   1094     int32_t getHighOffset(int32_t index) const;
   1095     uint64_t getOrder(int32_t index) const;
   1096 
   1097     UBool matchesAt(int32_t offset, const PCEList &other) const;
   1098 
   1099     uint64_t operator[](int32_t index) const;
   1100 
   1101 private:
   1102     void add(uint64_t ce, int32_t low, int32_t high);
   1103 
   1104     PCE *list;
   1105     int32_t listMax;
   1106     int32_t listSize;
   1107 };
   1108 
   1109 PCEList::PCEList(UCollator *coll, const UnicodeString &string)
   1110 {
   1111     UErrorCode status = U_ZERO_ERROR;
   1112     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
   1113     uint64_t order;
   1114     int32_t low, high;
   1115 
   1116     list = new PCE[listMax];
   1117 
   1118     ucol_setOffset(elems, 0, &status);
   1119 
   1120     do {
   1121         order = ucol_nextProcessed(elems, &low, &high, &status);
   1122         add(order, low, high);
   1123     } while (order != UCOL_PROCESSED_NULLORDER);
   1124 
   1125     ucol_closeElements(elems);
   1126 }
   1127 
   1128 PCEList::~PCEList()
   1129 {
   1130     delete[] list;
   1131 }
   1132 
   1133 void PCEList::add(uint64_t order, int32_t low, int32_t high)
   1134 {
   1135     if (listSize >= listMax) {
   1136         listMax *= 2;
   1137 
   1138         PCE *newList = new PCE[listMax];
   1139 
   1140         uprv_memcpy(newList, list, listSize * sizeof(Order));
   1141         delete[] list;
   1142         list = newList;
   1143     }
   1144 
   1145     list[listSize].ce         = order;
   1146     list[listSize].lowOffset  = low;
   1147     list[listSize].highOffset = high;
   1148 
   1149     listSize += 1;
   1150 }
   1151 
   1152 const PCE *PCEList::get(int32_t index) const
   1153 {
   1154     if (index >= listSize) {
   1155         return NULL;
   1156     }
   1157 
   1158     return &list[index];
   1159 }
   1160 
   1161 int32_t PCEList::getLowOffset(int32_t index) const
   1162 {
   1163     const PCE *pce = get(index);
   1164 
   1165     if (pce != NULL) {
   1166         return pce->lowOffset;
   1167     }
   1168 
   1169     return -1;
   1170 }
   1171 
   1172 int32_t PCEList::getHighOffset(int32_t index) const
   1173 {
   1174     const PCE *pce = get(index);
   1175 
   1176     if (pce != NULL) {
   1177         return pce->highOffset;
   1178     }
   1179 
   1180     return -1;
   1181 }
   1182 
   1183 uint64_t PCEList::getOrder(int32_t index) const
   1184 {
   1185     const PCE *pce = get(index);
   1186 
   1187     if (pce != NULL) {
   1188         return pce->ce;
   1189     }
   1190 
   1191     return UCOL_PROCESSED_NULLORDER;
   1192 }
   1193 
   1194 int32_t PCEList::size() const
   1195 {
   1196     return listSize;
   1197 }
   1198 
   1199 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
   1200 {
   1201     // NOTE: sizes include the NULLORDER, which we don't want to compare.
   1202     int32_t otherSize = other.size() - 1;
   1203 
   1204     if (listSize - 1 - offset < otherSize) {
   1205         return FALSE;
   1206     }
   1207 
   1208     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
   1209         if (getOrder(i) != other.getOrder(j)) {
   1210             return FALSE;
   1211         }
   1212     }
   1213 
   1214     return TRUE;
   1215 }
   1216 
   1217 uint64_t PCEList::operator[](int32_t index) const
   1218 {
   1219     return getOrder(index);
   1220 }
   1221 
   1222 void SSearchTest::boyerMooreTest()
   1223 {
   1224     UErrorCode status = U_ZERO_ERROR;
   1225     UCollator *coll = NULL;
   1226     CollData *data = NULL;
   1227     UnicodeString lp  = "fuss";
   1228     UnicodeString sp = "fu\\u00DF";
   1229     BoyerMooreSearch *longPattern = NULL;
   1230     BoyerMooreSearch *shortPattern = NULL;
   1231     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
   1232                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
   1233                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
   1234     int32_t start = -1, end = -1;
   1235 
   1236     coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
   1237     if (U_FAILURE(status)) {
   1238         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
   1239         return;
   1240     }
   1241 
   1242     data = CollData::open(coll, status);
   1243     if (U_FAILURE(status)) {
   1244         errln("Could not open CollData object.");
   1245         goto close_data;
   1246     }
   1247 
   1248 
   1249     longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
   1250     shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
   1251     if (U_FAILURE(status)) {
   1252         errln("Could not create pattern objects.");
   1253         goto close_patterns;
   1254     }
   1255 
   1256     for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
   1257         UnicodeString target = targets[t].unescape();
   1258 
   1259         longPattern->setTargetString(&target, status);
   1260         if (longPattern->search(0, start, end)) {
   1261             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
   1262         } else {
   1263             errln("Test %d: did not find long pattern.", t);
   1264         }
   1265 
   1266         shortPattern->setTargetString(&target, status);
   1267         if (shortPattern->search(0, start, end)) {
   1268             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
   1269         } else {
   1270             errln("Test %d: did not find short pattern.", t);
   1271         }
   1272     }
   1273 
   1274 close_patterns:
   1275     delete shortPattern;
   1276     delete longPattern;
   1277 
   1278 close_data:
   1279     CollData::close(data);
   1280     ucol_close(coll);
   1281 }
   1282 
   1283 void SSearchTest::bmsTest()
   1284 {
   1285     UErrorCode status = U_ZERO_ERROR;
   1286     UCollator *coll = NULL;
   1287     UCD *data = NULL;
   1288     UnicodeString lp  = "fuss";
   1289     UnicodeString lpu = lp.unescape();
   1290     UnicodeString sp  = "fu\\u00DF";
   1291     UnicodeString spu = sp.unescape();
   1292     BMS *longPattern = NULL;
   1293     BMS *shortPattern = NULL;
   1294     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
   1295                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
   1296                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
   1297     int32_t start = -1, end = -1;
   1298 
   1299     coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
   1300     if (U_FAILURE(status)) {
   1301         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
   1302         return;
   1303     }
   1304 
   1305     data = ucd_open(coll, &status);
   1306     if (U_FAILURE(status)) {
   1307         errln("Could not open CollData object.");
   1308         goto close_data;
   1309     }
   1310 
   1311     longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
   1312     shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
   1313     if (U_FAILURE(status)) {
   1314         errln("Couldn't open pattern objects.");
   1315         goto close_patterns;
   1316     }
   1317 
   1318     for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
   1319         UnicodeString target = targets[t].unescape();
   1320 
   1321         bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
   1322         if (bms_search(longPattern, 0, &start, &end)) {
   1323             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
   1324         } else {
   1325             errln("Test %d: did not find long pattern.", t);
   1326         }
   1327 
   1328         bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
   1329         if (bms_search(shortPattern, 0, &start, &end)) {
   1330             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
   1331         } else {
   1332             errln("Test %d: did not find short pattern.", t);
   1333         }
   1334     }
   1335 
   1336 close_patterns:
   1337     bms_close(shortPattern);
   1338     bms_close(longPattern);
   1339 
   1340 close_data:
   1341     ucd_close(data);
   1342     ucol_close(coll);
   1343 }
   1344 
   1345 void SSearchTest::goodSuffixTest()
   1346 {
   1347     UErrorCode status = U_ZERO_ERROR;
   1348     UCollator *coll = NULL;
   1349     CollData *data = NULL;
   1350     UnicodeString pat = /*"gcagagag"*/ "fxeld";
   1351     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
   1352     BoyerMooreSearch *pattern = NULL;
   1353     int32_t start = -1, end = -1;
   1354 
   1355     coll = ucol_open(NULL, &status);
   1356     if (U_FAILURE(status)) {
   1357         errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
   1358         return;
   1359     }
   1360 
   1361     data = CollData::open(coll, status);
   1362     if (U_FAILURE(status)) {
   1363         errln("Couldn't open CollData object.");
   1364         goto close_data;
   1365     }
   1366 
   1367     pattern = new BoyerMooreSearch(data, pat, &target, status);
   1368     if (U_FAILURE(status)) {
   1369         errln("Couldn't open pattern object.");
   1370         goto close_pattern;
   1371     }
   1372 
   1373     if (pattern->search(0, start, end)) {
   1374         logln("Found pattern at [%d, %d].", start, end);
   1375     } else {
   1376         errln("Did not find pattern.");
   1377     }
   1378 
   1379 close_pattern:
   1380     delete pattern;
   1381 
   1382 close_data:
   1383     CollData::close(data);
   1384     ucol_close(coll);
   1385 }
   1386 
   1387 //
   1388 //  searchTime()    A quick and dirty performance test for string search.
   1389 //                  Probably  doesn't really belong as part of intltest, but it
   1390 //                  does check that the search succeeds, and gets the right result,
   1391 //                  so it serves as a functionality test also.
   1392 //
   1393 //                  To run as a perf test, up the loop count, select by commenting
   1394 //                  and uncommenting in the code the operation to be measured,
   1395 //                  rebuild, and measure the running time of this test alone.
   1396 //
   1397 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
   1398 //
   1399 void SSearchTest::searchTime() {
   1400     static const char *longishText =
   1401 "Whylom, as olde stories tellen us,\n"
   1402 "Ther was a duk that highte Theseus:\n"
   1403 "Of Athenes he was lord and governour,\n"
   1404 "And in his tyme swich a conquerour,\n"
   1405 "That gretter was ther noon under the sonne.\n"
   1406 "Ful many a riche contree hadde he wonne;\n"
   1407 "What with his wisdom and his chivalrye,\n"
   1408 "He conquered al the regne of Femenye,\n"
   1409 "That whylom was y-cleped Scithia;\n"
   1410 "And weddede the quene Ipolita,\n"
   1411 "And broghte hir hoom with him in his contree\n"
   1412 "With muchel glorie and greet solempnitee,\n"
   1413 "And eek hir yonge suster Emelye.\n"
   1414 "And thus with victorie and with melodye\n"
   1415 "Lete I this noble duk to Athenes ryde,\n"
   1416 "And al his hoost, in armes, him bisyde.\n"
   1417 "And certes, if it nere to long to here,\n"
   1418 "I wolde han told yow fully the manere,\n"
   1419 "How wonnen was the regne of Femenye\n"
   1420 "By Theseus, and by his chivalrye;\n"
   1421 "And of the grete bataille for the nones\n"
   1422 "Bitwixen Athen's and Amazones;\n"
   1423 "And how asseged was Ipolita,\n"
   1424 "The faire hardy quene of Scithia;\n"
   1425 "And of the feste that was at hir weddinge,\n"
   1426 "And of the tempest at hir hoom-cominge;\n"
   1427 "But al that thing I moot as now forbere.\n"
   1428 "I have, God woot, a large feeld to ere,\n"
   1429 "And wayke been the oxen in my plough.\n"
   1430 "The remenant of the tale is long y-nough.\n"
   1431 "I wol nat letten eek noon of this route;\n"
   1432 "Lat every felawe telle his tale aboute,\n"
   1433 "And lat see now who shal the soper winne;\n"
   1434 "And ther I lefte, I wol ageyn biginne.\n"
   1435 "This duk, of whom I make mencioun,\n"
   1436 "When he was come almost unto the toun,\n"
   1437 "In al his wele and in his moste pryde,\n"
   1438 "He was war, as he caste his eye asyde,\n"
   1439 "Wher that ther kneled in the hye weye\n"
   1440 "A companye of ladies, tweye and tweye,\n"
   1441 "Ech after other, clad in clothes blake; \n"
   1442 "But swich a cry and swich a wo they make,\n"
   1443 "That in this world nis creature livinge,\n"
   1444 "That herde swich another weymentinge;\n"
   1445 "And of this cry they nolde never stenten,\n"
   1446 "Til they the reynes of his brydel henten.\n"
   1447 "'What folk ben ye, that at myn hoomcominge\n"
   1448 "Perturben so my feste with cryinge'?\n"
   1449 "Quod Theseus, 'have ye so greet envye\n"
   1450 "Of myn honour, that thus compleyne and crye? \n"
   1451 "Or who hath yow misboden, or offended?\n"
   1452 "And telleth me if it may been amended;\n"
   1453 "And why that ye ben clothed thus in blak'?\n"
   1454 "The eldest lady of hem alle spak,\n"
   1455 "When she hadde swowned with a deedly chere,\n"
   1456 "That it was routhe for to seen and here,\n"
   1457 "And seyde: 'Lord, to whom Fortune hath yiven\n"
   1458 "Victorie, and as a conquerour to liven,\n"
   1459 "Noght greveth us your glorie and your honour;\n"
   1460 "But we biseken mercy and socour.\n"
   1461 "Have mercy on our wo and our distresse.\n"
   1462 "Som drope of pitee, thurgh thy gentilesse,\n"
   1463 "Up-on us wrecched wommen lat thou falle.\n"
   1464 "For certes, lord, ther nis noon of us alle,\n"
   1465 "That she nath been a duchesse or a quene;\n"
   1466 "Now be we caitifs, as it is wel sene:\n"
   1467 "Thanked be Fortune, and hir false wheel,\n"
   1468 "That noon estat assureth to be weel.\n"
   1469 "And certes, lord, t'abyden your presence,\n"
   1470 "Here in the temple of the goddesse Clemence\n"
   1471 "We han ben waytinge al this fourtenight;\n"
   1472 "Now help us, lord, sith it is in thy might.\n"
   1473 "I wrecche, which that wepe and waille thus,\n"
   1474 "Was whylom wyf to king Capaneus,\n"
   1475 "That starf at Thebes, cursed be that day!\n"
   1476 "And alle we, that been in this array,\n"
   1477 "And maken al this lamentacioun,\n"
   1478 "We losten alle our housbondes at that toun,\n"
   1479 "Whyl that the sege ther-aboute lay.\n"
   1480 "And yet now th'olde Creon, weylaway!\n"
   1481 "The lord is now of Thebes the citee, \n"
   1482 "Fulfild of ire and of iniquitee,\n"
   1483 "He, for despyt, and for his tirannye,\n"
   1484 "To do the dede bodyes vileinye,\n"
   1485 "Of alle our lordes, whiche that ben slawe,\n"
   1486 "Hath alle the bodyes on an heep y-drawe,\n"
   1487 "And wol nat suffren hem, by noon assent,\n"
   1488 "Neither to been y-buried nor y-brent,\n"
   1489 "But maketh houndes ete hem in despyt. zet'\n";
   1490 
   1491 #define TEST_BOYER_MOORE 1
   1492 const char *cPattern = "maketh houndes ete hem";
   1493 //const char *cPattern = "Whylom";
   1494 //const char *cPattern = "zet";
   1495     const char *testId = "searchTime()";   // for error macros.
   1496     UnicodeString target = longishText;
   1497     UErrorCode status = U_ZERO_ERROR;
   1498 
   1499 
   1500     UCollator *collator = ucol_open("en", &status);
   1501     CollData *data = CollData::open(collator, status);
   1502     if (U_FAILURE(status) || collator == NULL || data == NULL) {
   1503         errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
   1504         return;
   1505     }
   1506     //ucol_setStrength(collator, collatorStrength);
   1507     //ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
   1508     UnicodeString uPattern = cPattern;
   1509 #ifndef TEST_BOYER_MOORE
   1510     UStringSearch *uss = usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
   1511                                         target.getBuffer(), target.length(),
   1512                                         collator,
   1513                                         NULL,     // the break iterator
   1514                                         &status);
   1515     TEST_ASSERT_SUCCESS(status);
   1516 #else
   1517     BoyerMooreSearch bms(data, uPattern, &target, status);
   1518     TEST_ASSERT_SUCCESS(status);
   1519 #endif
   1520 
   1521 //  int32_t foundStart;
   1522 //  int32_t foundEnd;
   1523     UBool   found;
   1524 
   1525     // Find the match position usgin strstr
   1526     const char *pm = strstr(longishText, cPattern);
   1527     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
   1528     int32_t  refMatchPos = (int32_t)(pm - longishText);
   1529     int32_t  icuMatchPos;
   1530     int32_t  icuMatchEnd;
   1531 #ifndef TEST_BOYER_MOORE
   1532     usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
   1533     TEST_ASSERT_SUCCESS(status);
   1534 #else
   1535     found = bms.search(0, icuMatchPos, icuMatchEnd);
   1536 #endif
   1537     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
   1538 
   1539     int32_t i;
   1540     int32_t j=0;
   1541 
   1542     // Try loopcounts around 100000 to some millions, depending on the operation,
   1543     //   to get runtimes of at least several seconds.
   1544     for (i=0; i<10000; i++) {
   1545 #ifndef TEST_BOYER_MOORE
   1546         found = usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
   1547 #else
   1548         found = bms.search(0, icuMatchPos, icuMatchEnd);
   1549 #endif
   1550         //TEST_ASSERT_SUCCESS(status);
   1551         //TEST_ASSERT(found);
   1552 
   1553         // usearch_setOffset(uss, 0, &status);
   1554         // icuMatchPos = usearch_next(uss, &status);
   1555 
   1556          // The i+j stuff is to confuse the optimizer and get it to actually leave the
   1557          //   call to strstr in place.
   1558          //pm = strstr(longishText+j, cPattern);
   1559          //j = (j + i)%5;
   1560     }
   1561 
   1562     printf("%d\n", pm-longishText, j);
   1563 #ifndef TEST_BOYER_MOORE
   1564     usearch_close(uss);
   1565 #else
   1566     CollData::close(data);
   1567 #endif
   1568     ucol_close(collator);
   1569 }
   1570 #endif
   1571 
   1572 //----------------------------------------------------------------------------------------
   1573 //
   1574 //   Random Numbers.  Similar to standard lib rand() and srand()
   1575 //                    Not using library to
   1576 //                      1.  Get same results on all platforms.
   1577 //                      2.  Get access to current seed, to more easily reproduce failures.
   1578 //
   1579 //---------------------------------------------------------------------------------------
   1580 static uint32_t m_seed = 1;
   1581 
   1582 static uint32_t m_rand()
   1583 {
   1584     m_seed = m_seed * 1103515245 + 12345;
   1585     return (uint32_t)(m_seed/65536) % 32768;
   1586 }
   1587 
   1588 class Monkey
   1589 {
   1590 public:
   1591     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
   1592 
   1593 protected:
   1594     Monkey();
   1595     virtual ~Monkey();
   1596 };
   1597 
   1598 Monkey::Monkey()
   1599 {
   1600     // ook?
   1601 }
   1602 
   1603 Monkey::~Monkey()
   1604 {
   1605     // ook?
   1606 }
   1607 
   1608 class SetMonkey : public Monkey
   1609 {
   1610 public:
   1611     SetMonkey(const USet *theSet);
   1612     ~SetMonkey();
   1613 
   1614     virtual void append(UnicodeString &test, UnicodeString &alternate);
   1615 
   1616 private:
   1617     const USet *set;
   1618 };
   1619 
   1620 SetMonkey::SetMonkey(const USet *theSet)
   1621     : Monkey(), set(theSet)
   1622 {
   1623     // ook?
   1624 }
   1625 
   1626 SetMonkey::~SetMonkey()
   1627 {
   1628     //ook...
   1629 }
   1630 
   1631 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
   1632 {
   1633     int32_t size = uset_size(set);
   1634     int32_t index = m_rand() % size;
   1635     UChar32 ch = uset_charAt(set, index);
   1636     UnicodeString str(ch);
   1637 
   1638     test.append(str);
   1639     alternate.append(str); // flip case, or some junk?
   1640 }
   1641 
   1642 class StringSetMonkey : public Monkey
   1643 {
   1644 public:
   1645     StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
   1646     ~StringSetMonkey();
   1647 
   1648     void append(UnicodeString &testCase, UnicodeString &alternate);
   1649 
   1650 private:
   1651     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
   1652 
   1653     const USet *set;
   1654     UCollator  *coll;
   1655     CollData   *collData;
   1656 };
   1657 
   1658 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
   1659 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
   1660 {
   1661     // ook.
   1662 }
   1663 
   1664 StringSetMonkey::~StringSetMonkey()
   1665 {
   1666     // ook?
   1667 }
   1668 
   1669 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
   1670 {
   1671     int32_t itemCount = uset_getItemCount(set), len = 0;
   1672     int32_t index = m_rand() % itemCount;
   1673     UChar32 rangeStart = 0, rangeEnd = 0;
   1674     UChar buffer[16];
   1675     UErrorCode err = U_ZERO_ERROR;
   1676 
   1677     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
   1678 
   1679     if (len == 0) {
   1680         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
   1681         UChar32 ch = rangeStart + offset;
   1682         UnicodeString str(ch);
   1683 
   1684         testCase.append(str);
   1685         generateAlternative(str, alternate);
   1686     } else if (len > 0) {
   1687         // should check that len < 16...
   1688         UnicodeString str(buffer, len);
   1689 
   1690         testCase.append(str);
   1691         generateAlternative(str, alternate);
   1692     } else {
   1693         // shouldn't happen...
   1694     }
   1695 }
   1696 
   1697 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
   1698 {
   1699     // find out shortest string for the longest sequence of ces.
   1700     // needs to be refined to use dynamic programming, but will be roughly right
   1701     UErrorCode status = U_ZERO_ERROR;
   1702     CEList ceList(coll, testCase, status);
   1703     UnicodeString alt;
   1704     int32_t offset = 0;
   1705 
   1706     if (ceList.size() == 0) {
   1707         return alternate.append(testCase);
   1708     }
   1709 
   1710     while (offset < ceList.size()) {
   1711         int32_t ce = ceList.get(offset);
   1712         const StringList *strings = collData->getStringList(ce);
   1713 
   1714         if (strings == NULL) {
   1715             return alternate.append(testCase);
   1716         }
   1717 
   1718         int32_t stringCount = strings->size();
   1719         int32_t tries = 0;
   1720 
   1721         // find random string that generates the same CEList
   1722         const CEList *ceList2 = NULL;
   1723         const UnicodeString *string = NULL;
   1724               UBool matches = FALSE;
   1725 
   1726         do {
   1727             int32_t s = m_rand() % stringCount;
   1728 
   1729             if (tries++ > stringCount) {
   1730                 alternate.append(testCase);
   1731                 return alternate;
   1732             }
   1733 
   1734             string = strings->get(s);
   1735             ceList2 = collData->getCEList(string);
   1736             matches = ceList.matchesAt(offset, ceList2);
   1737 
   1738             if (! matches) {
   1739                 collData->freeCEList((CEList *) ceList2);
   1740             }
   1741         } while (! matches);
   1742 
   1743         alt.append(*string);
   1744         offset += ceList2->size();
   1745         collData->freeCEList(ceList2);
   1746     }
   1747 
   1748     const CEList altCEs(coll, alt, status);
   1749 
   1750     if (ceList.matchesAt(0, &altCEs)) {
   1751         return alternate.append(alt);
   1752     }
   1753 
   1754     return alternate.append(testCase);
   1755 }
   1756 
   1757 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
   1758 {
   1759     int32_t pieces = (m_rand() % 4) + 1;
   1760     UErrorCode status = U_ZERO_ERROR;
   1761     UBool matches;
   1762 
   1763     do {
   1764         testCase.remove();
   1765         alternate.remove();
   1766         monkeys[0]->append(testCase, alternate);
   1767 
   1768         for(int32_t piece = 0; piece < pieces; piece += 1) {
   1769             int32_t monkey = m_rand() % monkeyCount;
   1770 
   1771             monkeys[monkey]->append(testCase, alternate);
   1772         }
   1773 
   1774         const CEList ceTest(coll, testCase, status);
   1775         const CEList ceAlt(coll, alternate, status);
   1776 
   1777         matches = ceTest.matchesAt(0, &ceAlt);
   1778     } while (! matches);
   1779 }
   1780 
   1781 //
   1782 //  Find the next acceptable boundary following the specified starting index
   1783 //     in the target text being searched.
   1784 //      TODO:  refine what is an acceptable boundary.  For the moment,
   1785 //             choose the next position not within a combining sequence.
   1786 //
   1787 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
   1788     const UChar *text = string.getBuffer();
   1789     int32_t textLen   = string.length();
   1790 
   1791     if (startIndex >= textLen) {
   1792         return startIndex;
   1793     }
   1794 
   1795     UChar32  c;
   1796     int32_t  i = startIndex;
   1797 
   1798     U16_NEXT(text, i, textLen, c);
   1799 
   1800     // If we are on a control character, stop without looking for combining marks.
   1801     //    Control characters do not combine.
   1802     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   1803     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   1804         return i;
   1805     }
   1806 
   1807     // The initial character was not a control, and can thus accept trailing
   1808     //   combining characters.  Advance over however many of them there are.
   1809     int32_t  indexOfLastCharChecked;
   1810 
   1811     for (;;) {
   1812         indexOfLastCharChecked = i;
   1813 
   1814         if (i>=textLen) {
   1815             break;
   1816         }
   1817 
   1818         U16_NEXT(text, i, textLen, c);
   1819         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   1820 
   1821         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   1822             break;
   1823         }
   1824     }
   1825 
   1826     return indexOfLastCharChecked;
   1827 }
   1828 
   1829 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
   1830     const UChar *text = string.getBuffer();
   1831     int32_t textLen   = string.length();
   1832 
   1833     if (index>=textLen || index<=0) {
   1834         return FALSE;
   1835     }
   1836 
   1837     // If the character at the current index is not a GRAPHEME_EXTEND
   1838     //    then we can not be within a combining sequence.
   1839     UChar32  c;
   1840     U16_GET(text, 0, index, textLen, c);
   1841     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   1842     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   1843         return FALSE;
   1844     }
   1845 
   1846     // We are at a combining mark.  If the preceding character is anything
   1847     //   except a CONTROL, CR or LF, we are in a combining sequence.
   1848     U16_PREV(text, 0, index, c);
   1849     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   1850 
   1851     return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   1852 }
   1853 
   1854 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
   1855 {
   1856     UErrorCode      status = U_ZERO_ERROR;
   1857     OrderList       targetOrders(coll, target, offset);
   1858     OrderList       patternOrders(coll, pattern);
   1859     int32_t         targetSize  = targetOrders.size() - 1;
   1860     int32_t         patternSize = patternOrders.size() - 1;
   1861     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
   1862                                                   target.getBuffer(), target.length(), &status);
   1863 
   1864     if (patternSize == 0) {
   1865         // Searching for an empty pattern always fails
   1866         matchStart = matchEnd = -1;
   1867         ubrk_close(charBreakIterator);
   1868         return FALSE;
   1869     }
   1870 
   1871     matchStart = matchEnd = -1;
   1872 
   1873     for(int32_t i = 0; i < targetSize; i += 1) {
   1874         if (targetOrders.matchesAt(i, patternOrders)) {
   1875             int32_t start    = targetOrders.getLowOffset(i);
   1876             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
   1877             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
   1878 
   1879             // if the low and high offsets of the first CE in
   1880             // the match are the same, it means that the match
   1881             // starts in the middle of an expansion - all but
   1882             // the first CE of the expansion will have the offset
   1883             // of the following character.
   1884             if (start == targetOrders.getHighOffset(i)) {
   1885                 continue;
   1886             }
   1887 
   1888             // Make sure match starts on a grapheme boundary
   1889             if (! ubrk_isBoundary(charBreakIterator, start)) {
   1890                 continue;
   1891             }
   1892 
   1893             // If the low and high offsets of the CE after the match
   1894             // are the same, it means that the match ends in the middle
   1895             // of an expansion sequence.
   1896             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
   1897                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
   1898                 continue;
   1899             }
   1900 
   1901             int32_t mend = maxLimit;
   1902 
   1903             // Find the first grapheme break after the character index
   1904             // of the last CE in the match. If it's after character index
   1905             // that's after the last CE in the match, use that index
   1906             // as the end of the match.
   1907             if (minLimit < maxLimit) {
   1908                 int32_t nba = ubrk_following(charBreakIterator, minLimit);
   1909 
   1910                 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
   1911                     mend = nba;
   1912                 }
   1913             }
   1914 
   1915             if (mend > maxLimit) {
   1916                 continue;
   1917             }
   1918 
   1919             if (! ubrk_isBoundary(charBreakIterator, mend)) {
   1920                 continue;
   1921             }
   1922 
   1923             matchStart = start;
   1924             matchEnd   = mend;
   1925 
   1926             ubrk_close(charBreakIterator);
   1927             return TRUE;
   1928         }
   1929     }
   1930 
   1931     ubrk_close(charBreakIterator);
   1932     return FALSE;
   1933 }
   1934 
   1935 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1936 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   1937     int32_t val = defaultVal;
   1938 
   1939     name.append(" *= *(-?\\d+)");
   1940 
   1941     UErrorCode status = U_ZERO_ERROR;
   1942     RegexMatcher m(name, params, 0, status);
   1943 
   1944     if (m.find()) {
   1945         // The param exists.  Convert the string to an int.
   1946         char valString[100];
   1947         int32_t paramLength = m.end(1, status) - m.start(1, status);
   1948 
   1949         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   1950             paramLength = (int32_t)(sizeof(valString)-2);
   1951         }
   1952 
   1953         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   1954         val = strtol(valString,  NULL, 10);
   1955 
   1956         // Delete this parameter from the params string.
   1957         m.reset();
   1958         params = m.replaceFirst("", status);
   1959     }
   1960 
   1961   //U_ASSERT(U_SUCCESS(status));
   1962     if (! U_SUCCESS(status)) {
   1963         val = defaultVal;
   1964     }
   1965 
   1966     return val;
   1967 }
   1968 #endif
   1969 
   1970 #if !UCONFIG_NO_COLLATION
   1971 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
   1972                                     const char *name, const char *strength, uint32_t seed)
   1973 {
   1974     UErrorCode status = U_ZERO_ERROR;
   1975     int32_t actualStart = -1, actualEnd = -1;
   1976   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
   1977     int32_t expectedStart = -1, expectedEnd = -1;
   1978     int32_t notFoundCount = 0;
   1979     UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
   1980                                 testCase.getBuffer(), testCase.length(),
   1981                                 coll,
   1982                                 NULL,     // the break iterator
   1983                                 &status);
   1984 
   1985     // **** TODO: find *all* matches, not just first one ****
   1986     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
   1987 
   1988     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
   1989 
   1990     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
   1991         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
   1992               "    strength=%s seed=%d",
   1993               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
   1994     }
   1995 
   1996     if (expectedStart == -1 && actualStart == -1) {
   1997         notFoundCount += 1;
   1998     }
   1999 
   2000     // **** TODO: find *all* matches, not just first one ****
   2001     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
   2002 
   2003     usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
   2004 
   2005     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
   2006 
   2007     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
   2008         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
   2009               "    strength=%s seed=%d",
   2010               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
   2011     }
   2012 
   2013     if (expectedStart == -1 && actualStart == -1) {
   2014         notFoundCount += 1;
   2015     }
   2016 
   2017     usearch_close(uss);
   2018 
   2019     return notFoundCount;
   2020 }
   2021 
   2022 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
   2023                                     BoyerMooreSearch *bms, BoyerMooreSearch *abms,
   2024                                     const char *name, const char *strength, uint32_t seed)
   2025 {
   2026     UErrorCode status = U_ZERO_ERROR;
   2027     int32_t actualStart = -1, actualEnd = -1;
   2028   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
   2029     int32_t expectedStart = -1, expectedEnd = -1;
   2030     int32_t notFoundCount = 0;
   2031 
   2032     // **** TODO: find *all* matches, not just first one ****
   2033     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
   2034 
   2035     bms->setTargetString(&testCase, status);
   2036     bms->search(0, actualStart, actualEnd);
   2037 
   2038     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
   2039         errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
   2040               "    strength=%s seed=%d",
   2041               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
   2042     }
   2043 
   2044     if (expectedStart == -1 && actualStart == -1) {
   2045         notFoundCount += 1;
   2046     }
   2047 
   2048     // **** TODO: find *all* matches, not just first one ****
   2049     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
   2050 
   2051     abms->setTargetString(&testCase, status);
   2052     abms->search(0, actualStart, actualEnd);
   2053 
   2054     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
   2055         errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
   2056               "    strength=%s seed=%d",
   2057               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
   2058     }
   2059 
   2060     if (expectedStart == -1 && actualStart == -1) {
   2061         notFoundCount += 1;
   2062     }
   2063 
   2064 
   2065     return notFoundCount;
   2066 }
   2067 #endif
   2068 
   2069 void SSearchTest::monkeyTest(char *params)
   2070 {
   2071     // ook!
   2072     UErrorCode status = U_ZERO_ERROR;
   2073   //UCollator *coll = ucol_open(NULL, &status);
   2074     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
   2075 
   2076     if (U_FAILURE(status)) {
   2077         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
   2078         return;
   2079     }
   2080 
   2081     CollData  *monkeyData = CollData::open(coll, status);
   2082 
   2083     USet *expansions   = uset_openEmpty();
   2084     USet *contractions = uset_openEmpty();
   2085 
   2086     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
   2087 
   2088     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
   2089     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
   2090     USet *letters = uset_openPattern(letter_pattern, 39, &status);
   2091     SetMonkey letterMonkey(letters);
   2092     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
   2093     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
   2094     UnicodeString testCase;
   2095     UnicodeString alternate;
   2096     UnicodeString pattern, altPattern;
   2097     UnicodeString prefix, altPrefix;
   2098     UnicodeString suffix, altSuffix;
   2099 
   2100     Monkey *monkeys[] = {
   2101         &letterMonkey,
   2102         &contractionMonkey,
   2103         &expansionMonkey,
   2104         &contractionMonkey,
   2105         &expansionMonkey,
   2106         &contractionMonkey,
   2107         &expansionMonkey,
   2108         &contractionMonkey,
   2109         &expansionMonkey};
   2110     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
   2111     int32_t nonMatchCount = 0;
   2112 
   2113     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
   2114     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
   2115     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
   2116     int32_t loopCount = quick? 1000 : 10000;
   2117     int32_t firstStrength = 0;
   2118     int32_t lastStrength  = strengthCount - 1; //*/ 0;
   2119 
   2120     if (params != NULL) {
   2121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2122         UnicodeString p(params);
   2123 
   2124         loopCount = getIntParam("loop", p, loopCount);
   2125         m_seed    = getIntParam("seed", p, m_seed);
   2126 
   2127         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
   2128         if (m.find()) {
   2129             UnicodeString breakType = m.group(1, status);
   2130 
   2131             for (int32_t s = 0; s < strengthCount; s += 1) {
   2132                 if (breakType == strengthNames[s]) {
   2133                     firstStrength = lastStrength = s;
   2134                     break;
   2135                 }
   2136             }
   2137 
   2138             m.reset();
   2139             p = m.replaceFirst("", status);
   2140         }
   2141 
   2142         if (RegexMatcher("\\S", p, 0, status).find()) {
   2143             // Each option is stripped out of the option string as it is processed.
   2144             // All options have been checked.  The option string should have been completely emptied..
   2145             char buf[100];
   2146             p.extract(buf, sizeof(buf), NULL, status);
   2147             buf[sizeof(buf)-1] = 0;
   2148             errln("Unrecognized or extra parameter:  %s\n", buf);
   2149             return;
   2150         }
   2151 #else
   2152         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
   2153 #endif
   2154     }
   2155 
   2156     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
   2157         int32_t notFoundCount = 0;
   2158 
   2159         logln("Setting strength to %s.", strengthNames[s]);
   2160         ucol_setStrength(coll, strengths[s]);
   2161 
   2162         // TODO: try alternate prefix and suffix too?
   2163         // TODO: alterntaes are only equal at primary strength. Is this OK?
   2164         for(int32_t t = 0; t < loopCount; t += 1) {
   2165             uint32_t seed = m_seed;
   2166             int32_t  nmc = 0;
   2167 
   2168             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
   2169             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
   2170             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
   2171 
   2172             // pattern
   2173             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
   2174 
   2175             testCase.remove();
   2176             testCase.append(prefix);
   2177             testCase.append(/*alt*/pattern);
   2178 
   2179             // prefix + pattern
   2180             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
   2181 
   2182             testCase.append(suffix);
   2183 
   2184             // prefix + pattern + suffix
   2185             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
   2186 
   2187             testCase.remove();
   2188             testCase.append(pattern);
   2189             testCase.append(suffix);
   2190 
   2191             // pattern + suffix
   2192             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
   2193         }
   2194 
   2195        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
   2196     }
   2197 
   2198     uset_close(contractions);
   2199     uset_close(expansions);
   2200     uset_close(letters);
   2201 
   2202     CollData::close(monkeyData);
   2203 
   2204     ucol_close(coll);
   2205 }
   2206 
   2207 void SSearchTest::bmMonkeyTest(char *params)
   2208 {
   2209     // ook!
   2210     UErrorCode status = U_ZERO_ERROR;
   2211     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
   2212 
   2213     if (U_FAILURE(status)) {
   2214         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
   2215         return;
   2216     }
   2217 
   2218     CollData  *monkeyData = CollData::open(coll, status);
   2219 
   2220     USet *expansions   = uset_openEmpty();
   2221     USet *contractions = uset_openEmpty();
   2222 
   2223     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
   2224 
   2225     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
   2226     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
   2227     USet *letters = uset_openPattern(letter_pattern, 39, &status);
   2228     SetMonkey letterMonkey(letters);
   2229     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
   2230     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
   2231     UnicodeString testCase;
   2232     UnicodeString alternate;
   2233     UnicodeString pattern, altPattern;
   2234     UnicodeString prefix, altPrefix;
   2235     UnicodeString suffix, altSuffix;
   2236 
   2237     Monkey *monkeys[] = {
   2238         &letterMonkey,
   2239         &contractionMonkey,
   2240         &expansionMonkey,
   2241         &contractionMonkey,
   2242         &expansionMonkey,
   2243         &contractionMonkey,
   2244         &expansionMonkey,
   2245         &contractionMonkey,
   2246         &expansionMonkey};
   2247     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
   2248     int32_t nonMatchCount = 0;
   2249 
   2250     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
   2251     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
   2252     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
   2253     int32_t loopCount = quick? 1000 : 10000;
   2254     int32_t firstStrength = 0;
   2255     int32_t lastStrength  = strengthCount - 1; //*/ 0;
   2256 
   2257     if (params != NULL) {
   2258 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2259         UnicodeString p(params);
   2260 
   2261         loopCount = getIntParam("loop", p, loopCount);
   2262         m_seed    = getIntParam("seed", p, m_seed);
   2263 
   2264         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
   2265         if (m.find()) {
   2266             UnicodeString breakType = m.group(1, status);
   2267 
   2268             for (int32_t s = 0; s < strengthCount; s += 1) {
   2269                 if (breakType == strengthNames[s]) {
   2270                     firstStrength = lastStrength = s;
   2271                     break;
   2272                 }
   2273             }
   2274 
   2275             m.reset();
   2276             p = m.replaceFirst("", status);
   2277         }
   2278 
   2279         if (RegexMatcher("\\S", p, 0, status).find()) {
   2280             // Each option is stripped out of the option string as it is processed.
   2281             // All options have been checked.  The option string should have been completely emptied..
   2282             char buf[100];
   2283             p.extract(buf, sizeof(buf), NULL, status);
   2284             buf[sizeof(buf)-1] = 0;
   2285             errln("Unrecognized or extra parameter:  %s\n", buf);
   2286             return;
   2287         }
   2288 #else
   2289         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
   2290 #endif
   2291     }
   2292 
   2293     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
   2294         int32_t notFoundCount = 0;
   2295 
   2296         logln("Setting strength to %s.", strengthNames[s]);
   2297         ucol_setStrength(coll, strengths[s]);
   2298 
   2299         CollData *data = CollData::open(coll, status);
   2300 
   2301         // TODO: try alternate prefix and suffix too?
   2302         // TODO: alterntaes are only equal at primary strength. Is this OK?
   2303         for(int32_t t = 0; t < loopCount; t += 1) {
   2304             uint32_t seed = m_seed;
   2305             int32_t  nmc = 0;
   2306 
   2307             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
   2308             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
   2309             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
   2310 
   2311             BoyerMooreSearch pat(data, pattern, NULL, status);
   2312             BoyerMooreSearch alt(data, altPattern, NULL, status);
   2313 
   2314             // **** need a better way to deal with this ****
   2315 #if 0
   2316             if (pat.empty() ||
   2317                 alt.empty()) {
   2318                     continue;
   2319             }
   2320 #endif
   2321 
   2322             // pattern
   2323             notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
   2324 
   2325             testCase.remove();
   2326             testCase.append(prefix);
   2327             testCase.append(/*alt*/pattern);
   2328 
   2329             // prefix + pattern
   2330             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
   2331 
   2332             testCase.append(suffix);
   2333 
   2334             // prefix + pattern + suffix
   2335             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
   2336 
   2337             testCase.remove();
   2338             testCase.append(pattern);
   2339             testCase.append(suffix);
   2340 
   2341             // pattern + suffix
   2342             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
   2343         }
   2344 
   2345         CollData::close(data);
   2346 
   2347         logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
   2348     }
   2349 
   2350     uset_close(contractions);
   2351     uset_close(expansions);
   2352     uset_close(letters);
   2353 
   2354     CollData::close(monkeyData);
   2355 
   2356     ucol_close(coll);
   2357 }
   2358 
   2359 #endif
   2360 
   2361 #endif
   2362