Home | History | Annotate | Download | only in intltest
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /********************************************************************
      4  * COPYRIGHT:
      5  * Copyright (c) 1999-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ********************************************************************/
      8 /************************************************************************
      9 *   Date        Name        Description
     10 *   12/15/99    Madhu        Creation.
     11 *   01/12/2000  Madhu        Updated for changed API and added new tests
     12 ************************************************************************/
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include <stdio.h>
     18 #include <stdlib.h>
     19 #include <string.h>
     20 
     21 #include "unicode/brkiter.h"
     22 #include "unicode/localpointer.h"
     23 #include "unicode/numfmt.h"
     24 #include "unicode/rbbi.h"
     25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     26 #include "unicode/regex.h"
     27 #endif
     28 #include "unicode/schriter.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/utf16.h"
     31 #include "unicode/ucnv.h"
     32 #include "unicode/uniset.h"
     33 #include "unicode/uscript.h"
     34 #include "unicode/ustring.h"
     35 #include "unicode/utext.h"
     36 
     37 #include "charstr.h"
     38 #include "cmemory.h"
     39 #include "cstr.h"
     40 #include "intltest.h"
     41 #include "rbbitst.h"
     42 #include "utypeinfo.h"  // for 'typeid' to work
     43 #include "uvector.h"
     44 #include "uvectr32.h"
     45 
     46 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
     47 #include "unicode/filteredbrk.h"
     48 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
     49 
     50 #define TEST_ASSERT(x) {if (!(x)) { \
     51     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     52 
     53 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     54     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     55 
     56 //---------------------------------------------
     57 // runIndexedTest
     58 //---------------------------------------------
     59 
     60 
     61 //  Note:  Before adding new tests to this file, check whether the desired test data can
     62 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
     63 //         it's much less work than writing a new test, diagnostic output in the event of failures
     64 //         is good, and the test data file is shared with ICU4J, so eventually the test
     65 //         will run there as well, without additional effort.
     66 
     67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     68 {
     69     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     70     fTestParams = params;
     71 
     72     TESTCASE_AUTO_BEGIN;
     73 #if !UCONFIG_NO_FILE_IO
     74     TESTCASE_AUTO(TestBug4153072);
     75 #endif
     76 #if !UCONFIG_NO_FILE_IO
     77     TESTCASE_AUTO(TestUnicodeFiles);
     78     TESTCASE_AUTO(TestEmptyString);
     79 #endif
     80     TESTCASE_AUTO(TestGetAvailableLocales);
     81     TESTCASE_AUTO(TestGetDisplayName);
     82 #if !UCONFIG_NO_FILE_IO
     83     TESTCASE_AUTO(TestEndBehaviour);
     84     TESTCASE_AUTO(TestWordBreaks);
     85     TESTCASE_AUTO(TestWordBoundary);
     86     TESTCASE_AUTO(TestLineBreaks);
     87     TESTCASE_AUTO(TestSentBreaks);
     88     TESTCASE_AUTO(TestExtended);
     89 #endif
     90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
     91     TESTCASE_AUTO(TestMonkey);
     92 #endif
     93 #if !UCONFIG_NO_FILE_IO
     94     TESTCASE_AUTO(TestBug3818);
     95 #endif
     96     TESTCASE_AUTO(TestDebug);
     97 #if !UCONFIG_NO_FILE_IO
     98     TESTCASE_AUTO(TestBug5775);
     99 #endif
    100     TESTCASE_AUTO(TestBug9983);
    101     TESTCASE_AUTO(TestDictRules);
    102     TESTCASE_AUTO(TestBug5532);
    103     TESTCASE_AUTO(TestBug7547);
    104     TESTCASE_AUTO(TestBug12797);
    105     TESTCASE_AUTO(TestBug12918);
    106     TESTCASE_AUTO(TestBug12932);
    107     TESTCASE_AUTO(TestEmoji);
    108     TESTCASE_AUTO(TestBug12519);
    109     TESTCASE_AUTO_END;
    110 }
    111 
    112 
    113 //---------------------------------------------------------------------------
    114 //
    115 //   class BITestData   Holds a set of Break iterator test data and results
    116 //                      Includes
    117 //                         - the string data to be broken
    118 //                         - a vector of the expected break positions.
    119 //                         - a vector of source line numbers for the data,
    120 //                               (to help see where errors occurred.)
    121 //                         - The expected break tag values.
    122 //                         - Vectors of actual break positions and tag values.
    123 //                         - Functions for comparing actual with expected and
    124 //                            reporting errors.
    125 //
    126 //----------------------------------------------------------------------------
    127 class BITestData {
    128 public:
    129     UnicodeString    fDataToBreak;
    130     UVector          fExpectedBreakPositions;
    131     UVector          fExpectedTags;
    132     UVector          fLineNum;
    133     UVector          fActualBreakPositions;   // Test Results.
    134     UVector          fActualTags;
    135 
    136     BITestData(UErrorCode &status);
    137     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    138     void             checkResults(const char *heading, RBBITest *test);
    139     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    140     void             clearResults();
    141 };
    142 
    143 //
    144 // Constructor.
    145 //
    146 BITestData::BITestData(UErrorCode &status)
    147 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    148   fActualTags(status)
    149 {
    150 }
    151 
    152 //
    153 // addDataChunk.   Add a section (non-breaking) piece of data to the test data.
    154 //                 The macro form collects the line number, which is helpful
    155 //                 when tracking down failures.
    156 //
    157 //                 A null data item is inserted at the start of each test's data
    158 //                  to put the starting zero into the data list.  The position saved for
    159 //                  each non-null item is its ending position.
    160 //
    161 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    162 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    163     if (U_FAILURE(status)) {return;}
    164     if (data != NULL) {
    165         fDataToBreak.append(CharsToUnicodeString(data));
    166     }
    167     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    168     fExpectedTags.addElement(tag, status);
    169     fLineNum.addElement(lineNum, status);
    170 }
    171 
    172 
    173 //
    174 //  checkResults.   Compare the actual and expected break positions, report any differences.
    175 //
    176 void BITestData::checkResults(const char *heading, RBBITest *test) {
    177     int32_t   expectedIndex = 0;
    178     int32_t   actualIndex = 0;
    179 
    180     for (;;) {
    181         // If we've run through both the expected and actual results vectors, we're done.
    182         //   break out of the loop.
    183         if (expectedIndex >= fExpectedBreakPositions.size() &&
    184             actualIndex   >= fActualBreakPositions.size()) {
    185             break;
    186         }
    187 
    188 
    189         if (expectedIndex >= fExpectedBreakPositions.size()) {
    190             err(heading, test, expectedIndex-1, actualIndex);
    191             actualIndex++;
    192             continue;
    193         }
    194 
    195         if (actualIndex >= fActualBreakPositions.size()) {
    196             err(heading, test, expectedIndex, actualIndex-1);
    197             expectedIndex++;
    198             continue;
    199         }
    200 
    201         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    202             err(heading, test, expectedIndex, actualIndex);
    203             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    204             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    205                 actualIndex++;
    206             } else {
    207                 expectedIndex++;
    208             }
    209             continue;
    210         }
    211 
    212         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    213             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    214                 heading, fLineNum.elementAt(expectedIndex),
    215                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    216         }
    217 
    218         actualIndex++;
    219         expectedIndex++;
    220     }
    221 }
    222 
    223 //
    224 //  err   -  An error was found.  Report it, along with information about where the
    225 //                                incorrectly broken test data appeared in the source file.
    226 //
    227 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    228 {
    229     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    230     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    231     int32_t   o        = 0;
    232     int32_t   line     = fLineNum.elementAti(expectedIdx);
    233     if (expectedIdx > 0) {
    234         // The line numbers are off by one because a premature break occurs somewhere
    235         //    within the previous item, rather than at the start of the current (expected) item.
    236         //    We want to report the offset of the unexpected break from the start of
    237         //      this previous item.
    238         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    239     }
    240     if (actual < expected) {
    241         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    242     } else {
    243         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    244     }
    245 }
    246 
    247 
    248 void BITestData::clearResults() {
    249     fActualBreakPositions.removeAllElements();
    250     fActualTags.removeAllElements();
    251 }
    252 
    253 
    254 //--------------------------------------------------------------------------------------
    255 //
    256 //    RBBITest    constructor and destructor
    257 //
    258 //--------------------------------------------------------------------------------------
    259 
    260 RBBITest::RBBITest() {
    261     fTestParams = NULL;
    262 }
    263 
    264 
    265 RBBITest::~RBBITest() {
    266 }
    267 
    268 
    269 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
    270     UErrorCode status = U_ZERO_ERROR;
    271     char name[100];
    272     printf("code    alpha extend alphanum type word sent line name\n");
    273     int nextExpectedIndex = 0;
    274     utext_setNativeIndex(tstr, 0);
    275     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
    276         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
    277             printf("------------------------------------------------ %d\n", j);
    278             ++nextExpectedIndex;
    279         }
    280 
    281         UChar32 c = utext_next32(tstr);
    282         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    283         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    284                            u_isUAlphabetic(c),
    285                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    286                            u_isalnum(c),
    287                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    288                                                   u_charType(c),
    289                                                   U_SHORT_PROPERTY_NAME),
    290                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    291                                                   u_getIntPropertyValue(c,
    292                                                           UCHAR_WORD_BREAK),
    293                                                   U_SHORT_PROPERTY_NAME),
    294                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    295                                    u_getIntPropertyValue(c,
    296                                            UCHAR_SENTENCE_BREAK),
    297                                    U_SHORT_PROPERTY_NAME),
    298                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    299                                    u_getIntPropertyValue(c,
    300                                            UCHAR_LINE_BREAK),
    301                                    U_SHORT_PROPERTY_NAME),
    302                            name);
    303     }
    304 }
    305 
    306 
    307 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
    308    UErrorCode status = U_ZERO_ERROR;
    309    UText *tstr = NULL;
    310    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
    311    if (U_FAILURE(status)) {
    312        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
    313        return;
    314     }
    315    printStringBreaks(tstr, expected, expectedCount);
    316    utext_close(tstr);
    317 }
    318 
    319 
    320 void RBBITest::TestBug3818() {
    321     UErrorCode  status = U_ZERO_ERROR;
    322 
    323     // Four Thai words...
    324     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    325                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    326     UnicodeString  thaiStr(thaiWordData);
    327 
    328     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
    329     if (U_FAILURE(status) || bi == NULL) {
    330         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    331         return;
    332     }
    333     bi->setText(thaiStr);
    334 
    335     int32_t  startOfSecondWord = bi->following(1);
    336     if (startOfSecondWord != 4) {
    337         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    338             __FILE__, __LINE__, startOfSecondWord);
    339     }
    340     startOfSecondWord = bi->following(0);
    341     if (startOfSecondWord != 4) {
    342         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    343             __FILE__, __LINE__, startOfSecondWord);
    344     }
    345     delete bi;
    346 }
    347 
    348 //----------------------------------------------------------------------------
    349 //
    350 // generalIteratorTest      Given a break iterator and a set of test data,
    351 //                          Run the tests and report the results.
    352 //
    353 //----------------------------------------------------------------------------
    354 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    355 {
    356 
    357     bi.setText(td.fDataToBreak);
    358 
    359     testFirstAndNext(bi, td);
    360 
    361     testLastAndPrevious(bi, td);
    362 
    363     testFollowing(bi, td);
    364     testPreceding(bi, td);
    365     testIsBoundary(bi, td);
    366     doMultipleSelectionTest(bi, td);
    367 }
    368 
    369 
    370 //
    371 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    372 //                       kind of loop.
    373 //
    374 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    375 {
    376     UErrorCode  status = U_ZERO_ERROR;
    377     int32_t     p;
    378     int32_t     lastP = -1;
    379     int32_t     tag;
    380 
    381     logln("Test first and next");
    382     bi.setText(td.fDataToBreak);
    383     td.clearResults();
    384 
    385     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    386         td.fActualBreakPositions.addElement(p, status);  // Save result.
    387         tag = bi.getRuleStatus();
    388         td.fActualTags.addElement(tag, status);
    389         if (p <= lastP) {
    390             // If the iterator is not making forward progress, stop.
    391             //  No need to raise an error here, it'll be detected in the normal check of results.
    392             break;
    393         }
    394         lastP = p;
    395     }
    396     td.checkResults("testFirstAndNext", this);
    397 }
    398 
    399 
    400 //
    401 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    402 //
    403 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    404 {
    405     UErrorCode  status = U_ZERO_ERROR;
    406     int32_t     p;
    407     int32_t     lastP  = 0x7ffffffe;
    408     int32_t     tag;
    409 
    410     logln("Test last and previous");
    411     bi.setText(td.fDataToBreak);
    412     td.clearResults();
    413 
    414     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    415         // Save break position.  Insert it at start of vector of results, shoving
    416         //    already-saved results further towards the end.
    417         td.fActualBreakPositions.insertElementAt(p, 0, status);
    418         // bi.previous();   // TODO:  Why does this fix things up????
    419         // bi.next();
    420         tag = bi.getRuleStatus();
    421         td.fActualTags.insertElementAt(tag, 0, status);
    422         if (p >= lastP) {
    423             // If the iterator is not making progress, stop.
    424             //  No need to raise an error here, it'll be detected in the normal check of results.
    425             break;
    426         }
    427         lastP = p;
    428     }
    429     td.checkResults("testLastAndPrevious", this);
    430 }
    431 
    432 
    433 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    434 {
    435     UErrorCode  status = U_ZERO_ERROR;
    436     int32_t     p;
    437     int32_t     tag;
    438     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    439                                  //   cannot be -1; that is returned for DONE.
    440     int         i;
    441 
    442     logln("testFollowing():");
    443     bi.setText(td.fDataToBreak);
    444     td.clearResults();
    445 
    446     // Save the starting point, since we won't get that out of following.
    447     p = bi.first();
    448     td.fActualBreakPositions.addElement(p, status);  // Save result.
    449     tag = bi.getRuleStatus();
    450     td.fActualTags.addElement(tag, status);
    451 
    452     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    453         p = bi.following(i);
    454         if (p != lastP) {
    455             if (p == RuleBasedBreakIterator::DONE) {
    456                 break;
    457             }
    458             // We've reached a new break position.  Save it.
    459             td.fActualBreakPositions.addElement(p, status);  // Save result.
    460             tag = bi.getRuleStatus();
    461             td.fActualTags.addElement(tag, status);
    462             lastP = p;
    463         }
    464     }
    465     // The loop normally exits by means of the break in the middle.
    466     // Make sure that the index was at the correct position for the break iterator to have
    467     //   returned DONE.
    468     if (i != td.fDataToBreak.length()) {
    469         errln("testFollowing():  iterator returned DONE prematurely.");
    470     }
    471 
    472     // Full check of all results.
    473     td.checkResults("testFollowing", this);
    474 }
    475 
    476 
    477 
    478 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    479     UErrorCode  status = U_ZERO_ERROR;
    480     int32_t     p;
    481     int32_t     tag;
    482     int32_t     lastP  = 0x7ffffffe;
    483     int         i;
    484 
    485     logln("testPreceding():");
    486     bi.setText(td.fDataToBreak);
    487     td.clearResults();
    488 
    489     p = bi.last();
    490     td.fActualBreakPositions.addElement(p, status);
    491     tag = bi.getRuleStatus();
    492     td.fActualTags.addElement(tag, status);
    493 
    494     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    495         p = bi.preceding(i);
    496         if (p != lastP) {
    497             if (p == RuleBasedBreakIterator::DONE) {
    498                 break;
    499             }
    500             // We've reached a new break position.  Save it.
    501             td.fActualBreakPositions.insertElementAt(p, 0, status);
    502             lastP = p;
    503             tag = bi.getRuleStatus();
    504             td.fActualTags.insertElementAt(tag, 0, status);
    505         }
    506     }
    507     // The loop normally exits by means of the break in the middle.
    508     // Make sure that the index was at the correct position for the break iterator to have
    509     //   returned DONE.
    510     if (i != 0) {
    511         errln("testPreceding():  iterator returned DONE prematurely.");
    512     }
    513 
    514     // Full check of all results.
    515     td.checkResults("testPreceding", this);
    516 }
    517 
    518 
    519 
    520 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    521     UErrorCode  status = U_ZERO_ERROR;
    522     int         i;
    523     int32_t     tag;
    524 
    525     logln("testIsBoundary():");
    526     bi.setText(td.fDataToBreak);
    527     td.clearResults();
    528 
    529     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    530         if (bi.isBoundary(i)) {
    531             td.fActualBreakPositions.addElement(i, status);  // Save result.
    532             tag = bi.getRuleStatus();
    533             td.fActualTags.addElement(tag, status);
    534         }
    535     }
    536     td.checkResults("testIsBoundary: ", this);
    537 }
    538 
    539 
    540 
    541 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    542 {
    543     iterator.setText(td.fDataToBreak);
    544 
    545     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    546     int32_t offset = iterator.first();
    547     int32_t testOffset;
    548     int32_t count = 0;
    549 
    550     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    551 
    552     if (*testIterator != iterator)
    553         errln("clone() or operator!= failed: two clones compared unequal");
    554 
    555     do {
    556         testOffset = testIterator->first();
    557         testOffset = testIterator->next(count);
    558         if (offset != testOffset)
    559             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    560 
    561         if (offset != RuleBasedBreakIterator::DONE) {
    562             count++;
    563             offset = iterator.next();
    564 
    565             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    566                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    567                 if (count > 10000 || offset == -1) {
    568                     errln("operator== failed too many times. Stopping test.");
    569                     if (offset == -1) {
    570                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    571                     }
    572                     return;
    573                 }
    574             }
    575         }
    576     } while (offset != RuleBasedBreakIterator::DONE);
    577 
    578     // now do it backwards...
    579     offset = iterator.last();
    580     count = 0;
    581 
    582     do {
    583         testOffset = testIterator->last();
    584         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    585         if (offset != testOffset)
    586             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    587 
    588         if (offset != RuleBasedBreakIterator::DONE) {
    589             count--;
    590             offset = iterator.previous();
    591         }
    592     } while (offset != RuleBasedBreakIterator::DONE);
    593 
    594     delete testIterator;
    595 }
    596 
    597 
    598 //---------------------------------------------
    599 //
    600 //     other tests
    601 //
    602 //---------------------------------------------
    603 void RBBITest::TestEmptyString()
    604 {
    605     UnicodeString text = "";
    606     UErrorCode status = U_ZERO_ERROR;
    607 
    608     BITestData x(status);
    609     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    610     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    611     if (U_FAILURE(status))
    612     {
    613         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    614         return;
    615     }
    616     generalIteratorTest(*bi, x);
    617     delete bi;
    618 }
    619 
    620 void RBBITest::TestGetAvailableLocales()
    621 {
    622     int32_t locCount = 0;
    623     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    624 
    625     if (locCount == 0)
    626         dataerrln("getAvailableLocales() returned an empty list!");
    627     // Just make sure that it's returning good memory.
    628     int32_t i;
    629     for (i = 0; i < locCount; ++i) {
    630         logln(locList[i].getName());
    631     }
    632 }
    633 
    634 //Testing the BreakIterator::getDisplayName() function
    635 void RBBITest::TestGetDisplayName()
    636 {
    637     UnicodeString   result;
    638 
    639     BreakIterator::getDisplayName(Locale::getUS(), result);
    640     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    641         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    642                 + result);
    643 
    644     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    645     if (result != "French (France)")
    646         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    647                 + result);
    648 }
    649 /**
    650  * Test End Behaviour
    651  * @bug 4068137
    652  */
    653 void RBBITest::TestEndBehaviour()
    654 {
    655     UErrorCode status = U_ZERO_ERROR;
    656     UnicodeString testString("boo.");
    657     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    658     if (U_FAILURE(status))
    659     {
    660         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    661         return;
    662     }
    663     wb->setText(testString);
    664 
    665     if (wb->first() != 0)
    666         errln("Didn't get break at beginning of string.");
    667     if (wb->next() != 3)
    668         errln("Didn't get break before period in \"boo.\"");
    669     if (wb->current() != 4 && wb->next() != 4)
    670         errln("Didn't get break at end of string.");
    671     delete wb;
    672 }
    673 /*
    674  * @bug 4153072
    675  */
    676 void RBBITest::TestBug4153072() {
    677     UErrorCode status = U_ZERO_ERROR;
    678     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    679     if (U_FAILURE(status))
    680     {
    681         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    682         return;
    683     }
    684     UnicodeString str("...Hello, World!...");
    685     int32_t begin = 3;
    686     int32_t end = str.length() - 3;
    687     UBool onBoundary;
    688 
    689     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    690     iter->adoptText(textIterator);
    691     int index;
    692     // Note: with the switch to UText, there is no way to restrict the
    693     //       iteration range to begin at an index other than zero.
    694     //       String character iterators created with a non-zero bound are
    695     //         treated by RBBI as being empty.
    696     for (index = -1; index < begin + 1; ++index) {
    697         onBoundary = iter->isBoundary(index);
    698         if (index == 0?  !onBoundary : onBoundary) {
    699             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    700                             " and begin index = " + begin);
    701         }
    702     }
    703     delete iter;
    704 }
    705 
    706 
    707 //
    708 // Test for problem reported by Ashok Matoria on 9 July 2007
    709 //    One.<kSoftHyphen><kSpace>Two.
    710 //
    711 //    Sentence break at start (0) and then on calling next() it breaks at
    712 //   'T' of "Two". Now, at this point if I do next() and
    713 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    714 //
    715 void RBBITest::TestBug5775() {
    716     UErrorCode status = U_ZERO_ERROR;
    717     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    718     TEST_ASSERT_SUCCESS(status);
    719     if (U_FAILURE(status)) {
    720         return;
    721     }
    722 // Check for status first for better handling of no data errors.
    723     TEST_ASSERT(bi != NULL);
    724     if (bi == NULL) {
    725         return;
    726     }
    727 
    728     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    729     //               01234      56789
    730     s = s.unescape();
    731     bi->setText(s);
    732     int pos = bi->next();
    733     TEST_ASSERT(pos == 6);
    734     pos = bi->next();
    735     TEST_ASSERT(pos == 10);
    736     pos = bi->previous();
    737     TEST_ASSERT(pos == 6);
    738     delete bi;
    739 }
    740 
    741 
    742 
    743 //------------------------------------------------------------------------------
    744 //
    745 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    746 //
    747 //------------------------------------------------------------------------------
    748 
    749 struct TestParams {
    750     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
    751                                            //   Changed out whenever test data changes break type.
    752 
    753     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
    754     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
    755     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
    756     UVector32       *srcCol;
    757 
    758     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
    759     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
    760     CharString       utf8String;           // UTF-8 form of text to break.
    761 
    762     TestParams(UErrorCode &status) : dataToBreak() {
    763         bi               = NULL;
    764         expectedBreaks   = new UVector32(status);
    765         srcLine          = new UVector32(status);
    766         srcCol           = new UVector32(status);
    767         textToBreak      = NULL;
    768         textMap          = new UVector32(status);
    769     }
    770 
    771     ~TestParams() {
    772         delete bi;
    773         delete expectedBreaks;
    774         delete srcLine;
    775         delete srcCol;
    776         utext_close(textToBreak);
    777         delete textMap;
    778     }
    779 
    780     int32_t getSrcLine(int32_t bp);
    781     int32_t getExpectedBreak(int32_t bp);
    782     int32_t getSrcCol(int32_t bp);
    783 
    784     void setUTF16(UErrorCode &status);
    785     void setUTF8(UErrorCode &status);
    786 };
    787 
    788 // Append a UnicodeString to a CharString with UTF-8 encoding.
    789 // Substitute any invalid chars.
    790 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
    791 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
    792     if (U_FAILURE(status)) {
    793         return;
    794     }
    795     int32_t utf8Length;
    796     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
    797                        src.getBuffer(), src.length(),   // UTF-16 data
    798                        0xfffd, NULL,                    // Substitution char, number of subs.
    799                        &status);
    800     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    801         return;
    802     }
    803     status = U_ZERO_ERROR;
    804     int32_t capacity;
    805     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
    806     u_strToUTF8WithSub(buffer, utf8Length, NULL,
    807                        src.getBuffer(), src.length(),
    808                        0xfffd, NULL, &status);
    809     dest.append(buffer, utf8Length, status);
    810 }
    811 
    812 
    813 void TestParams::setUTF16(UErrorCode &status) {
    814     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
    815     textMap->removeAllElements();
    816     for (int32_t i=0; i<dataToBreak.length(); i++) {
    817         if (i == dataToBreak.getChar32Start(i)) {
    818             textMap->addElement(i, status);
    819         } else {
    820             textMap->addElement(-1, status);
    821         }
    822     }
    823     textMap->addElement(dataToBreak.length(), status);
    824     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
    825 }
    826 
    827 
    828 void TestParams::setUTF8(UErrorCode &status) {
    829     if (U_FAILURE(status)) {
    830         return;
    831     }
    832     utf8String.clear();
    833     CharStringAppend(utf8String, dataToBreak, status);
    834     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
    835     if (U_FAILURE(status)) {
    836         return;
    837     }
    838 
    839     textMap->removeAllElements();
    840     int32_t utf16Index = 0;
    841     for (;;) {
    842         textMap->addElement(utf16Index, status);
    843         UChar32 c32 = utext_current32(textToBreak);
    844         if (c32 < 0) {
    845             break;
    846         }
    847         utf16Index += U16_LENGTH(c32);
    848         utext_next32(textToBreak);
    849         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
    850             textMap->addElement(-1, status);
    851         }
    852     }
    853     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
    854 }
    855 
    856 
    857 int32_t TestParams::getSrcLine(int32_t bp) {
    858     if (bp >= textMap->size()) {
    859         bp = textMap->size() - 1;
    860     }
    861     int32_t i = 0;
    862     for(; bp >= 0 ; --bp) {
    863         // Move to a character boundary if we are not on one already.
    864         i = textMap->elementAti(bp);
    865         if (i >= 0) {
    866             break;
    867         }
    868     }
    869     return srcLine->elementAti(i);
    870 }
    871 
    872 
    873 int32_t TestParams::getExpectedBreak(int32_t bp) {
    874     if (bp >= textMap->size()) {
    875         return 0;
    876     }
    877     int32_t i = textMap->elementAti(bp);
    878     int32_t retVal = 0;
    879     if (i >= 0) {
    880         retVal = expectedBreaks->elementAti(i);
    881     }
    882     return retVal;
    883 }
    884 
    885 
    886 int32_t TestParams::getSrcCol(int32_t bp) {
    887     if (bp >= textMap->size()) {
    888         bp = textMap->size() - 1;
    889     }
    890     int32_t i = 0;
    891     for(; bp >= 0; --bp) {
    892         // Move bp to a character boundary if we are not on one already.
    893         i = textMap->elementAti(bp);
    894         if (i >= 0) {
    895             break;
    896         }
    897     }
    898     return srcCol->elementAti(i);
    899 }
    900 
    901 
    902 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    903     int32_t    bp;
    904     int32_t    prevBP;
    905     int32_t    i;
    906 
    907     TEST_ASSERT_SUCCESS(status);
    908     if (U_FAILURE(status)) {
    909         return;
    910     }
    911 
    912     if (t->bi == NULL) {
    913         return;
    914     }
    915 
    916     t->bi->setText(t->textToBreak, status);
    917     //
    918     //  Run the iterator forward
    919     //
    920     prevBP = -1;
    921     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
    922         if (prevBP ==  bp) {
    923             // Fail for lack of forward progress.
    924             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
    925                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
    926             break;
    927         }
    928 
    929         // Check that there we didn't miss an expected break between the last one
    930         //  and this one.
    931         for (i=prevBP+1; i<bp; i++) {
    932             if (t->getExpectedBreak(i) != 0) {
    933                 int expected[] = {0, i};
    934                 printStringBreaks(t->dataToBreak, expected, 2);
    935                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    936                       i, t->getSrcLine(i), t->getSrcCol(i));
    937             }
    938         }
    939 
    940         // Check that the break we did find was expected
    941         if (t->getExpectedBreak(bp) == 0) {
    942             int expected[] = {0, bp};
    943             printStringBreaks(t->textToBreak, expected, 2);
    944             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    945                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
    946         } else {
    947             // The break was expected.
    948             //   Check that the {nnn} tag value is correct.
    949             int32_t expectedTagVal = t->getExpectedBreak(bp);
    950             if (expectedTagVal == -1) {
    951                 expectedTagVal = 0;
    952             }
    953             int32_t line = t->getSrcLine(bp);
    954             int32_t rs = t->bi->getRuleStatus();
    955             if (rs != expectedTagVal) {
    956                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
    957                       "          Actual, Expected status = %4d, %4d",
    958                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
    959             }
    960         }
    961 
    962         prevBP = bp;
    963     }
    964 
    965     // Verify that there were no missed expected breaks after the last one found
    966     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
    967         if (t->getExpectedBreak(i) != 0) {
    968             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    969                       i, t->getSrcLine(i), t->getSrcCol(i));
    970         }
    971     }
    972 
    973     //
    974     //  Run the iterator backwards, verify that the same breaks are found.
    975     //
    976     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
    977     bp = t->bi->last();
    978     while (bp != BreakIterator::DONE) {
    979         if (prevBP ==  bp) {
    980             // Fail for lack of progress.
    981             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
    982                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
    983             break;
    984         }
    985 
    986         // Check that we didn't miss an expected break between the last one
    987         //  and this one.  (UVector returns zeros for index out of bounds.)
    988         for (i=prevBP-1; i>bp; i--) {
    989             if (t->getExpectedBreak(i) != 0) {
    990                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    991                       i, t->getSrcLine(i), t->getSrcCol(i));
    992             }
    993         }
    994 
    995         // Check that the break we did find was expected
    996         if (t->getExpectedBreak(bp) == 0) {
    997             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    998                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
    999         } else {
   1000             // The break was expected.
   1001             //   Check that the {nnn} tag value is correct.
   1002             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1003             if (expectedTagVal == -1) {
   1004                 expectedTagVal = 0;
   1005             }
   1006             int line = t->getSrcLine(bp);
   1007             int32_t rs = t->bi->getRuleStatus();
   1008             if (rs != expectedTagVal) {
   1009                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1010                       "          Actual, Expected status = %4d, %4d",
   1011                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1012             }
   1013         }
   1014 
   1015         prevBP = bp;
   1016         bp = t->bi->previous();
   1017     }
   1018 
   1019     // Verify that there were no missed breaks prior to the last one found
   1020     for (i=prevBP-1; i>=0; i--) {
   1021         if (t->getExpectedBreak(i) != 0) {
   1022             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1023                       i, t->getSrcLine(i), t->getSrcCol(i));
   1024         }
   1025     }
   1026 
   1027     // Check isBoundary()
   1028     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1029         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
   1030         UBool boundaryFound    = t->bi->isBoundary(i);
   1031         if (boundaryExpected != boundaryFound) {
   1032             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
   1033                   "        Expected, Actual= %s, %s",
   1034                   i, t->getSrcLine(i), t->getSrcCol(i),
   1035                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
   1036         }
   1037     }
   1038 
   1039     // Check following()
   1040     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1041         int32_t actualBreak = t->bi->following(i);
   1042         int32_t expectedBreak = BreakIterator::DONE;
   1043         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
   1044             if (t->getExpectedBreak(j) != 0) {
   1045                 expectedBreak = j;
   1046                 break;
   1047             }
   1048         }
   1049         if (expectedBreak != actualBreak) {
   1050             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
   1051                   "        Expected, Actual= %d, %d",
   1052                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1053         }
   1054     }
   1055 
   1056     // Check preceding()
   1057     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
   1058         int32_t actualBreak = t->bi->preceding(i);
   1059         int32_t expectedBreak = BreakIterator::DONE;
   1060 
   1061         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
   1062         // preceding(trailing byte) will return the index of some preceding code point,
   1063         // not the lead byte of the current code point, even though that has a smaller index.
   1064         // Therefore, start looking at the expected break data not at i-1, but at
   1065         // the start of code point index - 1.
   1066         utext_setNativeIndex(t->textToBreak, i);
   1067         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
   1068         for (; j >= 0; j--) {
   1069             if (t->getExpectedBreak(j) != 0) {
   1070                 expectedBreak = j;
   1071                 break;
   1072             }
   1073         }
   1074         if (expectedBreak != actualBreak) {
   1075             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1076                   "        Expected, Actual= %d, %d",
   1077                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1078         }
   1079     }
   1080 }
   1081 
   1082 
   1083 void RBBITest::TestExtended() {
   1084   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
   1085   // data driven test closely entangles filtered and regular data.
   1086 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
   1087     UErrorCode      status  = U_ZERO_ERROR;
   1088     Locale          locale("");
   1089 
   1090     TestParams          tp(status);
   1091 
   1092     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
   1093     if (U_FAILURE(status)) {
   1094         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1095     }
   1096 
   1097     //
   1098     //  Open and read the test data file.
   1099     //
   1100     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1101     CharString testFileName(testDataDirectory, -1, status);
   1102     testFileName.append("rbbitst.txt", -1, status);
   1103 
   1104     int    len;
   1105     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
   1106     if (U_FAILURE(status)) {
   1107         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
   1108         return;
   1109     }
   1110 
   1111     bool skipTest = false; // Skip this test?
   1112 
   1113     //
   1114     //  Put the test data into a UnicodeString
   1115     //
   1116     UnicodeString testString(FALSE, testFile, len);
   1117 
   1118     enum EParseState{
   1119         PARSE_COMMENT,
   1120         PARSE_TAG,
   1121         PARSE_DATA,
   1122         PARSE_NUM,
   1123         PARSE_RULES
   1124     }
   1125     parseState = PARSE_TAG;
   1126 
   1127     EParseState savedState = PARSE_TAG;
   1128 
   1129     int32_t    lineNum  = 1;
   1130     int32_t    colStart = 0;
   1131     int32_t    column   = 0;
   1132     int32_t    charIdx  = 0;
   1133 
   1134     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
   1135 
   1136     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
   1137     int32_t             rulesFirstLine;  // Line number of the start of current <rules> block
   1138 
   1139     for (charIdx = 0; charIdx < len; ) {
   1140         status = U_ZERO_ERROR;
   1141         UChar  c = testString.charAt(charIdx);
   1142         charIdx++;
   1143         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
   1144             // treat CRLF as a unit
   1145             c = u'\n';
   1146             charIdx++;
   1147         }
   1148         if (c == u'\n' || c == u'\r') {
   1149             lineNum++;
   1150             colStart = charIdx;
   1151         }
   1152         column = charIdx - colStart + 1;
   1153 
   1154         switch (parseState) {
   1155         case PARSE_COMMENT:
   1156             if (c == u'\n' || c == u'\r') {
   1157                 parseState = savedState;
   1158             }
   1159             break;
   1160 
   1161         case PARSE_TAG:
   1162             {
   1163             if (c == u'#') {
   1164                 parseState = PARSE_COMMENT;
   1165                 savedState = PARSE_TAG;
   1166                 break;
   1167             }
   1168             if (u_isUWhiteSpace(c)) {
   1169                 break;
   1170             }
   1171             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
   1172                 delete tp.bi;
   1173                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1174                 skipTest = false;
   1175                 charIdx += 5;
   1176                 break;
   1177             }
   1178             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
   1179                 delete tp.bi;
   1180                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1181                 skipTest = false;
   1182                 charIdx += 5;
   1183                 break;
   1184             }
   1185             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
   1186                 delete tp.bi;
   1187                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1188                 skipTest = false;
   1189                 charIdx += 5;
   1190                 break;
   1191             }
   1192             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
   1193                 delete tp.bi;
   1194                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1195                 skipTest = false;
   1196                 charIdx += 5;
   1197                 break;
   1198             }
   1199             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
   1200                 delete tp.bi;
   1201                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1202                 charIdx += 6;
   1203                 break;
   1204             }
   1205 
   1206             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
   1207                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
   1208                 charIdx = testString.indexOf(u'>', charIdx) + 1;
   1209                 parseState = PARSE_RULES;
   1210                 rules.remove();
   1211                 rulesFirstLine = lineNum;
   1212                 break;
   1213             }
   1214 
   1215             // <locale  loc_name>
   1216             localeMatcher.reset(testString);
   1217             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1218                 UnicodeString localeName = localeMatcher.group(1, status);
   1219                 char localeName8[100];
   1220                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1221                 locale = Locale::createFromName(localeName8);
   1222                 charIdx += localeMatcher.group(0, status).length() - 1;
   1223                 TEST_ASSERT_SUCCESS(status);
   1224                 break;
   1225             }
   1226             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
   1227                 parseState = PARSE_DATA;
   1228                 charIdx += 5;
   1229                 tp.dataToBreak = "";
   1230                 tp.expectedBreaks->removeAllElements();
   1231                 tp.srcCol ->removeAllElements();
   1232                 tp.srcLine->removeAllElements();
   1233                 break;
   1234             }
   1235 
   1236             errln("line %d: Tag expected in test file.", lineNum);
   1237             parseState = PARSE_COMMENT;
   1238             savedState = PARSE_DATA;
   1239             goto end_test; // Stop the test.
   1240             }
   1241             break;
   1242 
   1243         case PARSE_RULES:
   1244             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
   1245                 charIdx += 7;
   1246                 parseState = PARSE_TAG;
   1247                 delete tp.bi;
   1248                 UParseError pe;
   1249                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
   1250                 skipTest = U_FAILURE(status);
   1251                 if (U_FAILURE(status)) {
   1252                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
   1253                         rulesFirstLine + pe.line - 1, u_errorName(status));
   1254                 }
   1255             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
   1256                 charIdx += 10;
   1257                 parseState = PARSE_TAG;
   1258                 UErrorCode ec = U_ZERO_ERROR;
   1259                 UParseError pe;
   1260                 RuleBasedBreakIterator bi(rules, pe, ec);
   1261                 if (U_SUCCESS(ec)) {
   1262                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
   1263                         rulesFirstLine + pe.line - 1);
   1264                 }
   1265             } else {
   1266                 rules.append(c);
   1267             }
   1268             break;
   1269 
   1270         case PARSE_DATA:
   1271             if (c == u'') {
   1272                 int32_t  breakIdx = tp.dataToBreak.length();
   1273                 tp.expectedBreaks->setSize(breakIdx+1);
   1274                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1275                 tp.srcLine->setSize(breakIdx+1);
   1276                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1277                 tp.srcCol ->setSize(breakIdx+1);
   1278                 tp.srcCol ->setElementAt(column, breakIdx);
   1279                 break;
   1280             }
   1281 
   1282             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
   1283                 // Add final entry to mappings from break location to source file position.
   1284                 //  Need one extra because last break position returned is after the
   1285                 //    last char in the data, not at the last char.
   1286                 tp.srcLine->addElement(lineNum, status);
   1287                 tp.srcCol ->addElement(column, status);
   1288 
   1289                 parseState = PARSE_TAG;
   1290                 charIdx += 6;
   1291 
   1292                 if (!skipTest) {
   1293                     // RUN THE TEST!
   1294                     status = U_ZERO_ERROR;
   1295                     tp.setUTF16(status);
   1296                     executeTest(&tp, status);
   1297                     TEST_ASSERT_SUCCESS(status);
   1298 
   1299                     // Run again, this time with UTF-8 text wrapped in a UText.
   1300                     status = U_ZERO_ERROR;
   1301                     tp.setUTF8(status);
   1302                     TEST_ASSERT_SUCCESS(status);
   1303                     executeTest(&tp, status);
   1304                 }
   1305                 break;
   1306             }
   1307 
   1308             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
   1309                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1310                 // Get the code point from the name and insert it into the test data.
   1311                 //   (Damn, no API takes names in Unicode  !!!
   1312                 //    we've got to take it back to char *)
   1313                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
   1314                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1315                 char charNameBuf[200];
   1316                 UChar32 theChar = -1;
   1317                 if (nameEndIdx != -1) {
   1318                     UErrorCode status = U_ZERO_ERROR;
   1319                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1320                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1321                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1322                     if (U_FAILURE(status)) {
   1323                         theChar = -1;
   1324                     }
   1325                 }
   1326                 if (theChar == -1) {
   1327                     errln("Error in named character in test file at line %d, col %d",
   1328                         lineNum, column);
   1329                 } else {
   1330                     // Named code point was recognized.  Insert it
   1331                     //   into the test data.
   1332                     tp.dataToBreak.append(theChar);
   1333                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1334                         tp.srcLine->addElement(lineNum, status);
   1335                         tp.srcCol ->addElement(column, status);
   1336                     }
   1337                 }
   1338                 if (nameEndIdx > charIdx) {
   1339                     charIdx = nameEndIdx+1;
   1340 
   1341                 }
   1342                 break;
   1343             }
   1344 
   1345 
   1346 
   1347             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
   1348                 charIdx++;
   1349                 int32_t  breakIdx = tp.dataToBreak.length();
   1350                 tp.expectedBreaks->setSize(breakIdx+1);
   1351                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1352                 tp.srcLine->setSize(breakIdx+1);
   1353                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1354                 tp.srcCol ->setSize(breakIdx+1);
   1355                 tp.srcCol ->setElementAt(column, breakIdx);
   1356                 break;
   1357             }
   1358 
   1359             if (c == u'<') {
   1360                 tagValue   = 0;
   1361                 parseState = PARSE_NUM;
   1362                 break;
   1363             }
   1364 
   1365             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
   1366                 parseState = PARSE_COMMENT;
   1367                 savedState = PARSE_DATA;
   1368                 break;
   1369             }
   1370 
   1371             if (c == u'\\') {
   1372                 // Check for \ at end of line, a line continuation.
   1373                 //     Advance over (discard) the newline
   1374                 UChar32 cp = testString.char32At(charIdx);
   1375                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
   1376                     // We have a CR LF
   1377                     //  Need an extra increment of the input ptr to move over both of them
   1378                     charIdx++;
   1379                 }
   1380                 if (cp == u'\n' || cp == u'\r') {
   1381                     lineNum++;
   1382                     colStart = charIdx;
   1383                     charIdx++;
   1384                     break;
   1385                 }
   1386 
   1387                 // Let unescape handle the back slash.
   1388                 cp = testString.unescapeAt(charIdx);
   1389                 if (cp != -1) {
   1390                     // Escape sequence was recognized.  Insert the char
   1391                     //   into the test data.
   1392                     tp.dataToBreak.append(cp);
   1393                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1394                         tp.srcLine->addElement(lineNum, status);
   1395                         tp.srcCol ->addElement(column, status);
   1396                     }
   1397                     break;
   1398                 }
   1399 
   1400 
   1401                 // Not a recognized backslash escape sequence.
   1402                 // Take the next char as a literal.
   1403                 //  TODO:  Should this be an error?
   1404                 c = testString.charAt(charIdx);
   1405                 charIdx = testString.moveIndex32(charIdx, 1);
   1406             }
   1407 
   1408             // Normal, non-escaped data char.
   1409             tp.dataToBreak.append(c);
   1410 
   1411             // Save the mapping from offset in the data to line/column numbers in
   1412             //   the original input file.  Will be used for better error messages only.
   1413             //   If there's an expected break before this char, the slot in the mapping
   1414             //     vector will already be set for this char; don't overwrite it.
   1415             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1416                 tp.srcLine->addElement(lineNum, status);
   1417                 tp.srcCol ->addElement(column, status);
   1418             }
   1419             break;
   1420 
   1421 
   1422         case PARSE_NUM:
   1423             // We are parsing an expected numeric tag value, like <1234>,
   1424             //   within a chunk of data.
   1425             if (u_isUWhiteSpace(c)) {
   1426                 break;
   1427             }
   1428 
   1429             if (c == u'>') {
   1430                 // Finished the number.  Add the info to the expected break data,
   1431                 //   and switch parse state back to doing plain data.
   1432                 parseState = PARSE_DATA;
   1433                 if (tagValue == 0) {
   1434                     tagValue = -1;
   1435                 }
   1436                 int32_t  breakIdx = tp.dataToBreak.length();
   1437                 tp.expectedBreaks->setSize(breakIdx+1);
   1438                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1439                 tp.srcLine->setSize(breakIdx+1);
   1440                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1441                 tp.srcCol ->setSize(breakIdx+1);
   1442                 tp.srcCol ->setElementAt(column, breakIdx);
   1443                 break;
   1444             }
   1445 
   1446             if (u_isdigit(c)) {
   1447                 tagValue = tagValue*10 + u_charDigitValue(c);
   1448                 break;
   1449             }
   1450 
   1451             errln("Syntax Error in test file at line %d, col %d",
   1452                 lineNum, column);
   1453             parseState = PARSE_COMMENT;
   1454             goto end_test; // Stop the test
   1455             break;
   1456         }
   1457 
   1458 
   1459         if (U_FAILURE(status)) {
   1460             errln("ICU Error %s while parsing test file at line %d.",
   1461                 u_errorName(status), lineNum);
   1462             status = U_ZERO_ERROR;
   1463             goto end_test; // Stop the test
   1464         }
   1465 
   1466     }
   1467 
   1468     // Reached end of test file. Raise an error if parseState indicates that we are
   1469     //   within a block that should have been terminated.
   1470 
   1471     if (parseState == PARSE_RULES) {
   1472         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
   1473             lineNum, rulesFirstLine);
   1474     }
   1475     if (parseState == PARSE_DATA) {
   1476         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
   1477     }
   1478 
   1479 
   1480 end_test:
   1481     delete [] testFile;
   1482 #endif
   1483 }
   1484 
   1485 
   1486 //-------------------------------------------------------------------------------
   1487 //
   1488 //  TestDictRules   create a break iterator from source rules that includes a
   1489 //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
   1491 //                  but the dictionary code, without a type, would loop.
   1492 //
   1493 //-------------------------------------------------------------------------------
   1494 void RBBITest::TestDictRules() {
   1495     const char *rules =  "$dictionary = [a-z]; \n"
   1496                          "!!forward; \n"
   1497                          "$dictionary $dictionary; \n"
   1498                          "!!reverse; \n"
   1499                          "$dictionary $dictionary; \n";
   1500     const char *text = "aa";
   1501     UErrorCode status = U_ZERO_ERROR;
   1502     UParseError parseError;
   1503 
   1504     RuleBasedBreakIterator bi(rules, parseError, status);
   1505     if (U_SUCCESS(status)) {
   1506         UnicodeString utext = text;
   1507         bi.setText(utext);
   1508         int32_t position;
   1509         int32_t loops;
   1510         for (loops = 0; loops<10; loops++) {
   1511             position = bi.next();
   1512             if (position == RuleBasedBreakIterator::DONE) {
   1513                 break;
   1514             }
   1515         }
   1516         TEST_ASSERT(loops == 1);
   1517     } else {
   1518         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1519     }
   1520 }
   1521 
   1522 
   1523 
   1524 //-------------------------------------------------------------------------------
   1525 //
   1526 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1527 //    return the data in one big UChar * buffer, which the caller must delete.
   1528 //
   1529 //    parameters:
   1530 //          fileName:   the name of the file, with no directory part.  The test data directory
   1531 //                      is assumed.
   1532 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1533 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1534 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1535 //                      Pass NULL for the system default encoding.
   1536 //          status
   1537 //    returns:
   1538 //                      The file data, converted to UChar.
   1539 //                      The caller must delete this when done with
   1540 //                           delete [] theBuffer;
   1541 //
   1542 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1543 //           Move this function to some common place.
   1544 //
   1545 //--------------------------------------------------------------------------------
   1546 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1547     UChar       *retPtr  = NULL;
   1548     char        *fileBuf = NULL;
   1549     UConverter* conv     = NULL;
   1550     FILE        *f       = NULL;
   1551 
   1552     ulen = 0;
   1553     if (U_FAILURE(status)) {
   1554         return retPtr;
   1555     }
   1556 
   1557     //
   1558     //  Open the file.
   1559     //
   1560     f = fopen(fileName, "rb");
   1561     if (f == 0) {
   1562         dataerrln("Error opening test data file %s\n", fileName);
   1563         status = U_FILE_ACCESS_ERROR;
   1564         return NULL;
   1565     }
   1566     //
   1567     //  Read it in
   1568     //
   1569     int   fileSize;
   1570     int   amt_read;
   1571 
   1572     fseek( f, 0, SEEK_END);
   1573     fileSize = ftell(f);
   1574     fileBuf = new char[fileSize];
   1575     fseek(f, 0, SEEK_SET);
   1576     amt_read = fread(fileBuf, 1, fileSize, f);
   1577     if (amt_read != fileSize || fileSize <= 0) {
   1578         errln("Error reading test data file.");
   1579         goto cleanUpAndReturn;
   1580     }
   1581 
   1582     //
   1583     // Look for a Unicode Signature (BOM) on the data just read
   1584     //
   1585     int32_t        signatureLength;
   1586     const char *   fileBufC;
   1587     const char*    bomEncoding;
   1588 
   1589     fileBufC = fileBuf;
   1590     bomEncoding = ucnv_detectUnicodeSignature(
   1591         fileBuf, fileSize, &signatureLength, &status);
   1592     if(bomEncoding!=NULL ){
   1593         fileBufC  += signatureLength;
   1594         fileSize  -= signatureLength;
   1595         encoding = bomEncoding;
   1596     }
   1597 
   1598     //
   1599     // Open a converter to take the rule file to UTF-16
   1600     //
   1601     conv = ucnv_open(encoding, &status);
   1602     if (U_FAILURE(status)) {
   1603         goto cleanUpAndReturn;
   1604     }
   1605 
   1606     //
   1607     // Convert the rules to UChar.
   1608     //  Preflight first to determine required buffer size.
   1609     //
   1610     ulen = ucnv_toUChars(conv,
   1611         NULL,           //  dest,
   1612         0,              //  destCapacity,
   1613         fileBufC,
   1614         fileSize,
   1615         &status);
   1616     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1617         // Buffer Overflow is expected from the preflight operation.
   1618         status = U_ZERO_ERROR;
   1619 
   1620         retPtr = new UChar[ulen+1];
   1621         ucnv_toUChars(conv,
   1622             retPtr,       //  dest,
   1623             ulen+1,
   1624             fileBufC,
   1625             fileSize,
   1626             &status);
   1627     }
   1628 
   1629 cleanUpAndReturn:
   1630     fclose(f);
   1631     delete []fileBuf;
   1632     ucnv_close(conv);
   1633     if (U_FAILURE(status)) {
   1634         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1635         delete []retPtr;
   1636         retPtr = 0;
   1637         ulen   = 0;
   1638     };
   1639     return retPtr;
   1640 }
   1641 
   1642 
   1643 
   1644 //--------------------------------------------------------------------------------------------
   1645 //
   1646 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1647 //
   1648 //-------------------------------------------------------------------------------------------
   1649 void RBBITest::TestUnicodeFiles() {
   1650     RuleBasedBreakIterator  *bi;
   1651     UErrorCode               status = U_ZERO_ERROR;
   1652 
   1653     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1654     TEST_ASSERT_SUCCESS(status);
   1655     if (U_SUCCESS(status)) {
   1656         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1657     }
   1658     delete bi;
   1659 
   1660     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1661     TEST_ASSERT_SUCCESS(status);
   1662     if (U_SUCCESS(status)) {
   1663         runUnicodeTestData("WordBreakTest.txt", bi);
   1664     }
   1665     delete bi;
   1666 
   1667     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1668     TEST_ASSERT_SUCCESS(status);
   1669     if (U_SUCCESS(status)) {
   1670         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1671     }
   1672     delete bi;
   1673 
   1674     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1675     TEST_ASSERT_SUCCESS(status);
   1676     if (U_SUCCESS(status)) {
   1677         runUnicodeTestData("LineBreakTest.txt", bi);
   1678     }
   1679     delete bi;
   1680 }
   1681 
   1682 
   1683 // Check for test cases from the Unicode test data files that are known to fail
   1684 // and should be skipped because ICU is not yet able to fully implement the spec.
   1685 // See ticket #7270.
   1686 
   1687 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
   1688     static struct TestCase {
   1689         const char *fFileName;
   1690         const UChar *fString;
   1691     } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
   1692         {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
   1693         {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
   1694         {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
   1695         {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
   1696         {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
   1697         {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
   1698                                                         // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
   1699         {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
   1700         {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
   1701         {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
   1702 
   1703                                                         // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
   1704         {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
   1705         {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
   1706     };
   1707 
   1708     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
   1709         const TestCase &badCase = badTestCases[n];
   1710         if (!strcmp(fileName, badCase.fFileName) &&
   1711                 testCase == UnicodeString(badCase.fString)) {
   1712             return logKnownIssue("7270");
   1713         }
   1714     }
   1715     return FALSE;
   1716 }
   1717 
   1718 
   1719 //--------------------------------------------------------------------------------------------
   1720 //
   1721 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1722 //
   1723 //-------------------------------------------------------------------------------------------
   1724 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1725 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1726     UErrorCode  status = U_ZERO_ERROR;
   1727 
   1728     //
   1729     //  Open and read the test data file, put it into a UnicodeString.
   1730     //
   1731     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1732     char testFileName[1000];
   1733     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1734         dataerrln("Can't open test data.  Path too long.");
   1735         return;
   1736     }
   1737     strcpy(testFileName, testDataDirectory);
   1738     strcat(testFileName, fileName);
   1739 
   1740     logln("Opening data file %s\n", fileName);
   1741 
   1742     int    len;
   1743     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1744     if (status != U_FILE_ACCESS_ERROR) {
   1745         TEST_ASSERT_SUCCESS(status);
   1746         TEST_ASSERT(testFile != NULL);
   1747     }
   1748     if (U_FAILURE(status) || testFile == NULL) {
   1749         return; /* something went wrong, error already output */
   1750     }
   1751     UnicodeString testFileAsString(TRUE, testFile, len);
   1752 
   1753     //
   1754     //  Parse the test data file using a regular expression.
   1755     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1756     //     is identified by which group had a match.
   1757     //
   1758     //    Caputure Group #                  1          2            3            4           5
   1759     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1760     //
   1761     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1762     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1763     UnicodeString   testString;
   1764     UVector32       breakPositions(status);
   1765     int             lineNumber = 1;
   1766     TEST_ASSERT_SUCCESS(status);
   1767     if (U_FAILURE(status)) {
   1768         return;
   1769     }
   1770 
   1771     //
   1772     //  Scan through each test case, building up the string to be broken in testString,
   1773     //   and the positions that should be boundaries in the breakPositions vector.
   1774     //
   1775     int spin = 0;
   1776     while (tokenMatcher.find()) {
   1777       	if(tokenMatcher.hitEnd()) {
   1778           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1779              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1780              and caused an infinite loop here on EBCDIC systems!
   1781           */
   1782           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1783           //	   return;
   1784       	}
   1785         if (tokenMatcher.start(1, status) >= 0) {
   1786             // Scanned a divide sign, indicating a break position in the test data.
   1787             if (testString.length()>0) {
   1788                 breakPositions.addElement(testString.length(), status);
   1789             }
   1790         }
   1791         else if (tokenMatcher.start(2, status) >= 0) {
   1792             // Scanned an 'x', meaning no break at this position in the test data
   1793             //   Nothing to be done here.
   1794             }
   1795         else if (tokenMatcher.start(3, status) >= 0) {
   1796             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1797             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1798             int length = hexNumber.length();
   1799             if (length<=8) {
   1800                 char buf[10];
   1801                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1802                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1803                 if (c<=0x10ffff) {
   1804                     testString.append(c);
   1805                 } else {
   1806                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1807                        fileName, lineNumber);
   1808                 }
   1809             } else {
   1810                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1811                        fileName, lineNumber);
   1812              }
   1813         }
   1814         else if (tokenMatcher.start(4, status) >= 0) {
   1815             // Scanned to end of a line, possibly skipping over a comment in the process.
   1816             //   If the line from the file contained test data, run the test now.
   1817             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
   1818                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1819             }
   1820 
   1821             // Clear out this test case.
   1822             //    The string and breakPositions vector will be refilled as the next
   1823             //       test case is parsed.
   1824             testString.remove();
   1825             breakPositions.removeAllElements();
   1826             lineNumber++;
   1827         } else {
   1828             // Scanner catchall.  Something unrecognized appeared on the line.
   1829             char token[16];
   1830             UnicodeString uToken = tokenMatcher.group(0, status);
   1831             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1832             token[sizeof(token)-1] = 0;
   1833             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1834 
   1835             // Clean up, in preparation for continuing with the next line.
   1836             testString.remove();
   1837             breakPositions.removeAllElements();
   1838             lineNumber++;
   1839         }
   1840         TEST_ASSERT_SUCCESS(status);
   1841         if (U_FAILURE(status)) {
   1842             break;
   1843         }
   1844     }
   1845 
   1846     delete [] testFile;
   1847  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1848 }
   1849 
   1850 //--------------------------------------------------------------------------------------------
   1851 //
   1852 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1853 //                            test data files.  Do only a simple, forward-only check -
   1854 //                            this test is mostly to check that ICU and the Unicode
   1855 //                            data agree with each other.
   1856 //
   1857 //--------------------------------------------------------------------------------------------
   1858 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1859                          const UnicodeString &testString,   // Text data to be broken
   1860                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1861                          RuleBasedBreakIterator *bi) {
   1862     int32_t pos;                 // Break Position in the test string
   1863     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1864     int32_t expectedPos;         // Expected break position (index into test string)
   1865 
   1866     bi->setText(testString);
   1867     pos = bi->first();
   1868     pos = bi->next();
   1869 
   1870     while (pos != BreakIterator::DONE) {
   1871         if (expectedI >= breakPositions->size()) {
   1872             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1873                 testFileName, lineNumber, pos);
   1874             break;
   1875         }
   1876         expectedPos = breakPositions->elementAti(expectedI);
   1877         if (pos < expectedPos) {
   1878             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1879                 testFileName, lineNumber, pos);
   1880             break;
   1881         }
   1882         if (pos > expectedPos) {
   1883             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1884                 testFileName, lineNumber, expectedPos);
   1885             break;
   1886         }
   1887         pos = bi->next();
   1888         expectedI++;
   1889     }
   1890 
   1891     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1892         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1893             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1894     }
   1895 }
   1896 
   1897 
   1898 
   1899 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1900 //---------------------------------------------------------------------------------------
   1901 //
//   class RBBIMonkeyKind
   1903 //
   1904 //      Monkey Test for Break Iteration
   1905 //      Abstract interface class.   Concrete derived classes independently
   1906 //      implement the break rules for different iterator types.
   1907 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
   1910 //
   1911 //---------------------------------------------------------------------------------------
   1912 class RBBIMonkeyKind {
   1913 public:
   1914     // Return a UVector of UnicodeSets, representing the character classes used
   1915     //   for this type of iterator.
   1916     virtual  UVector  *charClasses() = 0;
   1917 
   1918     // Set the test text on which subsequent calls to next() will operate
   1919     virtual  void      setText(const UnicodeString &s) = 0;
   1920 
   1921     // Find the next break postion, starting from the prev break position, or from zero.
   1922     // Return -1 after reaching end of string.
   1923     virtual  int32_t   next(int32_t i) = 0;
   1924 
   1925     virtual ~RBBIMonkeyKind();
   1926     UErrorCode       deferredStatus;
   1927 
   1928 
   1929 protected:
   1930     RBBIMonkeyKind();
   1931 
   1932 private:
   1933 };
   1934 
   1935 RBBIMonkeyKind::RBBIMonkeyKind() {
   1936     deferredStatus = U_ZERO_ERROR;
   1937 }
   1938 
   1939 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1940 }
   1941 
   1942 
   1943 //----------------------------------------------------------------------------------------
   1944 //
   1945 //   Random Numbers.  Similar to standard lib rand() and srand()
   1946 //                    Not using library to
   1947 //                      1.  Get same results on all platforms.
   1948 //                      2.  Get access to current seed, to more easily reproduce failures.
   1949 //
   1950 //---------------------------------------------------------------------------------------
   1951 static uint32_t m_seed = 1;
   1952 
   1953 static uint32_t m_rand()
   1954 {
   1955     m_seed = m_seed * 1103515245 + 12345;
   1956     return (uint32_t)(m_seed/65536) % 32768;
   1957 }
   1958 
   1959 
   1960 //
   1961 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
   1962 //
// UnicodeSet pattern text, handed to the UnicodeSet constructor in
// RBBICharMonkey's constructor.  The u prefix on the first piece makes the whole
// concatenated literal a char16_t string.  Every backslash is doubled, so the
// \UXXXXXXXX escapes reach the pattern parser as text instead of being expanded
// by the compiler.  Each item is a single code point or an inclusive range.
static const char16_t *gExtended_Pict = u"["
    "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
    "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
    "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
    "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
    "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
    "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
    "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
    "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
    "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
    "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
    "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
    "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
    "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
    "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
    "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
    "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
    "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
    "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
    "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
    "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
    "]";
   1987 
   1988 //------------------------------------------------------------------------------------------
   1989 //
   1990 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1991 //                             of RBBIMonkeyKind.
   1992 //
   1993 //------------------------------------------------------------------------------------------
   1994 class RBBICharMonkey: public RBBIMonkeyKind {
   1995 public:
   1996     RBBICharMonkey();
   1997     virtual          ~RBBICharMonkey();
   1998     virtual  UVector *charClasses();
   1999     virtual  void     setText(const UnicodeString &s);
   2000     virtual  int32_t  next(int32_t i);
   2001 private:
   2002     UVector   *fSets;
   2003 
   2004     UnicodeSet  *fCRLFSet;
   2005     UnicodeSet  *fControlSet;
   2006     UnicodeSet  *fExtendSet;
   2007     UnicodeSet  *fZWJSet;
   2008     UnicodeSet  *fRegionalIndicatorSet;
   2009     UnicodeSet  *fPrependSet;
   2010     UnicodeSet  *fSpacingSet;
   2011     UnicodeSet  *fLSet;
   2012     UnicodeSet  *fVSet;
   2013     UnicodeSet  *fTSet;
   2014     UnicodeSet  *fLVSet;
   2015     UnicodeSet  *fLVTSet;
   2016     UnicodeSet  *fHangulSet;
   2017     UnicodeSet  *fEmojiBaseSet;
   2018     UnicodeSet  *fEmojiModifierSet;
   2019     UnicodeSet  *fExtendedPictSet;
   2020     UnicodeSet  *fEBGSet;
   2021     UnicodeSet  *fEmojiNRKSet;
   2022     UnicodeSet  *fAnySet;
   2023 
   2024     const UnicodeString *fText;
   2025 };
   2026 
   2027 
   2028 RBBICharMonkey::RBBICharMonkey() {
   2029     UErrorCode  status = U_ZERO_ERROR;
   2030 
   2031     fText = NULL;
   2032 
   2033     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2034     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
   2035     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
   2036     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
   2037     fRegionalIndicatorSet =
   2038                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   2039     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2040     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2041     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2042     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2043     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2044     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2045     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2046     fHangulSet  = new UnicodeSet();
   2047     fHangulSet->addAll(*fLSet);
   2048     fHangulSet->addAll(*fVSet);
   2049     fHangulSet->addAll(*fTSet);
   2050     fHangulSet->addAll(*fLVSet);
   2051     fHangulSet->addAll(*fLVTSet);
   2052 
   2053     fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
   2054     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
   2055     fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
   2056     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
   2057     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
   2058                 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
   2059     fAnySet           = new UnicodeSet(0, 0x10ffff);
   2060 
   2061     fSets             = new UVector(status);
   2062     fSets->addElement(fCRLFSet,    status);
   2063     fSets->addElement(fControlSet, status);
   2064     fSets->addElement(fExtendSet,  status);
   2065     fSets->addElement(fRegionalIndicatorSet, status);
   2066     if (!fPrependSet->isEmpty()) {
   2067         fSets->addElement(fPrependSet, status);
   2068     }
   2069     fSets->addElement(fSpacingSet, status);
   2070     fSets->addElement(fHangulSet,  status);
   2071     fSets->addElement(fAnySet,     status);
   2072     fSets->addElement(fEmojiBaseSet, status);
   2073     fSets->addElement(fEmojiModifierSet, status);
   2074     fSets->addElement(fZWJSet,     status);
   2075     fSets->addElement(fExtendedPictSet, status);
   2076     fSets->addElement(fEBGSet,     status);
   2077     fSets->addElement(fEmojiNRKSet,status);
   2078     if (U_FAILURE(status)) {
   2079         deferredStatus = status;
   2080     }
   2081 }
   2082 
   2083 
// Remember the text that next() will examine.  Only the address of s is stored,
// no copy is made, so s has to stay alive and unchanged for as long as next()
// is being called on it.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}
   2087 
   2088 
   2089 
   2090 int32_t RBBICharMonkey::next(int32_t prevPos) {
            // Rule-by-rule computation of the next character (grapheme cluster) break.
            // Scans forward from prevPos and, at each candidate position, tests the
            // rules labelled GB3 through GB999 below in the order written.
            //
            //   prevPos   index in fText of the previous break; scanning starts here.
            //   returns   index of the next break (fText->length() when the end of the
            //             text is reached), or -1 if deferredStatus holds a failure or
            //             prevPos is already at or beyond the end of the text.
            //
            // fText is dereferenced unconditionally, so setText() must have been called.
   2091     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2092                               //   break position being tested.  The candidate break
   2093                               //   location is before p2.
   2094
   2095     int     breakPos = -1;
   2096
   2097     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2098     UChar32 cBase;            // for (X Extend*) patterns, the X character.
   2099
   2100     if (U_FAILURE(deferredStatus)) {
   2101         return -1;
   2102     }
   2103
   2104     // Previous break at end of string.  return DONE.
   2105     if (prevPos >= fText->length()) {
   2106         return -1;
   2107     }
   2108     p0 = p1 = p2 = p3 = prevPos;
   2109     c3 =  fText->char32At(prevPos);
   2110     c0 = c1 = c2 = cBase = 0;
   2111     (void)p0;   // suppress set but not used warning.
   2112     (void)c0;
   2113
   2114     // Loop runs once per "significant" character position in the input text.
            // Inside the loop, 'continue' means "no break before p2, keep scanning" and
            // 'break' means "the break is before p2".
   2115     for (;;) {
   2116         // Move all of the positions forward in the input string.
   2117         p0 = p1;  c0 = c1;
   2118         p1 = p2;  c1 = c2;
   2119         p2 = p3;  c2 = c3;
   2120
   2121         // Advance p3 by one code point
   2122         p3 = fText->moveIndex32(p3, 1);
   2123         c3 = fText->char32At(p3);
   2124
   2125         if (p1 == p2) {
   2126             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2127             continue;
   2128         }
   2129         if (p2 == fText->length()) {
   2130             // Reached end of string.  Always a break position.
   2131             break;
   2132         }
   2133
   2134         // Rule  GB3   CR x LF
   2135         //     No Extend or Format characters may appear between the CR and LF,
   2136         //     which requires the additional check for p2 immediately following p1.
   2137         //
   2138         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2139             continue;
   2140         }
   2141
   2142         // Rule (GB4).   ( Control | CR | LF ) <break>
   2143         if (fControlSet->contains(c1) ||
   2144             c1 == 0x0D ||
   2145             c1 == 0x0A)  {
   2146             break;
   2147         }
   2148
   2149         // Rule (GB5)    <break>  ( Control | CR | LF )
   2150         //
   2151         if (fControlSet->contains(c2) ||
   2152             c2 == 0x0D ||
   2153             c2 == 0x0A)  {
   2154             break;
   2155         }
   2156
   2157
   2158         // Rule (GB6)  L x ( L | V | LV | LVT )
   2159         if (fLSet->contains(c1) &&
   2160                (fLSet->contains(c2)  ||
   2161                 fVSet->contains(c2)  ||
   2162                 fLVSet->contains(c2) ||
   2163                 fLVTSet->contains(c2))) {
   2164             continue;
   2165         }
   2166
   2167         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2168         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2169             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2170             continue;
   2171         }
   2172
   2173         // Rule (GB8)    ( LVT | T)  x T
   2174         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2175             fTSet->contains(c2))  {
   2176             continue;
   2177         }
   2178
   2179         // Rule (GB9)    x (Extend | ZWJ)
   2180         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
                    // When c1 is not itself an Extend character it is recorded in cBase,
                    // so that the GB10 / GB11 checks below can still see it after
                    // later iterations have moved past the Extend characters.
   2181             if (!fExtendSet->contains(c1)) {
   2182                 cBase = c1;
   2183             }
   2184             continue;
   2185         }
   2186
   2187         // Rule (GB9a)   x  SpacingMark
   2188         if (fSpacingSet->contains(c2)) {
   2189             continue;
   2190         }
   2191
   2192         // Rule (GB9b)   Prepend x
   2193         if (fPrependSet->contains(c1)) {
   2194             continue;
   2195         }
   2196
   2197         // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
                //     First test: no Extend in between.  Second test: c1 is an Extend
                //     and the remembered cBase supplies the (Emoji_Base | EBG) character.
   2198         if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
   2199             continue;
   2200         }
   2201         if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
   2202                 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
   2203             continue;
   2204         }
   2205
   2206         // Rule (GB11)   (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
                //     Same two-step shape as GB10: c0 is either the emoji itself or an
                //     Extend, in which case cBase is consulted instead.
   2207         if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
   2208                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
   2209             continue;
   2210         }
   2211         if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
   2212                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
   2213             continue;
   2214         }
   2215
   2216         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
   2217         //                   Note: The first if condition is a little tricky. We only need to force
   2218         //                      a break if there are three or more contiguous RIs. If there are
   2219         //                      only two, a break following will occur via other rules, and will include
   2220         //                      any trailing extend characters, which is needed behavior.
   2221         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
   2222                 && fRegionalIndicatorSet->contains(c2)) {
   2223             break;
   2224         }
   2225         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2226             continue;
   2227         }
   2228
   2229         // Rule (GB999)  Any  <break>  Any
   2230         break;
   2231     }
   2232
            // The loop is only left through 'break', with the break located before p2.
   2233     breakPos = p2;
   2234     return breakPos;
   2235 }
   2236 
   2237 
   2238 
   2239 UVector  *RBBICharMonkey::charClasses() {
   2240     return fSets;
   2241 }
   2242 
   2243 
   2244 RBBICharMonkey::~RBBICharMonkey() {
   2245     delete fSets;
   2246     delete fCRLFSet;
   2247     delete fControlSet;
   2248     delete fExtendSet;
   2249     delete fRegionalIndicatorSet;
   2250     delete fPrependSet;
   2251     delete fSpacingSet;
   2252     delete fLSet;
   2253     delete fVSet;
   2254     delete fTSet;
   2255     delete fLVSet;
   2256     delete fLVTSet;
   2257     delete fHangulSet;
   2258     delete fAnySet;
   2259     delete fEmojiBaseSet;
   2260     delete fEmojiModifierSet;
   2261     delete fZWJSet;
   2262     delete fExtendedPictSet;
   2263     delete fEBGSet;
   2264     delete fEmojiNRKSet;
   2265 }
   2266 
   2267 //------------------------------------------------------------------------------------------
   2268 //
   2269 //   class RBBIWordMonkey      Word Break specific implementation
   2270 //                             of RBBIMonkeyKind.
   2271 //
   2272 //------------------------------------------------------------------------------------------
   2273 class RBBIWordMonkey: public RBBIMonkeyKind {
   2274 public:
   2275     RBBIWordMonkey();                                  // builds all of the sets below; errors go to deferredStatus
   2276     virtual          ~RBBIWordMonkey();                // deletes fSets and every UnicodeSet member
   2277     virtual  UVector *charClasses();                   // returns fSets
   2278     virtual  void     setText(const UnicodeString &s); // stores the address of s in fText (no copy)
   2279     virtual int32_t   next(int32_t i);                 // next word break after i, or -1
   2280 private:
   2281     UVector      *fSets;                 // the character classes returned by charClasses()
   2282
            // One set per Word_Break property value, as built by the constructor.
   2283     UnicodeSet  *fCRSet;                 // Word_Break = CR
   2284     UnicodeSet  *fLFSet;                 // Word_Break = LF
   2285     UnicodeSet  *fNewlineSet;            // Word_Break = Newline
   2286     UnicodeSet  *fRegionalIndicatorSet;  // Word_Break = Regional_Indicator
   2287     UnicodeSet  *fKatakanaSet;           // Word_Break = Katakana (not added to fSets)
   2288     UnicodeSet  *fHebrew_LetterSet;      // Word_Break = Hebrew_Letter
   2289     UnicodeSet  *fALetterSet;            // Word_Break = ALetter, minus fDictionarySet
   2290     UnicodeSet  *fSingle_QuoteSet;       // Word_Break = Single_Quote
   2291     UnicodeSet  *fDouble_QuoteSet;       // Word_Break = Double_Quote
   2292     UnicodeSet  *fMidNumLetSet;          // Word_Break = MidNumLet
   2293     UnicodeSet  *fMidLetterSet;          // Word_Break = MidLetter
   2294     UnicodeSet  *fMidNumSet;             // Word_Break = MidNum
   2295     UnicodeSet  *fNumericSet;            // Word_Break = Numeric
   2296     UnicodeSet  *fFormatSet;             // Word_Break = Format
   2297     UnicodeSet  *fOtherSet;              // complement of the other sets; computed in the constructor
   2298     UnicodeSet  *fExtendSet;             // Word_Break = Extend
   2299     UnicodeSet  *fExtendNumLetSet;       // Word_Break = ExtendNumLet
   2300     UnicodeSet  *fDictionarySet;         // U+AC00..U+D7A3, Han, Hiragana, Katakana, LineBreak = Complex_Context
   2301     UnicodeSet  *fEBaseSet;              // Word_Break = EB
   2302     UnicodeSet  *fEBGSet;                // Word_Break = EBG
   2303     UnicodeSet  *fEModifierSet;          // Word_Break = EM
   2304     UnicodeSet  *fZWJSet;                // Word_Break = ZWJ
   2305     UnicodeSet  *fExtendedPictSet;       // built from the gExtended_Pict pattern
   2306     UnicodeSet  *fEmojiNRKSet;           // Emoji minus Regional_Indicator, * # 0-9 and a few symbols
   2307
   2308     const UnicodeString  *fText;         // text under test; set by setText(), not deleted by the destructor
   2309 };
   2310 
   2311 
   2312 RBBIWordMonkey::RBBIWordMonkey()
   2313 {
   2314     UErrorCode  status = U_ZERO_ERROR;
   2315 
   2316     fSets            = new UVector(status);
   2317 
   2318     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
   2319     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
   2320     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
   2321     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
   2322     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
   2323     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
   2324     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
   2325     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
   2326     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
   2327     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
   2328     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
   2329     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
   2330     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
   2331     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
   2332     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
   2333     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
   2334 
   2335     fEBaseSet         = new UnicodeSet(u"[\\p{Word_Break = EB}]",           status);
   2336     fEBGSet           = new UnicodeSet(u"[\\p{Word_Break = EBG}]",          status);
   2337     fEModifierSet     = new UnicodeSet(u"[\\p{Word_Break = EM}]",           status);
   2338     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
   2339     fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
   2340     fEmojiNRKSet      = new UnicodeSet(
   2341             u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status);
   2342 
   2343     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
   2344     fDictionarySet->addAll(*fKatakanaSet);
   2345     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
   2346 
   2347     fALetterSet->removeAll(*fDictionarySet);
   2348 
   2349     fOtherSet        = new UnicodeSet();
   2350     if(U_FAILURE(status)) {
   2351         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
   2352         deferredStatus = status;
   2353         return;
   2354     }
   2355 
   2356     fOtherSet->complement();
   2357     fOtherSet->removeAll(*fCRSet);
   2358     fOtherSet->removeAll(*fLFSet);
   2359     fOtherSet->removeAll(*fNewlineSet);
   2360     fOtherSet->removeAll(*fKatakanaSet);
   2361     fOtherSet->removeAll(*fHebrew_LetterSet);
   2362     fOtherSet->removeAll(*fALetterSet);
   2363     fOtherSet->removeAll(*fSingle_QuoteSet);
   2364     fOtherSet->removeAll(*fDouble_QuoteSet);
   2365     fOtherSet->removeAll(*fMidLetterSet);
   2366     fOtherSet->removeAll(*fMidNumSet);
   2367     fOtherSet->removeAll(*fNumericSet);
   2368     fOtherSet->removeAll(*fExtendNumLetSet);
   2369     fOtherSet->removeAll(*fFormatSet);
   2370     fOtherSet->removeAll(*fExtendSet);
   2371     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2372     fOtherSet->removeAll(*fEBaseSet);
   2373     fOtherSet->removeAll(*fEBGSet);
   2374     fOtherSet->removeAll(*fEModifierSet);
   2375     fOtherSet->removeAll(*fZWJSet);
   2376     fOtherSet->removeAll(*fExtendedPictSet);
   2377     fOtherSet->removeAll(*fEmojiNRKSet);
   2378 
   2379     // Inhibit dictionary characters from being tested at all.
   2380     fOtherSet->removeAll(*fDictionarySet);
   2381 
   2382     fSets->addElement(fCRSet,                status);
   2383     fSets->addElement(fLFSet,                status);
   2384     fSets->addElement(fNewlineSet,           status);
   2385     fSets->addElement(fRegionalIndicatorSet, status);
   2386     fSets->addElement(fHebrew_LetterSet,     status);
   2387     fSets->addElement(fALetterSet,           status);
   2388     fSets->addElement(fSingle_QuoteSet,      status);
   2389     fSets->addElement(fDouble_QuoteSet,      status);
   2390     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
   2391                                                         // from the test data. They are all in the dictionary set,
   2392                                                         // which this (old, to be retired) monkey test cannot handle.
   2393     fSets->addElement(fMidLetterSet,         status);
   2394     fSets->addElement(fMidNumLetSet,         status);
   2395     fSets->addElement(fMidNumSet,            status);
   2396     fSets->addElement(fNumericSet,           status);
   2397     fSets->addElement(fFormatSet,            status);
   2398     fSets->addElement(fExtendSet,            status);
   2399     fSets->addElement(fOtherSet,             status);
   2400     fSets->addElement(fExtendNumLetSet,      status);
   2401 
   2402     fSets->addElement(fEBaseSet,             status);
   2403     fSets->addElement(fEBGSet,               status);
   2404     fSets->addElement(fEModifierSet,         status);
   2405     fSets->addElement(fZWJSet,               status);
   2406     fSets->addElement(fExtendedPictSet,      status);
   2407     fSets->addElement(fEmojiNRKSet,          status);
   2408 
   2409     if (U_FAILURE(status)) {
   2410         deferredStatus = status;
   2411     }
   2412 }
   2413 
   2414 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2415     fText       = &s;
   2416 }
   2417 
   2418 
   2419 int32_t RBBIWordMonkey::next(int32_t prevPos) {
            // Rule-by-rule computation of the next word break.  Scans forward from
            // prevPos and, at each candidate position, tests the rules labelled
            // (3) through 999 below in the order written.
            //
            //   prevPos   index in fText of the previous break; scanning starts here.
            //   returns   index of the next break (fText->length() when the end of the
            //             text is reached), or -1 if deferredStatus holds a failure or
            //             prevPos is already at or beyond the end of the text.
            //
            // fText is dereferenced unconditionally, so setText() must have been called.
   2420     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2421                               //   break position being tested.  The candidate break
   2422                               //   location is before p2.
   2423
   2424     int     breakPos = -1;
   2425
   2426     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2427
   2428     if (U_FAILURE(deferredStatus)) {
   2429         return -1;
   2430     }
   2431
   2432     // Prev break at end of string.  return DONE.
   2433     if (prevPos >= fText->length()) {
   2434         return -1;
   2435     }
   2436     p0 = p1 = p2 = p3 = prevPos;
   2437     c3 =  fText->char32At(prevPos);
   2438     c0 = c1 = c2 = 0;
   2439     (void)p0;       // Suppress set but not used warning.
   2440
   2441     // Loop runs once per "significant" character position in the input text.
            // Inside the loop, 'continue' means "no break before p2, keep scanning" and
            // 'break' means "the break is before p2".
   2442     for (;;) {
   2443         // Move all of the positions forward in the input string.
   2444         p0 = p1;  c0 = c1;
   2445         p1 = p2;  c1 = c2;
   2446         p2 = p3;  c2 = c3;
   2447
   2448         // Advance p3 by    X(Extend | Format)*   Rule 4
   2449         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
                //    ZWJ is skipped over in the same way as Extend and Format.
   2450         do {
   2451             p3 = fText->moveIndex32(p3, 1);
   2452             c3 = fText->char32At(p3);
   2453             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2454                break;
   2455             };
   2456         }
   2457         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
   2458
   2459
   2460         if (p1 == p2) {
   2461             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2462             continue;
   2463         }
   2464         if (p2 == fText->length()) {
   2465             // Reached end of string.  Always a break position.
   2466             break;
   2467         }
   2468
   2469         // Rule  (3)   CR x LF
   2470         //     No Extend or Format characters may appear between the CR and LF.
   2471         //     There is no explicit "p2 immediately follows p1" check here: the loop above
                //     stops advancing p3 as soon as c2 is a CR, so nothing is skipped after a CR.
   2472         //
   2473         if (c1==0x0D && c2==0x0A) {
   2474             continue;
   2475         }
   2476
   2477         // Rule (3a)  Break before and after newlines (including CR and LF)
   2478         //
   2479         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2480             break;
   2481         };
   2482         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2483             break;
   2484         };
   2485
   2486         // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
   2487         //              Not ignoring extend chars, so peek into input text to
   2488         //              get the potential ZWJ, the character immediately preceding c2.
   2489         //              Sloppy UChar32 indexing: p2-1 may reference trail half
   2490         //              but char32At will get the full code point.
   2491         if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
   2492             continue;
   2493         }
   2494
   2495         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
   2496         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2497             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2498             continue;
   2499         }
   2500
   2501         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
   2502         //
   2503         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
   2504              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
   2505              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
   2506             continue;
   2507         }
   2508
   2509         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
   2510         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
   2511             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
   2512             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
   2513             continue;
   2514         }
   2515
   2516         // Rule (7a)     Hebrew_Letter x Single_Quote
   2517         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
   2518             continue;
   2519         }
   2520
   2521         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
   2522         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
   2523             continue;
   2524         }
   2525
   2526         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
   2527         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
   2528             continue;
   2529         }
   2530
   2531         // Rule (8)    Numeric x Numeric
   2532         if (fNumericSet->contains(c1) &&
   2533             fNumericSet->contains(c2))  {
   2534             continue;
   2535         }
   2536
   2537         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
   2538         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2539             fNumericSet->contains(c2))  {
   2540             continue;
   2541         }
   2542
   2543         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
   2544         if (fNumericSet->contains(c1) &&
   2545             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2546             continue;
   2547         }
   2548
   2549         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
   2550         if (fNumericSet->contains(c0) &&
   2551             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
   2552             fNumericSet->contains(c2)) {
   2553             continue;
   2554         }
   2555
   2556         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
   2557         if (fNumericSet->contains(c1) &&
   2558             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
   2559             fNumericSet->contains(c3)) {
   2560             continue;
   2561         }
   2562
   2563         // Rule (13)  Katakana x Katakana
   2564         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
   2565         //                  all Katakana are handled by the dictionary breaker.
   2566         if (fKatakanaSet->contains(c1) &&
   2567             fKatakanaSet->contains(c2))  {
   2568             continue;
   2569         }
   2570
   2571         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
   2572         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
   2573              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2574              fExtendNumLetSet->contains(c2)) {
   2575                 continue;
   2576         }
   2577
   2578         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
   2579         if (fExtendNumLetSet->contains(c1) &&
   2580                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
   2581                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
   2582             continue;
   2583         }
   2584
   2585         // WB 14  (E_Base | EBG) x E_Modifier
   2586         if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
   2587             continue;
   2588         }
   2589
   2590         // Rule 15 - 17   Group pairs of Regional Indicators.
                //     If c0 and c1 are both Regional Indicators the pair is complete, so
                //     break before c2 whatever it is; otherwise join c1 and c2 into a pair.
   2591         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
   2592             break;
   2593         }
   2594         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2595             continue;
   2596         }
   2597
   2598         // Rule 999.  Break found here.
   2599         break;
   2600     }
   2601
            // The loop is only left through 'break', with the break located before p2.
   2602     breakPos = p2;
   2603     return breakPos;
   2604 }
   2605 
   2606 
   2607 UVector  *RBBIWordMonkey::charClasses() {
   2608     return fSets;
   2609 }
   2610 
   2611 
   2612 RBBIWordMonkey::~RBBIWordMonkey() {
   2613     delete fSets;
   2614     delete fCRSet;
   2615     delete fLFSet;
   2616     delete fNewlineSet;
   2617     delete fKatakanaSet;
   2618     delete fHebrew_LetterSet;
   2619     delete fALetterSet;
   2620     delete fSingle_QuoteSet;
   2621     delete fDouble_QuoteSet;
   2622     delete fMidNumLetSet;
   2623     delete fMidLetterSet;
   2624     delete fMidNumSet;
   2625     delete fNumericSet;
   2626     delete fFormatSet;
   2627     delete fExtendSet;
   2628     delete fExtendNumLetSet;
   2629     delete fRegionalIndicatorSet;
   2630     delete fDictionarySet;
   2631     delete fOtherSet;
   2632     delete fEBaseSet;
   2633     delete fEBGSet;
   2634     delete fEModifierSet;
   2635     delete fZWJSet;
   2636     delete fExtendedPictSet;
   2637     delete fEmojiNRKSet;
   2638 }
   2639 
   2640 
   2641 
   2642 
   2643 //------------------------------------------------------------------------------------------
   2644 //
   2645 //   class RBBISentMonkey      Sentence Break specific implementation
   2646 //                             of RBBIMonkeyKind.
   2647 //
   2648 //------------------------------------------------------------------------------------------
   2649 class RBBISentMonkey: public RBBIMonkeyKind {
   2650 public:
   2651     RBBISentMonkey();                                  // builds all of the sets below; errors go to deferredStatus
   2652     virtual          ~RBBISentMonkey();
   2653     virtual  UVector *charClasses();                   // returns fSets
   2654     virtual  void     setText(const UnicodeString &s); // stores the address of s in fText (no copy)
   2655     virtual int32_t   next(int32_t i);                 // next sentence break after i, or -1
   2656 private:
   2657     int               moveBack(int posFrom);           // index of the preceding code point, skipping Extend / Format; -1 if posFrom <= 0
   2658     int               moveForward(int posFrom);        // index of the following code point, skipping Extend / Format
   2659     UChar32           cAt(int pos);                    // code point at pos, or -1 when pos is outside the text
   2660
   2661     UVector      *fSets;         // the character classes returned by charClasses()
   2662
            // One set per Sentence_Break property value, as built by the constructor.
   2663     UnicodeSet  *fSepSet;        // Sentence_Break = Sep, plus U+000A and U+000D
   2664     UnicodeSet  *fFormatSet;     // Sentence_Break = Format
   2665     UnicodeSet  *fSpSet;         // Sentence_Break = Sp
   2666     UnicodeSet  *fLowerSet;      // Sentence_Break = Lower
   2667     UnicodeSet  *fUpperSet;      // Sentence_Break = Upper
   2668     UnicodeSet  *fOLetterSet;    // Sentence_Break = OLetter
   2669     UnicodeSet  *fNumericSet;    // Sentence_Break = Numeric
   2670     UnicodeSet  *fATermSet;      // Sentence_Break = ATerm
   2671     UnicodeSet  *fSContinueSet;  // Sentence_Break = SContinue
   2672     UnicodeSet  *fSTermSet;      // Sentence_Break = STerm
   2673     UnicodeSet  *fCloseSet;      // Sentence_Break = Close
   2674     UnicodeSet  *fOtherSet;      // complement of all the other sets; computed in the constructor
   2675     UnicodeSet  *fExtendSet;     // Sentence_Break = Extend
   2676
   2677     const UnicodeString  *fText; // text under test; set by setText()
   2678
   2679 };
   2680 
   2681 RBBISentMonkey::RBBISentMonkey()
   2682 {
   2683     UErrorCode  status = U_ZERO_ERROR;
   2684 
   2685     fSets            = new UVector(status);
   2686 
   2687     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2688     //                       set and made into character classes of their own.  For the monkey impl,
   2689     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2690     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2691     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2692     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2693     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2694     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2695     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2696     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2697     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2698     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2699     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2700     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2701     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2702     fOtherSet        = new UnicodeSet();
   2703 
   2704     if(U_FAILURE(status)) {
   2705       deferredStatus = status;
   2706       return;
   2707     }
   2708 
   2709     fOtherSet->complement();
   2710     fOtherSet->removeAll(*fSepSet);
   2711     fOtherSet->removeAll(*fFormatSet);
   2712     fOtherSet->removeAll(*fSpSet);
   2713     fOtherSet->removeAll(*fLowerSet);
   2714     fOtherSet->removeAll(*fUpperSet);
   2715     fOtherSet->removeAll(*fOLetterSet);
   2716     fOtherSet->removeAll(*fNumericSet);
   2717     fOtherSet->removeAll(*fATermSet);
   2718     fOtherSet->removeAll(*fSContinueSet);
   2719     fOtherSet->removeAll(*fSTermSet);
   2720     fOtherSet->removeAll(*fCloseSet);
   2721     fOtherSet->removeAll(*fExtendSet);
   2722 
   2723     fSets->addElement(fSepSet,       status);
   2724     fSets->addElement(fFormatSet,    status);
   2725     fSets->addElement(fSpSet,        status);
   2726     fSets->addElement(fLowerSet,     status);
   2727     fSets->addElement(fUpperSet,     status);
   2728     fSets->addElement(fOLetterSet,   status);
   2729     fSets->addElement(fNumericSet,   status);
   2730     fSets->addElement(fATermSet,     status);
   2731     fSets->addElement(fSContinueSet, status);
   2732     fSets->addElement(fSTermSet,     status);
   2733     fSets->addElement(fCloseSet,     status);
   2734     fSets->addElement(fOtherSet,     status);
   2735     fSets->addElement(fExtendSet,    status);
   2736 
   2737     if (U_FAILURE(status)) {
   2738         deferredStatus = status;
   2739     }
   2740 }
   2741 
   2742 
   2743 
   2744 void RBBISentMonkey::setText(const UnicodeString &s) {
   2745     fText       = &s;
   2746 }
   2747 
   2748 UVector  *RBBISentMonkey::charClasses() {
   2749     return fSets;
   2750 }
   2751 
   2752 
   2753 //  moveBack()   Find the "significant" code point preceding the index i.
   2754 //               Skips over ($Extend | $Format)* .
   2755 //
   2756 int RBBISentMonkey::moveBack(int i) {
   2757     if (i <= 0) {
   2758         return -1;
   2759     }
   2760     UChar32   c;
   2761     int32_t   j = i;
   2762     do {
   2763         j = fText->moveIndex32(j, -1);
   2764         c = fText->char32At(j);
   2765     }
   2766     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2767     return j;
   2768 
   2769  }
   2770 
   2771 
   2772 int RBBISentMonkey::moveForward(int i) {
   2773     if (i>=fText->length()) {
   2774         return fText->length();
   2775     }
   2776     UChar32   c;
   2777     int32_t   j = i;
   2778     do {
   2779         j = fText->moveIndex32(j, 1);
   2780         c = cAt(j);
   2781     }
   2782     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2783     return j;
   2784 }
   2785 
   2786 UChar32 RBBISentMonkey::cAt(int pos) {
   2787     if (pos<0 || pos>=fText->length()) {
   2788         return -1;
   2789     } else {
   2790         return fText->char32At(pos);
   2791     }
   2792 }
   2793 
   2794 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2795     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2796                               //   break position being tested.  The candidate break
   2797                               //   location is before p2.
   2798 
   2799     int     breakPos = -1;
   2800 
   2801     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2802     UChar32 c;
   2803 
   2804     if (U_FAILURE(deferredStatus)) {
   2805         return -1;
   2806     }
   2807 
   2808     // Prev break at end of string.  return DONE.
   2809     if (prevPos >= fText->length()) {
   2810         return -1;
   2811     }
   2812     p0 = p1 = p2 = p3 = prevPos;
   2813     c3 =  fText->char32At(prevPos);
   2814     c0 = c1 = c2 = 0;
   2815     (void)p0;     // Suppress set but not used warning.
   2816 
   2817     // Loop runs once per "significant" character position in the input text.
   2818     for (;;) {
   2819         // Move all of the positions forward in the input string.
   2820         p0 = p1;  c0 = c1;
   2821         p1 = p2;  c1 = c2;
   2822         p2 = p3;  c2 = c3;
   2823 
   2824         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2825         p3 = moveForward(p3);
   2826         c3 = cAt(p3);
   2827 
   2828         // Rule (3)  CR x LF
   2829         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2830             continue;
   2831         }
   2832 
   2833         // Rule (4).   Sep  <break>
   2834         if (fSepSet->contains(c1)) {
   2835             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2836             break;
   2837         }
   2838 
   2839         if (p2 >= fText->length()) {
   2840             // Reached end of string.  Always a break position.
   2841             break;
   2842         }
   2843 
   2844         if (p2 == prevPos) {
   2845             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2846             continue;
   2847         }
   2848 
   2849         // Rule (6).   ATerm x Numeric
   2850         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2851             continue;
   2852         }
   2853 
   2854         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
   2855         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
   2856                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2857             continue;
   2858         }
   2859 
   2860         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2861         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2862         //                  note to the Unicode 5.0 documents.
   2863         int p8 = p1;
   2864         while (fSpSet->contains(cAt(p8))) {
   2865             p8 = moveBack(p8);
   2866         }
   2867         while (fCloseSet->contains(cAt(p8))) {
   2868             p8 = moveBack(p8);
   2869         }
   2870         if (fATermSet->contains(cAt(p8))) {
   2871             p8=p2;
   2872             for (;;) {
   2873                 c = cAt(p8);
   2874                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2875                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2876                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2877                     break;
   2878                 }
   2879                 p8 = moveForward(p8);
   2880             }
   2881             if (fLowerSet->contains(cAt(p8))) {
   2882                 continue;
   2883             }
   2884         }
   2885 
   2886         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2887         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2888             p8 = p1;
   2889             while (fSpSet->contains(cAt(p8))) {
   2890                 p8 = moveBack(p8);
   2891             }
   2892             while (fCloseSet->contains(cAt(p8))) {
   2893                 p8 = moveBack(p8);
   2894             }
   2895             c = cAt(p8);
   2896             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2897                 continue;
   2898             }
   2899         }
   2900 
   2901         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2902         int p9 = p1;
   2903         while (fCloseSet->contains(cAt(p9))) {
   2904             p9 = moveBack(p9);
   2905         }
   2906         c = cAt(p9);
   2907         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2908             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2909                 continue;
   2910             }
   2911         }
   2912 
   2913         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2914         int p10 = p1;
   2915         while (fSpSet->contains(cAt(p10))) {
   2916             p10 = moveBack(p10);
   2917         }
   2918         while (fCloseSet->contains(cAt(p10))) {
   2919             p10 = moveBack(p10);
   2920         }
   2921         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2922             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2923                 continue;
   2924             }
   2925         }
   2926 
   2927         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2928         int p11 = p1;
   2929         if (fSepSet->contains(cAt(p11))) {
   2930             p11 = moveBack(p11);
   2931         }
   2932         while (fSpSet->contains(cAt(p11))) {
   2933             p11 = moveBack(p11);
   2934         }
   2935         while (fCloseSet->contains(cAt(p11))) {
   2936             p11 = moveBack(p11);
   2937         }
   2938         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2939             break;
   2940         }
   2941 
   2942         //  Rule (12)  Any x Any
   2943         continue;
   2944     }
   2945     breakPos = p2;
   2946     return breakPos;
   2947 }
   2948 
   2949 RBBISentMonkey::~RBBISentMonkey() {
   2950     delete fSets;
   2951     delete fSepSet;
   2952     delete fFormatSet;
   2953     delete fSpSet;
   2954     delete fLowerSet;
   2955     delete fUpperSet;
   2956     delete fOLetterSet;
   2957     delete fNumericSet;
   2958     delete fATermSet;
   2959     delete fSContinueSet;
   2960     delete fSTermSet;
   2961     delete fCloseSet;
   2962     delete fOtherSet;
   2963     delete fExtendSet;
   2964 }
   2965 
   2966 
   2967 
   2968 //-------------------------------------------------------------------------------------------
   2969 //
   2970 //  RBBILineMonkey
   2971 //
   2972 //-------------------------------------------------------------------------------------------
   2973 
   2974 class RBBILineMonkey: public RBBIMonkeyKind {
   2975 public:
   2976     RBBILineMonkey();
   2977     virtual          ~RBBILineMonkey();
   2978     virtual  UVector *charClasses();
   2979     virtual  void     setText(const UnicodeString &s);
   2980     virtual  int32_t  next(int32_t i);
   2981     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2982 private:
   2983     UVector      *fSets;
   2984 
   2985     UnicodeSet  *fBK;
   2986     UnicodeSet  *fCR;
   2987     UnicodeSet  *fLF;
   2988     UnicodeSet  *fCM;
   2989     UnicodeSet  *fNL;
   2990     UnicodeSet  *fSG;
   2991     UnicodeSet  *fWJ;
   2992     UnicodeSet  *fZW;
   2993     UnicodeSet  *fGL;
   2994     UnicodeSet  *fCB;
   2995     UnicodeSet  *fSP;
   2996     UnicodeSet  *fB2;
   2997     UnicodeSet  *fBA;
   2998     UnicodeSet  *fBB;
   2999     UnicodeSet  *fHY;
   3000     UnicodeSet  *fH2;
   3001     UnicodeSet  *fH3;
   3002     UnicodeSet  *fCL;
   3003     UnicodeSet  *fCP;
   3004     UnicodeSet  *fEX;
   3005     UnicodeSet  *fIN;
   3006     UnicodeSet  *fJL;
   3007     UnicodeSet  *fJV;
   3008     UnicodeSet  *fJT;
   3009     UnicodeSet  *fNS;
   3010     UnicodeSet  *fOP;
   3011     UnicodeSet  *fQU;
   3012     UnicodeSet  *fIS;
   3013     UnicodeSet  *fNU;
   3014     UnicodeSet  *fPO;
   3015     UnicodeSet  *fPR;
   3016     UnicodeSet  *fSY;
   3017     UnicodeSet  *fAI;
   3018     UnicodeSet  *fAL;
   3019     UnicodeSet  *fCJ;
   3020     UnicodeSet  *fHL;
   3021     UnicodeSet  *fID;
   3022     UnicodeSet  *fRI;
   3023     UnicodeSet  *fXX;
   3024     UnicodeSet  *fEB;
   3025     UnicodeSet  *fEM;
   3026     UnicodeSet  *fZJ;
   3027     UnicodeSet  *fExtendedPict;
   3028     UnicodeSet  *fEmojiNRK;
   3029 
   3030     BreakIterator        *fCharBI;
   3031     const UnicodeString  *fText;
   3032     RegexMatcher         *fNumberMatcher;
   3033 };
   3034 
   3035 RBBILineMonkey::RBBILineMonkey() :
   3036     RBBIMonkeyKind(),
   3037     fSets(NULL),
   3038 
   3039     fCharBI(NULL),
   3040     fText(NULL),
   3041     fNumberMatcher(NULL)
   3042 
   3043 {
   3044     if (U_FAILURE(deferredStatus)) {
   3045         return;
   3046     }
   3047 
   3048     UErrorCode  status = U_ZERO_ERROR;
   3049 
   3050     fSets  = new UVector(status);
   3051 
   3052     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3053     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3054     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3055     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3056     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3057     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3058     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3059     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3060     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3061     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3062     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3063     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3064     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3065     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3066     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3067     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3068     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3069     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3070     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3071     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3072     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3073     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3074     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3075     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3076     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3077     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3078     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3079     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3080     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3081     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3082     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3083     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3084     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3085     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   3086     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   3087     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3088     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   3089     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3090     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3091     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
   3092     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
   3093     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
   3094     fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
   3095     fExtendedPict = new UnicodeSet(gExtended_Pict, status);
   3096 
   3097     if (U_FAILURE(status)) {
   3098         deferredStatus = status;
   3099         return;
   3100     }
   3101 
   3102     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3103     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3104     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3105 
   3106     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   3107     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
   3108 
   3109     fSets->addElement(fBK, status);
   3110     fSets->addElement(fCR, status);
   3111     fSets->addElement(fLF, status);
   3112     fSets->addElement(fCM, status);
   3113     fSets->addElement(fNL, status);
   3114     fSets->addElement(fWJ, status);
   3115     fSets->addElement(fZW, status);
   3116     fSets->addElement(fGL, status);
   3117     fSets->addElement(fCB, status);
   3118     fSets->addElement(fSP, status);
   3119     fSets->addElement(fB2, status);
   3120     fSets->addElement(fBA, status);
   3121     fSets->addElement(fBB, status);
   3122     fSets->addElement(fHY, status);
   3123     fSets->addElement(fH2, status);
   3124     fSets->addElement(fH3, status);
   3125     fSets->addElement(fCL, status);
   3126     fSets->addElement(fCP, status);
   3127     fSets->addElement(fEX, status);
   3128     fSets->addElement(fIN, status);
   3129     fSets->addElement(fJL, status);
   3130     fSets->addElement(fJT, status);
   3131     fSets->addElement(fJV, status);
   3132     fSets->addElement(fNS, status);
   3133     fSets->addElement(fOP, status);
   3134     fSets->addElement(fQU, status);
   3135     fSets->addElement(fIS, status);
   3136     fSets->addElement(fNU, status);
   3137     fSets->addElement(fPO, status);
   3138     fSets->addElement(fPR, status);
   3139     fSets->addElement(fSY, status);
   3140     fSets->addElement(fAI, status);
   3141     fSets->addElement(fAL, status);
   3142     fSets->addElement(fHL, status);
   3143     fSets->addElement(fID, status);
   3144     fSets->addElement(fWJ, status);
   3145     fSets->addElement(fRI, status);
   3146     fSets->addElement(fSG, status);
   3147     fSets->addElement(fEB, status);
   3148     fSets->addElement(fEM, status);
   3149     fSets->addElement(fZJ, status);
   3150     fSets->addElement(fExtendedPict, status);
   3151     fSets->addElement(fEmojiNRK, status);
   3152 
   3153 
   3154     const char *rules =
   3155             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
   3156             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
   3157             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
   3158             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
   3159             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
   3160             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
   3161 
   3162     fNumberMatcher = new RegexMatcher(
   3163         UnicodeString(rules, -1, US_INV), 0, status);
   3164 
   3165     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3166 
   3167     if (U_FAILURE(status)) {
   3168         deferredStatus = status;
   3169     }
   3170 }
   3171 
   3172 
   3173 void RBBILineMonkey::setText(const UnicodeString &s) {
   3174     fText       = &s;
   3175     fCharBI->setText(s);
   3176     fNumberMatcher->reset(s);
   3177 }
   3178 
   3179 //
   3180 //  rule9Adjust
   3181 //     Line Break TR rules 9 and 10 implementation.
   3182 //     This deals with combining marks and other sequences that
   3183 //     that must be treated as if they were something other than what they actually are.
   3184 //
   3185 //     This is factored out into a separate function because it must be applied twice for
   3186 //     each potential break, once to the chars before the position being checked, then
   3187 //     again to the text following the possible break.
   3188 //
   3189 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3190     if (pos == -1) {
   3191         // Invalid initial position.  Happens during the warmup iteration of the
   3192         //   main loop in next().
   3193         return;
   3194     }
   3195 
   3196     int32_t  nPos = *nextPos;
   3197 
   3198     // LB 9  Keep combining sequences together.
   3199     //  advance over any CM class chars.  Note that Line Break CM is different
   3200     //  from the normal Grapheme Extend property.
   3201     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3202           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3203         for (;;) {
   3204             *nextChar = fText->char32At(nPos);
   3205             if (!fCM->contains(*nextChar)) {
   3206                 break;
   3207             }
   3208             nPos = fText->moveIndex32(nPos, 1);
   3209         }
   3210     }
   3211 
   3212 
   3213     // LB 9 Treat X CM* as if it were x.
   3214     //       No explicit action required.
   3215 
   3216     // LB 10  Treat any remaining combining mark as AL
   3217     if (fCM->contains(*posChar)) {
   3218         *posChar = u'A';
   3219     }
   3220 
   3221     // Push the updated nextPos and nextChar back to our caller.
   3222     // This only makes a difference if posChar got bigger by consuming a
   3223     // combining sequence.
   3224     *nextPos  = nPos;
   3225     *nextChar = fText->char32At(nPos);
   3226 }
   3227 
   3228 
   3229 
   3230 int32_t RBBILineMonkey::next(int32_t startPos) {
   3231     UErrorCode status = U_ZERO_ERROR;
   3232     int32_t    pos;       //  Index of the char following a potential break position
   3233     UChar32    thisChar;  //  Character at above position "pos"
   3234 
   3235     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3236     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3237                           //   and thisChar may not be adjacent because combining
   3238                           //   characters between them will be ignored.
   3239 
   3240     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   3241     UChar32    prevCharX2;
   3242 
   3243     int32_t    nextPos;   //  Index of the next character following pos.
   3244                           //     Usually skips over combining marks.
   3245     int32_t    nextCPPos; //  Index of the code point following "pos."
   3246                           //     May point to a combining mark.
   3247     int32_t    tPos;      //  temp value.
   3248     UChar32    c;
   3249 
   3250     if (U_FAILURE(deferredStatus)) {
   3251         return -1;
   3252     }
   3253 
   3254     if (startPos >= fText->length()) {
   3255         return -1;
   3256     }
   3257 
   3258 
   3259     // Initial values for loop.  Loop will run the first time without finding breaks,
   3260     //                           while the invalid values shift out and the "this" and
   3261     //                           "prev" positions are filled in with good values.
   3262     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   3263     thisChar = prevChar  = prevCharX2 = 0;
   3264     nextPos  = nextCPPos = startPos;
   3265 
   3266 
   3267     // Loop runs once per position in the test text, until a break position
   3268     //  is found.
   3269     for (;;) {
   3270         prevPosX2 = prevPos;
   3271         prevCharX2 = prevChar;
   3272 
   3273         prevPos   = pos;
   3274         prevChar  = thisChar;
   3275 
   3276         pos       = nextPos;
   3277         thisChar  = fText->char32At(pos);
   3278 
   3279         nextCPPos = fText->moveIndex32(pos, 1);
   3280         nextPos   = nextCPPos;
   3281 
   3282         // Rule LB2 - Break at end of text.
   3283         if (pos >= fText->length()) {
   3284             break;
   3285         }
   3286 
   3287         // Rule LB 9 - adjust for combining sequences.
   3288         //             We do this one out-of-order because the adjustment does not change anything
   3289         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3290         //             be applied.
   3291         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3292         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3293         c = fText->char32At(nextPos);
   3294         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3295 
   3296         // If the loop is still warming up - if we haven't shifted the initial
   3297         //   -1 positions out of prevPos yet - loop back to advance the
   3298         //    position in the input without any further looking for breaks.
   3299         if (prevPos == -1) {
   3300             continue;
   3301         }
   3302 
   3303         // LB 4  Always break after hard line breaks,
   3304         if (fBK->contains(prevChar)) {
   3305             break;
   3306         }
   3307 
   3308         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3309         if (prevChar == 0x0d && thisChar == 0x0a) {
   3310             continue;
   3311         }
   3312         if (prevChar == 0x0d ||
   3313             prevChar == 0x0a ||
   3314             prevChar == 0x85)  {
   3315             break;
   3316         }
   3317 
   3318         // LB 6  Don't break before hard line breaks
   3319         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3320             fBK->contains(thisChar)) {
   3321                 continue;
   3322         }
   3323 
   3324 
   3325         // LB 7  Don't break before spaces or zero-width space.
   3326         if (fSP->contains(thisChar)) {
   3327             continue;
   3328         }
   3329 
   3330         if (fZW->contains(thisChar)) {
   3331             continue;
   3332         }
   3333 
   3334         // LB 8  Break after zero width space
   3335         if (fZW->contains(prevChar)) {
   3336             break;
   3337         }
   3338 
   3339         // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
   3340         //       The monkey test's way of ignoring combining characters doesn't work
   3341         //       for this rule. ZJ is also a CM. Need to get the actual character
   3342         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
   3343         {
   3344             int32_t prevIdx = fText->moveIndex32(pos, -1);
   3345             UChar32 prevC = fText->char32At(prevIdx);
   3346             if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
   3347                 continue;
   3348             }
   3349         }
   3350 
   3351         // LB 9, 10  Already done, at top of loop.
   3352         //
   3353 
   3354 
   3355         // LB 11  Do not break before or after WORD JOINER and related characters.
   3356         //    x  WJ
   3357         //    WJ  x
   3358         //
   3359         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3360             continue;
   3361         }
   3362 
   3363         // LB 12
   3364         //    GL  x
   3365         if (fGL->contains(prevChar)) {
   3366             continue;
   3367         }
   3368 
   3369         // LB 12a
   3370         //    [^SP BA HY] x GL
   3371         if (!(fSP->contains(prevChar) ||
   3372               fBA->contains(prevChar) ||
   3373               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3374             continue;
   3375         }
   3376 
   3377 
   3378 
   3379         // LB 13  Don't break before closings.
   3380         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3381         //        fall into LB 17 and the more general number regular expression.
   3382         //
   3383         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3384             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3385                                          fEX->contains(thisChar)  ||
   3386             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3387             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3388             continue;
   3389         }
   3390 
   3391         // LB 14 Don't break after OP SP*
   3392         //       Scan backwards, checking for this sequence.
   3393         //       The OP char could include combining marks, so we actually check for
   3394         //           OP CM* SP*
   3395         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3396         //       sequence into a ID char, so before scanning back through spaces,
   3397         //       verify that prevChar is indeed a space.  The prevChar variable
   3398         //       may differ from fText[prevPos]
   3399         tPos = prevPos;
   3400         if (fSP->contains(prevChar)) {
   3401             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3402                 tPos=fText->moveIndex32(tPos, -1);
   3403             }
   3404         }
   3405         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3406             tPos=fText->moveIndex32(tPos, -1);
   3407         }
   3408         if (fOP->contains(fText->char32At(tPos))) {
   3409             continue;
   3410         }
   3411 
   3412 
   3413         // LB 15    QU SP* x OP
   3414         if (fOP->contains(thisChar)) {
   3415             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3416             int tPos = prevPos;
   3417             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3418                 tPos = fText->moveIndex32(tPos, -1);
   3419             }
   3420             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3421                 tPos = fText->moveIndex32(tPos, -1);
   3422             }
   3423             if (fQU->contains(fText->char32At(tPos))) {
   3424                 continue;
   3425             }
   3426         }
   3427 
   3428 
   3429 
   3430         // LB 16   (CL | CP) SP* x NS
   3431         //    Scan backwards for SP* CM* (CL | CP)
   3432         if (fNS->contains(thisChar)) {
   3433             int tPos = prevPos;
   3434             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3435                 tPos = fText->moveIndex32(tPos, -1);
   3436             }
   3437             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3438                 tPos = fText->moveIndex32(tPos, -1);
   3439             }
   3440             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3441                 continue;
   3442             }
   3443         }
   3444 
   3445 
   3446         // LB 17        B2 SP* x B2
   3447         if (fB2->contains(thisChar)) {
   3448             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3449             tPos = prevPos;
   3450             if (fSP->contains(prevChar)) {
   3451                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3452                     tPos=fText->moveIndex32(tPos, -1);
   3453                 }
   3454             }
   3455             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3456                 tPos=fText->moveIndex32(tPos, -1);
   3457             }
   3458             if (fB2->contains(fText->char32At(tPos))) {
   3459                 continue;
   3460             }
   3461         }
   3462 
   3463 
   3464         // LB 18    break after space
   3465         if (fSP->contains(prevChar)) {
   3466             break;
   3467         }
   3468 
   3469         // LB 19
   3470         //    x   QU
   3471         //    QU  x
   3472         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3473             continue;
   3474         }
   3475 
   3476         // LB 20  Break around a CB
   3477         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3478             break;
   3479         }
   3480 
   3481         // LB 21
   3482         if (fBA->contains(thisChar) ||
   3483             fHY->contains(thisChar) ||
   3484             fNS->contains(thisChar) ||
   3485             fBB->contains(prevChar) )   {
   3486             continue;
   3487         }
   3488 
   3489         // LB 21a
   3490         //   HL (HY | BA) x
   3491         if (fHL->contains(prevCharX2) &&
   3492                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3493             continue;
   3494         }
   3495 
   3496         // LB 21b
   3497         //   SY x HL
   3498         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
   3499             continue;
   3500         }
   3501 
   3502         // LB 22
   3503         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3504             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
   3505             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3506             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
   3507             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3508             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3509             continue;
   3510         }
   3511 
   3512 
   3513         // LB 23    (AL | HL) x NU
   3514         //          NU x (AL | HL)
   3515         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
   3516             continue;
   3517         }
   3518         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3519             continue;
   3520         }
   3521 
   3522         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
   3523         //      PR x (ID | EB | EM)
   3524         //     (ID | EB | EM) x PO
   3525         if (fPR->contains(prevChar) &&
   3526                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
   3527             continue;
   3528         }
   3529         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
   3530                 fPO->contains(thisChar)) {
   3531             continue;
   3532         }
   3533 
   3534         // LB 24  Do not break between prefix and letters or ideographs.
   3535         //         (PR | PO) x (AL | HL)
   3536         //         (AL | HL) x (PR | PO)
   3537         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
   3538                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3539             continue;
   3540         }
   3541         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
   3542                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
   3543             continue;
   3544         }
   3545 
   3546 
   3547 
   3548         // LB 25    Numbers
   3549         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3550             if (U_FAILURE(status)) {
   3551                 break;
   3552             }
   3553             // Matched a number.  But could have been just a single digit, which would
   3554             //    not represent a "no break here" between prevChar and thisChar
   3555             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3556             if (numEndIdx > pos) {
   3557                 // Number match includes at least our two chars being checked
   3558                 if (numEndIdx > nextPos) {
   3559                     // Number match includes additional chars.  Update pos and nextPos
   3560                     //   so that next loop iteration will continue at the end of the number,
   3561                     //   checking for breaks between last char in number & whatever follows.
   3562                     pos = nextPos = numEndIdx;
   3563                     do {
   3564                         pos = fText->moveIndex32(pos, -1);
   3565                         thisChar = fText->char32At(pos);
   3566                     } while (fCM->contains(thisChar));
   3567                 }
   3568                 continue;
   3569             }
   3570         }
   3571 
   3572 
   3573         // LB 26 Do not break a Korean syllable.
   3574         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3575                                         fJV->contains(thisChar) ||
   3576                                         fH2->contains(thisChar) ||
   3577                                         fH3->contains(thisChar))) {
   3578                                             continue;
   3579                                         }
   3580 
   3581         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3582             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3583                 continue;
   3584         }
   3585 
   3586         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3587             fJT->contains(thisChar)) {
   3588                 continue;
   3589         }
   3590 
   3591         // LB 27 Treat a Korean Syllable Block the same as ID.
   3592         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3593             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3594             fIN->contains(thisChar)) {
   3595                 continue;
   3596             }
   3597         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3598             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3599             fPO->contains(thisChar)) {
   3600                 continue;
   3601             }
   3602         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3603             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3604                 continue;
   3605             }
   3606 
   3607 
   3608 
   3609         // LB 28  Do not break between alphabetics ("at").
   3610         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3611             continue;
   3612         }
   3613 
   3614         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3615         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3616             continue;
   3617         }
   3618 
   3619         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3620         //          (AL | NU) x OP
   3621         //          CP x (AL | NU)
   3622         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3623             continue;
   3624         }
   3625         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3626             continue;
   3627         }
   3628 
   3629         // LB30a    RI RI <break> RI
   3630         //             RI    x    RI
   3631         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3632             break;
   3633         }
   3634         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3635             continue;
   3636         }
   3637 
   3638         // LB30b    Emoji Base x Emoji Modifier
   3639         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
   3640             continue;
   3641         }
   3642 
   3643         // LB 31    Break everywhere else
   3644         break;
   3645 
   3646     }
   3647 
   3648     return pos;
   3649 }
   3650 
   3651 
   3652 UVector  *RBBILineMonkey::charClasses() {
   3653     return fSets;
   3654 }
   3655 
   3656 
   3657 RBBILineMonkey::~RBBILineMonkey() {
   3658     delete fSets;
   3659 
   3660     delete fBK;
   3661     delete fCR;
   3662     delete fLF;
   3663     delete fCM;
   3664     delete fNL;
   3665     delete fWJ;
   3666     delete fZW;
   3667     delete fGL;
   3668     delete fCB;
   3669     delete fSP;
   3670     delete fB2;
   3671     delete fBA;
   3672     delete fBB;
   3673     delete fHY;
   3674     delete fH2;
   3675     delete fH3;
   3676     delete fCL;
   3677     delete fCP;
   3678     delete fEX;
   3679     delete fIN;
   3680     delete fJL;
   3681     delete fJV;
   3682     delete fJT;
   3683     delete fNS;
   3684     delete fOP;
   3685     delete fQU;
   3686     delete fIS;
   3687     delete fNU;
   3688     delete fPO;
   3689     delete fPR;
   3690     delete fSY;
   3691     delete fAI;
   3692     delete fAL;
   3693     delete fCJ;
   3694     delete fHL;
   3695     delete fID;
   3696     delete fRI;
   3697     delete fSG;
   3698     delete fXX;
   3699     delete fEB;
   3700     delete fEM;
   3701     delete fZJ;
   3702     delete fExtendedPict;
   3703     delete fEmojiNRK;
   3704 
   3705     delete fCharBI;
   3706     delete fNumberMatcher;
   3707 }
   3708 
   3709 
   3710 //-------------------------------------------------------------------------------------------
   3711 //
   3712 //   TestMonkey
   3713 //
   3714 //     params
   3715 //       seed=nnnnn        Random number starting seed.
   3716 //                         Setting the seed allows errors to be reproduced.
   3717 //       loop=nnn          Looping count.  Controls running time.
   3718 //                         -1:  run forever.
   3719 //                          0 or greater:  run length.
   3720 //
   3721 //       type = char | word | line | sent | title
   3722 //
   3723 //  Example:
   3724 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
   3725 //
   3726 //-------------------------------------------------------------------------------------------
   3727 
   3728 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3729     int32_t val = defaultVal;
   3730     name.append(" *= *(-?\\d+)");
   3731     UErrorCode status = U_ZERO_ERROR;
   3732     RegexMatcher m(name, params, 0, status);
   3733     if (m.find()) {
   3734         // The param exists.  Convert the string to an int.
   3735         char valString[100];
   3736         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3737         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3738             paramLength = (int32_t)(sizeof(valString)-2);
   3739         }
   3740         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3741         val = strtol(valString,  NULL, 10);
   3742 
   3743         // Delete this parameter from the params string.
   3744         m.reset();
   3745         params = m.replaceFirst("", status);
   3746     }
   3747     U_ASSERT(U_SUCCESS(status));
   3748     return val;
   3749 }
   3750 #endif
   3751 
   3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3753 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3754                                     BreakIterator *bi,
   3755                                     int expected[],
   3756                                     int expectedcount)
   3757 {
   3758     int count = 0;
   3759     int i = 0;
   3760     int forward[50];
   3761     bi->setText(ustr);
   3762     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3763         forward[count] = i;
   3764         if (count < expectedcount && expected[count] != i) {
   3765             test->errln("%s:%d break forward test failed: expected %d but got %d",
   3766                         __FILE__, __LINE__, expected[count], i);
   3767             break;
   3768         }
   3769         count ++;
   3770     }
   3771     if (count != expectedcount) {
   3772         printStringBreaks(ustr, expected, expectedcount);
   3773         test->errln("%s:%d break forward test failed: missed %d match",
   3774                     __FILE__, __LINE__, expectedcount - count);
   3775         return;
   3776     }
   3777     // testing boundaries
   3778     for (i = 1; i < expectedcount; i ++) {
   3779         int j = expected[i - 1];
   3780         if (!bi->isBoundary(j)) {
   3781             printStringBreaks(ustr, expected, expectedcount);
   3782             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
   3783                     __FILE__, __LINE__, j);
   3784             return;
   3785         }
   3786         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3787             if (bi->isBoundary(j)) {
   3788                 printStringBreaks(ustr, expected, expectedcount);
   3789                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
   3790                     __FILE__, __LINE__, j);
   3791                 return;
   3792             }
   3793         }
   3794     }
   3795 
   3796     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3797         count --;
   3798         if (forward[count] != i) {
   3799             printStringBreaks(ustr, expected, expectedcount);
   3800             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
   3801                         __FILE__, __LINE__, forward[count], i);
   3802             break;
   3803         }
   3804     }
   3805     if (count != 0) {
   3806         printStringBreaks(ustr, expected, expectedcount);
   3807         test->errln("break test previous() failed: missed a match");
   3808         return;
   3809     }
   3810 
   3811     // testing preceding
   3812     for (i = 0; i < expectedcount - 1; i ++) {
   3813         // int j = expected[i] + 1;
   3814         int j = ustr.moveIndex32(expected[i], 1);
   3815         for (; j <= expected[i + 1]; j ++) {
   3816             int32_t expectedPreceding = expected[i];
   3817             int32_t actualPreceding = bi->preceding(j);
   3818             if (actualPreceding != expectedPreceding) {
   3819                 printStringBreaks(ustr, expected, expectedcount);
   3820                 test->errln("%s:%d preceding(%d): expected %d, got %d",
   3821                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
   3822                 return;
   3823             }
   3824         }
   3825     }
   3826 }
   3827 #endif
   3828 
   3829 void RBBITest::TestWordBreaks(void)
   3830 {
   3831 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3832 
   3833     Locale        locale("en");
   3834     UErrorCode    status = U_ZERO_ERROR;
   3835     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3836     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3837     // Replaced any C+J characters in a row with a random sequence of characters
   3838     // of the same length to make our C+J segmentation not get in the way.
   3839     static const char *strlist[] =
   3840     {
   3841     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3842     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3843     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3844     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3845     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3846     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3847     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3848     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3849     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3850     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3851     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3852     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3853     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3854     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3855     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3856     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3857     "\\u0027\\u11af\\U000e0057\\u0602",
   3858     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3859     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3860     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3861     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3862     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3863     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3864     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3865     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3866     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3867     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3868     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3869     "\\ua183\\u102d\\u0bec\\u003a",
   3870     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3871     "\\u003a\\u0e57\\u0fad\\u002e",
   3872     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3873     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3874     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3875     "\\u003a\\u0664\\u00b7\\u1fba",
   3876     "\\u003b\\u0027\\u00b7\\u47a3",
   3877     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3878     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3879     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3880     };
   3881     int loop;
   3882     if (U_FAILURE(status)) {
   3883         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3884         return;
   3885     }
   3886     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   3887         // printf("looping %d\n", loop);
   3888         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3889         // RBBICharMonkey monkey;
   3890         RBBIWordMonkey monkey;
   3891 
   3892         int expected[50];
   3893         int expectedcount = 0;
   3894 
   3895         monkey.setText(ustr);
   3896         int i;
   3897         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3898             expected[expectedcount ++] = i;
   3899         }
   3900 
   3901         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3902     }
   3903     delete bi;
   3904 #endif
   3905 }
   3906 
   3907 void RBBITest::TestWordBoundary(void)
   3908 {
   3909     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3910     Locale        locale("en");
   3911     UErrorCode    status = U_ZERO_ERROR;
   3912     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3913     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
   3914     if (U_FAILURE(status)) {
   3915         errcheckln(status, "%s:%d Creation of break iterator failed %s",
   3916                 __FILE__, __LINE__, u_errorName(status));
   3917         return;
   3918     }
   3919     UChar         str[50];
   3920     static const char *strlist[] =
   3921     {
   3922     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3923     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3924     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3925     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3926     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3927     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3928     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3929     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3930     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3931     "\\u0027\\u11af\\U000e0057\\u0602",
   3932     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3933     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3934     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3935     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3936     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3937     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3938     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3939     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3940     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3941     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3942     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3943     "\\ua183\\u102d\\u0bec\\u003a",
   3944     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3945     "\\u003a\\u0e57\\u0fad\\u002e",
   3946     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3947     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3948     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3949     "\\u003a\\u0664\\u00b7\\u1fba",
   3950     "\\u003b\\u0027\\u00b7\\u47a3",
   3951     };
   3952     int loop;
   3953     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   3954         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
   3955         UnicodeString ustr(str);
   3956         int forward[50];
   3957         int count = 0;
   3958 
   3959         bi->setText(ustr);
   3960         int prev = -1;
   3961         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
   3962             ++count;
   3963             if (count >= UPRV_LENGTHOF(forward)) {
   3964                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
   3965                         __FILE__, __LINE__, loop, count, boundary);
   3966                 return;
   3967             }
   3968             forward[count] = boundary;
   3969             if (boundary <= prev) {
   3970                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
   3971                         __FILE__, __LINE__, loop, prev, boundary);
   3972                 break;
   3973             }
   3974             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
   3975                 if (bi->isBoundary(nonBoundary)) {
   3976                     printStringBreaks(ustr, forward, count);
   3977                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
   3978                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
   3979                     return;
   3980                 }
   3981             }
   3982             if (!bi->isBoundary(boundary)) {
   3983                 printStringBreaks(ustr, forward, count);
   3984                 errln("%s:%d happy boundary test failed: expected %d a boundary",
   3985                        __FILE__, __LINE__, boundary);
   3986                 return;
   3987             }
   3988             prev = boundary;
   3989         }
   3990     }
   3991 }
   3992 
   3993 void RBBITest::TestLineBreaks(void)
   3994 {
   3995 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3996     Locale        locale("en");
   3997     UErrorCode    status = U_ZERO_ERROR;
   3998     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3999     const int32_t  STRSIZE = 50;
   4000     UChar         str[STRSIZE];
   4001     static const char *strlist[] =
   4002     {
   4003      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4004      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4005              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4006      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4007              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4008      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4009      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4010      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4011      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4012      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4013      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4014      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4015      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4016      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4017      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4018      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4019      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4020      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4021      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4022      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4023      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4024      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4025      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4026      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4027      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4028      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4029      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4030      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4031      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4032      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4033      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4034      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4035      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4036      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4037      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4038      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4039      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4040      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4041      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4042          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4043     };
   4044     int loop;
   4045     TEST_ASSERT_SUCCESS(status);
   4046     if (U_FAILURE(status)) {
   4047         return;
   4048     }
   4049     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   4050         // printf("looping %d\n", loop);
   4051         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4052         if (t >= STRSIZE) {
   4053             TEST_ASSERT(FALSE);
   4054             continue;
   4055         }
   4056 
   4057 
   4058         UnicodeString ustr(str);
   4059         RBBILineMonkey monkey;
   4060         if (U_FAILURE(monkey.deferredStatus)) {
   4061             continue;
   4062         }
   4063 
   4064         const int EXPECTEDSIZE = 50;
   4065         int expected[EXPECTEDSIZE];
   4066         int expectedcount = 0;
   4067 
   4068         monkey.setText(ustr);
   4069         int i;
   4070         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4071             if (expectedcount >= EXPECTEDSIZE) {
   4072                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4073                 return;
   4074             }
   4075             expected[expectedcount ++] = i;
   4076         }
   4077 
   4078         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4079     }
   4080     delete bi;
   4081 #endif
   4082 }
   4083 
   4084 void RBBITest::TestSentBreaks(void)
   4085 {
   4086 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4087     Locale        locale("en");
   4088     UErrorCode    status = U_ZERO_ERROR;
   4089     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4090     UChar         str[200];
   4091     static const char *strlist[] =
   4092     {
   4093      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4094      "This\n",
   4095      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4096      "\"Sentence ending with a quote.\" Bye.",
   4097      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4098      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4099      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4100      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4101      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4102      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4103      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4104              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4105              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4106              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4107      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4108              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4109              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4110              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4111              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4112              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4113     };
   4114     int loop;
   4115     if (U_FAILURE(status)) {
   4116         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4117         return;
   4118     }
   4119     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   4120         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
   4121         UnicodeString ustr(str);
   4122 
   4123         RBBISentMonkey monkey;
   4124         if (U_FAILURE(monkey.deferredStatus)) {
   4125             continue;
   4126         }
   4127 
   4128         const int EXPECTEDSIZE = 50;
   4129         int expected[EXPECTEDSIZE];
   4130         int expectedcount = 0;
   4131 
   4132         monkey.setText(ustr);
   4133         int i;
   4134         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4135             if (expectedcount >= EXPECTEDSIZE) {
   4136                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4137                 return;
   4138             }
   4139             expected[expectedcount ++] = i;
   4140         }
   4141 
   4142         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4143     }
   4144     delete bi;
   4145 #endif
   4146 }
   4147 
   4148 void RBBITest::TestMonkey() {
   4149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4150 
   4151     UErrorCode     status    = U_ZERO_ERROR;
   4152     int32_t        loopCount = 500;
   4153     int32_t        seed      = 1;
   4154     UnicodeString  breakType = "all";
   4155     Locale         locale("en");
   4156     UBool          useUText  = FALSE;
   4157 
   4158     if (quick == FALSE) {
   4159         loopCount = 10000;
   4160     }
   4161 
   4162     if (fTestParams) {
   4163         UnicodeString p(fTestParams);
   4164         loopCount = getIntParam("loop", p, loopCount);
   4165         seed      = getIntParam("seed", p, seed);
   4166 
   4167         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4168         if (m.find()) {
   4169             breakType = m.group(1, status);
   4170             m.reset();
   4171             p = m.replaceFirst("", status);
   4172         }
   4173 
   4174         RegexMatcher u(" *utext", p, 0, status);
   4175         if (u.find()) {
   4176             useUText = TRUE;
   4177             u.reset();
   4178             p = u.replaceFirst("", status);
   4179         }
   4180 
   4181 
   4182         // m.reset(p);
   4183         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4184             // Each option is stripped out of the option string as it is processed.
   4185             // All options have been checked.  The option string should have been completely emptied..
   4186             char buf[100];
   4187             p.extract(buf, sizeof(buf), NULL, status);
   4188             buf[sizeof(buf)-1] = 0;
   4189             errln("Unrecognized or extra parameter:  %s\n", buf);
   4190             return;
   4191         }
   4192 
   4193     }
   4194 
   4195     if (breakType == "char" || breakType == "all") {
   4196         RBBICharMonkey  m;
   4197         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4198         if (U_SUCCESS(status)) {
   4199             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4200             if (breakType == "all" && useUText==FALSE) {
   4201                 // Also run a quick test with UText when "all" is specified
   4202                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4203             }
   4204         }
   4205         else {
   4206             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4207         }
   4208         delete bi;
   4209     }
   4210 
   4211     if (breakType == "word" || breakType == "all") {
   4212         logln("Word Break Monkey Test");
   4213         RBBIWordMonkey  m;
   4214         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4215         if (U_SUCCESS(status)) {
   4216             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4217         }
   4218         else {
   4219             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4220         }
   4221         delete bi;
   4222     }
   4223 
   4224     if (breakType == "line" || breakType == "all") {
   4225         logln("Line Break Monkey Test");
   4226         RBBILineMonkey  m;
   4227         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4228         if (loopCount >= 10) {
   4229             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4230         }
   4231         if (U_SUCCESS(status)) {
   4232             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4233         }
   4234         else {
   4235             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4236         }
   4237         delete bi;
   4238     }
   4239 
   4240     if (breakType == "sent" || breakType == "all"  ) {
   4241         logln("Sentence Break Monkey Test");
   4242         RBBISentMonkey  m;
   4243         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4244         if (loopCount >= 10) {
   4245             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4246         }
   4247         if (U_SUCCESS(status)) {
   4248             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4249         }
   4250         else {
   4251             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4252         }
   4253         delete bi;
   4254     }
   4255 
   4256 #endif
   4257 }
   4258 
   4259 //
   4260 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4261 //    Parameters:
   4262 //       bi      - the break iterator to use
   4263 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4264 //       name    - Name of test (char, word, etc.) for use in error messages
   4265 //       seed    - Seed for starting random number generator (parameter from user)
   4266 //       numIterations
   4267 //
   4268 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4269                          int32_t numIterations, UBool useUText) {
   4270 
   4271 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4272 
   4273     const int32_t    TESTSTRINGLEN = 500;
   4274     UnicodeString    testText;
   4275     int32_t          numCharClasses;
   4276     UVector          *chClasses;
   4277     int              expected[TESTSTRINGLEN*2 + 1];
   4278     int              expectedCount = 0;
   4279     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4280     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4281     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4282     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4283     char             followingBreaks[TESTSTRINGLEN*2+1];
   4284     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4285     int              i;
   4286     int              loopCount = 0;
   4287 
   4288     m_seed = seed;
   4289 
   4290     numCharClasses = mk.charClasses()->size();
   4291     chClasses      = mk.charClasses();
   4292 
   4293     // Check for errors that occured during the construction of the MonkeyKind object.
   4294     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4295     //  and is not visible outside of RBBITest :-(
   4296     if (U_FAILURE(mk.deferredStatus)) {
   4297         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4298         return;
   4299     }
   4300 
   4301     // Verify that the character classes all have at least one member.
   4302     for (i=0; i<numCharClasses; i++) {
   4303         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4304         if (s == NULL || s->size() == 0) {
   4305             errln("Character Class #%d is null or of zero size.", i);
   4306             return;
   4307         }
   4308     }
   4309 
   4310     while (loopCount < numIterations || numIterations == -1) {
   4311         if (numIterations == -1 && loopCount % 10 == 0) {
   4312             // If test is running in an infinite loop, display a periodic tic so
   4313             //   we can tell that it is making progress.
   4314             fprintf(stderr, ".");
   4315         }
   4316         // Save current random number seed, so that we can recreate the random numbers
   4317         //   for this loop iteration in event of an error.
   4318         seed = m_seed;
   4319 
   4320         // Populate a test string with data.
   4321         testText.truncate(0);
   4322         for (i=0; i<TESTSTRINGLEN; i++) {
   4323             int32_t  aClassNum = m_rand() % numCharClasses;
   4324             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4325             int32_t   charIdx = m_rand() % classSet->size();
   4326             UChar32   c = classSet->charAt(charIdx);
   4327             if (c < 0) {   // TODO:  deal with sets containing strings.
   4328                 errln("%s:%d c < 0", __FILE__, __LINE__);
   4329                 break;
   4330             }
   4331             // Do not assemble a supplementary character from randomly generated separate surrogates.
   4332             //   (It could be a dictionary character)
   4333             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
   4334                 continue;
   4335             }
   4336 
   4337             testText.append(c);
   4338         }
   4339 
   4340         // Calculate the expected results for this test string.
   4341         mk.setText(testText);
   4342         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4343         expectedBreaks[0] = 1;
   4344         int32_t breakPos = 0;
   4345         expectedCount = 0;
   4346         for (;;) {
   4347             breakPos = mk.next(breakPos);
   4348             if (breakPos == -1) {
   4349                 break;
   4350             }
   4351             if (breakPos > testText.length()) {
   4352                 errln("breakPos > testText.length()");
   4353             }
   4354             expectedBreaks[breakPos] = 1;
   4355             U_ASSERT(expectedCount<testText.length());
   4356             expected[expectedCount ++] = breakPos;
   4357             (void)expected;   // Set but not used warning.
   4358                               // TODO (andy): check it out.
   4359         }
   4360 
   4361         // Find the break positions using forward iteration
   4362         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4363         if (useUText) {
   4364             UErrorCode status = U_ZERO_ERROR;
   4365             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4366             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4367             bi->setText(testUText, status);
   4368             TEST_ASSERT_SUCCESS(status);
   4369             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4370                                       //  This UText can be closed immediately, so long as the
   4371                                       //  testText string continues to exist.
   4372         } else {
   4373             bi->setText(testText);
   4374         }
   4375 
   4376         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4377             if (i < 0 || i > testText.length()) {
   4378                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4379                 break;
   4380             }
   4381             forwardBreaks[i] = 1;
   4382         }
   4383 
   4384         // Find the break positions using reverse iteration
   4385         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4386         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4387             if (i < 0 || i > testText.length()) {
   4388                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4389                 break;
   4390             }
   4391             reverseBreaks[i] = 1;
   4392         }
   4393 
   4394         // Find the break positions using isBoundary() tests.
   4395         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4396         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4397         for (i=0; i<=testText.length(); i++) {
   4398             isBoundaryBreaks[i] = bi->isBoundary(i);
   4399         }
   4400 
   4401 
   4402         // Find the break positions using the following() function.
   4403         // printf(".");
   4404         memset(followingBreaks, 0, sizeof(followingBreaks));
   4405         int32_t   lastBreakPos = 0;
   4406         followingBreaks[0] = 1;
   4407         for (i=0; i<testText.length(); i++) {
   4408             breakPos = bi->following(i);
   4409             if (breakPos <= i ||
   4410                 breakPos < lastBreakPos ||
   4411                 breakPos > testText.length() ||
   4412                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4413                 errln("%s break monkey test: "
   4414                     "Out of range value returned by BreakIterator::following().\n"
   4415                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4416                          name, seed, i, breakPos, lastBreakPos);
   4417                 break;
   4418             }
   4419             followingBreaks[breakPos] = 1;
   4420             lastBreakPos = breakPos;
   4421         }
   4422 
   4423         // Find the break positions using the preceding() function.
   4424         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4425         lastBreakPos = testText.length();
   4426         precedingBreaks[testText.length()] = 1;
   4427         for (i=testText.length(); i>0; i--) {
   4428             breakPos = bi->preceding(i);
   4429             if (breakPos >= i ||
   4430                 breakPos > lastBreakPos ||
   4431                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4432                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4433                 errln("%s break monkey test: "
   4434                     "Out of range value returned by BreakIterator::preceding().\n"
   4435                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4436                     name,  i, breakPos, lastBreakPos);
   4437                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4438                     precedingBreaks[i] = 2;   // Forces an error.
   4439                 }
   4440             } else {
   4441                 if (breakPos >= 0) {
   4442                     precedingBreaks[breakPos] = 1;
   4443                 }
   4444                 lastBreakPos = breakPos;
   4445             }
   4446         }
   4447 
   4448         // Compare the expected and actual results.
   4449         for (i=0; i<=testText.length(); i++) {
   4450             const char *errorType = NULL;
   4451             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4452                 errorType = "next()";
   4453             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4454                 errorType = "previous()";
   4455             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4456                 errorType = "isBoundary()";
   4457             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4458                 errorType = "following()";
   4459             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4460                 errorType = "preceding()";
   4461             }
   4462 
   4463 
   4464             if (errorType != NULL) {
   4465                 // Format a range of the test text that includes the failure as
   4466                 //  a data item that can be included in the rbbi test data file.
   4467 
   4468                 // Start of the range is the last point where expected and actual results
   4469                 //   both agreed that there was a break position.
   4470                 int startContext = i;
   4471                 int32_t count = 0;
   4472                 for (;;) {
   4473                     if (startContext==0) { break; }
   4474                     startContext --;
   4475                     if (expectedBreaks[startContext] != 0) {
   4476                         if (count == 2) break;
   4477                         count ++;
   4478                     }
   4479                 }
   4480 
   4481                 // End of range is two expected breaks past the start position.
   4482                 int endContext = i + 1;
   4483                 int ci;
   4484                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4485                     for (;;) {
   4486                         if (endContext >= testText.length()) {break;}
   4487                         if (expectedBreaks[endContext-1] != 0) {
   4488                             if (count == 0) break;
   4489                             count --;
   4490                         }
   4491                         endContext ++;
   4492                     }
   4493                 }
   4494 
   4495                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4496                 UnicodeString errorText = "<data>";
   4497                 /***if (strcmp(errorType, "next()") == 0) {
   4498                     startContext = 0;
   4499                     endContext = testText.length();
   4500 
   4501                     printStringBreaks(testText, expected, expectedCount);
   4502                 }***/
   4503 
   4504                 for (ci=startContext; ci<endContext;) {
   4505                     UnicodeString hexChars("0123456789abcdef");
   4506                     UChar32  c;
   4507                     int      bn;
   4508                     c = testText.char32At(ci);
   4509                     if (ci == i) {
   4510                         // This is the location of the error.
   4511                         errorText.append("<?>");
   4512                     } else if (expectedBreaks[ci] != 0) {
   4513                         // This a non-error expected break position.
   4514                         errorText.append("\\");
   4515                     }
   4516                     if (c < 0x10000) {
   4517                         errorText.append("\\u");
   4518                         for (bn=12; bn>=0; bn-=4) {
   4519                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4520                         }
   4521                     } else {
   4522                         errorText.append("\\U");
   4523                         for (bn=28; bn>=0; bn-=4) {
   4524                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4525                         }
   4526                     }
   4527                     ci = testText.moveIndex32(ci, 1);
   4528                 }
   4529                 errorText.append("\\");
   4530                 errorText.append("</data>\n");
   4531 
   4532                 // Output the error
   4533                 char  charErrorTxt[500];
   4534                 UErrorCode status = U_ZERO_ERROR;
   4535                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4536                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4537                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4538 
   4539                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4540                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4541                     errorType, seed, i, charErrorTxt);
   4542                 break;
   4543             }
   4544         }
   4545 
   4546         loopCount++;
   4547     }
   4548 #endif
   4549 }
   4550 
   4551 
   4552 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4553 //             This test checks the initial patch,
   4554 //             which is to just keep it from crashing.  Correct word boundaries
   4555 //             await a proper fix to the dictionary code.
   4556 //
   4557 void RBBITest::TestBug5532(void)  {
   4558    // Text includes a mixture of Thai and Latin.
   4559    const unsigned char utf8Data[] = {
   4560            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4561            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4562            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4563            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4564            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4565            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4566            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4567            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4568            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4569            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4570            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4571 
   4572     UErrorCode status = U_ZERO_ERROR;
   4573     UText utext=UTEXT_INITIALIZER;
   4574     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4575     TEST_ASSERT_SUCCESS(status);
   4576 
   4577     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4578     TEST_ASSERT_SUCCESS(status);
   4579     if (U_SUCCESS(status)) {
   4580         bi->setText(&utext, status);
   4581         TEST_ASSERT_SUCCESS(status);
   4582 
   4583         int32_t breakCount = 0;
   4584         int32_t previousBreak = -1;
   4585         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4586             // For now, just make sure that the break iterator doesn't hang.
   4587             TEST_ASSERT(previousBreak < bi->current());
   4588             previousBreak = bi->current();
   4589         }
   4590         TEST_ASSERT(breakCount > 0);
   4591     }
   4592     delete bi;
   4593     utext_close(&utext);
   4594 }
   4595 
   4596 
   4597 void RBBITest::TestBug9983(void)  {
   4598     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4599                                        "\\uFF65"  //   Other
   4600                                        "\\u309C"  //   Katakana
   4601                                        "\\uFF9F"  //   Extend
   4602                                        "\\uFF65"  //   Other
   4603                                        "\\u0020"  //   Other
   4604                                        "\\u0000").unescape();
   4605 
   4606     UErrorCode status = U_ZERO_ERROR;
   4607     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4608         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4609     TEST_ASSERT_SUCCESS(status);
   4610     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
   4611         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
   4612     TEST_ASSERT_SUCCESS(status);
   4613     if (U_FAILURE(status)) {
   4614         return;
   4615     }
   4616     int32_t offset, rstatus, iterationCount;
   4617 
   4618     brkiter->setText(text);
   4619     brkiter->last();
   4620     iterationCount = 0;
   4621     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4622         iterationCount++;
   4623         rstatus = brkiter->getRuleStatus();
   4624         (void)rstatus;     // Suppress set but not used warning.
   4625         if (iterationCount >= 10) {
   4626            break;
   4627         }
   4628     }
   4629     TEST_ASSERT(iterationCount == 6);
   4630 
   4631     brkiterPOSIX->setText(text);
   4632     brkiterPOSIX->last();
   4633     iterationCount = 0;
   4634     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
   4635         iterationCount++;
   4636         rstatus = brkiterPOSIX->getRuleStatus();
   4637         (void)rstatus;     // Suppress set but not used warning.
   4638         if (iterationCount >= 10) {
   4639            break;
   4640         }
   4641     }
   4642     TEST_ASSERT(iterationCount == 6);
   4643 }
   4644 
   4645 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
   4646 //
   4647 void RBBITest::TestBug7547() {
   4648     UnicodeString rules;
   4649     UErrorCode status = U_ZERO_ERROR;
   4650     UParseError parseError;
   4651     RuleBasedBreakIterator breakIterator(rules, parseError, status);
   4652     if (status != U_BRK_RULE_SYNTAX) {
   4653         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
   4654     }
   4655     if (parseError.line != 1 || parseError.offset != 0) {
   4656         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
   4657     }
   4658 }
   4659 
   4660 
   4661 void RBBITest::TestBug12797() {
   4662     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
   4663     UErrorCode status = U_ZERO_ERROR;
   4664     UParseError parseError;
   4665     RuleBasedBreakIterator bi(rules, parseError, status);
   4666     if (U_FAILURE(status)) {
   4667         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
   4668         return;
   4669     }
   4670     UnicodeString text = "abc";
   4671     bi.setText(text);
   4672     bi.first();
   4673     int32_t boundary = bi.next();
   4674     if (boundary != 3) {
   4675         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
   4676     }
   4677 }
   4678 
   4679 void RBBITest::TestBug12918() {
   4680     // This test triggers an assertion failure in dictbe.cpp
   4681     const UChar *crasherString = u"\u3325\u4a16";
   4682     UErrorCode status = U_ZERO_ERROR;
   4683     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
   4684     if (U_FAILURE(status)) {
   4685         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
   4686         return;
   4687     }
   4688     ubrk_first(iter);
   4689     int32_t pos = 0;
   4690     int32_t lastPos = -1;
   4691     while((pos = ubrk_next(iter)) != UBRK_DONE) {
   4692         if (pos <= lastPos) {
   4693             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
   4694             break;
   4695         }
   4696     }
   4697     ubrk_close(iter);
   4698 }
   4699 
   4700 void RBBITest::TestBug12932() {
   4701     // Node Stack overflow in the RBBI rule parser caused a seg fault.
   4702     UnicodeString ruleStr(
   4703             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
   4704             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
   4705             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
   4706             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
   4707             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
   4708             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
   4709 
   4710     UErrorCode status = U_ZERO_ERROR;
   4711     UParseError parseError;
   4712     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
   4713     if (status != U_BRK_RULE_SYNTAX) {
   4714         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
   4715                 __FILE__, __LINE__, u_errorName(status));
   4716     }
   4717 }
   4718 
   4719 
   4720 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
   4721 //             remain undevided by ICU char, word and line break.
   4722 void RBBITest::TestEmoji() {
   4723 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4724     UErrorCode  status = U_ZERO_ERROR;
   4725 
   4726     CharString testFileName;
   4727     testFileName.append(IntlTest::getSourceTestData(status), status);
   4728     testFileName.appendPathPart("emoji-test.txt", status);
   4729     if (U_FAILURE(status)) {
   4730         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
   4731         return;
   4732     }
   4733     logln("Opening data file %s\n", testFileName.data());
   4734 
   4735     int    len;
   4736     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
   4737     if (U_FAILURE(status) || testFile == NULL) {
   4738         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
   4739         return;
   4740     }
   4741     UnicodeString testFileAsString(testFile, len);
   4742     delete [] testFile;
   4743 
   4744     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
   4745     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
   4746     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
   4747     int32_t lineNumber = 0;
   4748 
   4749     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
   4750     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
   4751     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
   4752     if (U_FAILURE(status)) {
   4753         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
   4754         return;
   4755     }
   4756 
   4757     while (lineMatcher.find()) {
   4758         ++lineNumber;
   4759         UnicodeString line = lineMatcher.group(status);
   4760         hexMatcher.reset(line);
   4761         UnicodeString testString;   // accumulates the emoji sequence.
   4762         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
   4763             UnicodeString hex = hexMatcher.group(1, status);
   4764             if (hex.length() > 8) {
   4765                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
   4766                 break;
   4767             }
   4768             CharString hex8;
   4769             hex8.appendInvariantChars(hex, status);
   4770             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
   4771             if (c<=0x10ffff) {
   4772                 testString.append(c);
   4773             } else {
   4774                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
   4775                         __FILE__, __LINE__, lineNumber, hex8.data());
   4776                 break;
   4777             }
   4778         }
   4779 
   4780         if (testString.length() > 1) {
   4781             charBreaks->setText(testString);
   4782             charBreaks->first();
   4783             int32_t firstBreak = charBreaks->next();
   4784             if (testString.length() != firstBreak) {
   4785                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
   4786                         __FILE__, __LINE__, lineNumber, firstBreak);
   4787             }
   4788             wordBreaks->setText(testString);
   4789             wordBreaks->first();
   4790             firstBreak = wordBreaks->next();
   4791             if (testString.length() != firstBreak) {
   4792                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
   4793                         __FILE__, __LINE__, lineNumber, firstBreak);
   4794             }
   4795             lineBreaks->setText(testString);
   4796             lineBreaks->first();
   4797             firstBreak = lineBreaks->next();
   4798             if (testString.length() != firstBreak) {
   4799                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
   4800                         __FILE__, __LINE__, lineNumber, firstBreak);
   4801             }
   4802         }
   4803     }
   4804 #endif
   4805 }
   4806 
   4807 
   4808 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
   4809 
   4810 // WHERE Macro yields a literal string of the form "source_file_name:line number "
   4811 // TODO: propose something equivalent as a test framework addition.
   4812 
   4813 #define WHERE __FILE__ ":" XLINE(__LINE__) " "
   4814 #define XLINE(s) LINE(s)
   4815 #define LINE(s) #s
   4816 
   4817 void RBBITest::TestBug12519() {
   4818     UErrorCode status = U_ZERO_ERROR;
   4819     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
   4820     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
   4821     if (!assertSuccess(WHERE, status)) {
   4822         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
   4823         return;
   4824     }
   4825     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
   4826 
   4827     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
   4828     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
   4829 
   4830     LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
   4831     assertTrue(WHERE, *biEn == *cloneEn);
   4832     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
   4833 
   4834     LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
   4835     assertTrue(WHERE, *biFr == *cloneFr);
   4836     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
   4837 
   4838     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
   4839     UnicodeString text("Hallo Welt");
   4840     biDe->setText(text);
   4841     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
   4842     *biDe = *biFr;
   4843     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
   4844 }
   4845 
   4846 //
   4847 //  TestDebug    -  A place-holder test for debugging purposes.
   4848 //                  For putting in fragments of other tests that can be invoked
   4849 //                  for tracing  without a lot of unwanted extra stuff happening.
   4850 //
   4851 void RBBITest::TestDebug(void) {
   4852 }
   4853 
   4854 void RBBITest::TestProperties() {
   4855     UErrorCode errorCode = U_ZERO_ERROR;
   4856     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4857     if (!prependSet.isEmpty()) {
   4858         errln(
   4859             "[:GCB=Prepend:] is not empty any more. "
   4860             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4861             "change this test to the opposite condition.");
   4862     }
   4863 }
   4864 
   4865 #endif // #if !UCONFIG_NO_BREAK_ITERATION
   4866