Home | History | Annotate | Download | only in intltest
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /********************************************************************
      4  * COPYRIGHT:
      5  * Copyright (c) 1999-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ********************************************************************/
      8 /************************************************************************
      9 *   Date        Name        Description
     10 *   12/15/99    Madhu        Creation.
     11 *   01/12/2000  Madhu        Updated for changed API and added new tests
     12 ************************************************************************/
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include <stdio.h>
     18 #include <stdlib.h>
     19 #include <string.h>
     20 
     21 #include "unicode/brkiter.h"
     22 #include "unicode/localpointer.h"
     23 #include "unicode/numfmt.h"
     24 #include "unicode/rbbi.h"
     25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     26 #include "unicode/regex.h"
     27 #endif
     28 #include "unicode/schriter.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/utf16.h"
     31 #include "unicode/ucnv.h"
     32 #include "unicode/uniset.h"
     33 #include "unicode/uscript.h"
     34 #include "unicode/ustring.h"
     35 #include "unicode/utext.h"
     36 
     37 #include "charstr.h"
     38 #include "cmemory.h"
     39 #include "intltest.h"
     40 #include "rbbitst.h"
     41 #include "utypeinfo.h"  // for 'typeid' to work
     42 #include "uvector.h"
     43 #include "uvectr32.h"
     44 
     45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
     46 #include "unicode/filteredbrk.h"
     47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
     48 
     49 #define TEST_ASSERT(x) {if (!(x)) { \
     50     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     51 
     52 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     53     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     54 
     55 
     56 //---------------------------------------------
     57 // runIndexedTest
     58 //---------------------------------------------
     59 
     60 
     61 //  Note:  Before adding new tests to this file, check whether the desired test data can
     62 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
     63 //         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
     65 //         will run there as well, without additional effort.
     66 
     67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     68 {
     69     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     70     fTestParams = params;
     71 
     72     TESTCASE_AUTO_BEGIN;
     73 #if !UCONFIG_NO_FILE_IO
     74     TESTCASE_AUTO(TestBug4153072);
     75 #endif
     76     TESTCASE_AUTO(TestStatusReturn);
     77 #if !UCONFIG_NO_FILE_IO
     78     TESTCASE_AUTO(TestUnicodeFiles);
     79     TESTCASE_AUTO(TestEmptyString);
     80 #endif
     81     TESTCASE_AUTO(TestGetAvailableLocales);
     82     TESTCASE_AUTO(TestGetDisplayName);
     83 #if !UCONFIG_NO_FILE_IO
     84     TESTCASE_AUTO(TestEndBehaviour);
     85     TESTCASE_AUTO(TestWordBreaks);
     86     TESTCASE_AUTO(TestWordBoundary);
     87     TESTCASE_AUTO(TestLineBreaks);
     88     TESTCASE_AUTO(TestSentBreaks);
     89     TESTCASE_AUTO(TestExtended);
     90 #endif
     91 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
     92     TESTCASE_AUTO(TestMonkey);
     93 #endif
     94 #if !UCONFIG_NO_FILE_IO
     95     TESTCASE_AUTO(TestBug3818);
     96 #endif
     97     TESTCASE_AUTO(TestDebug);
     98 #if !UCONFIG_NO_FILE_IO
     99     TESTCASE_AUTO(TestBug5775);
    100 #endif
    101     TESTCASE_AUTO(TestBug9983);
    102     TESTCASE_AUTO(TestDictRules);
    103     TESTCASE_AUTO(TestBug5532);
    104     TESTCASE_AUTO(TestBug7547);
    105     TESTCASE_AUTO(TestBug12797);
    106     TESTCASE_AUTO(TestBug12918);
    107     TESTCASE_AUTO_END;
    108 }
    109 
    110 
    111 //---------------------------------------------------------------------------
    112 //
    113 //   class BITestData   Holds a set of Break iterator test data and results
    114 //                      Includes
    115 //                         - the string data to be broken
    116 //                         - a vector of the expected break positions.
    117 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
    119 //                         - The expected break tag values.
    120 //                         - Vectors of actual break positions and tag values.
    121 //                         - Functions for comparing actual with expected and
    122 //                            reporting errors.
    123 //
    124 //----------------------------------------------------------------------------
    125 class BITestData {
    126 public:
    127     UnicodeString    fDataToBreak;
    128     UVector          fExpectedBreakPositions;
    129     UVector          fExpectedTags;
    130     UVector          fLineNum;
    131     UVector          fActualBreakPositions;   // Test Results.
    132     UVector          fActualTags;
    133 
    134     BITestData(UErrorCode &status);
    135     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    136     void             checkResults(const char *heading, RBBITest *test);
    137     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    138     void             clearResults();
    139 };
    140 
    141 //
    142 // Constructor.
    143 //
    144 BITestData::BITestData(UErrorCode &status)
    145 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    146   fActualTags(status)
    147 {
    148 }
    149 
    150 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
    152 //                 The macro form collects the line number, which is helpful
    153 //                 when tracking down failures.
    154 //
    155 //                 A null data item is inserted at the start of each test's data
    156 //                  to put the starting zero into the data list.  The position saved for
    157 //                  each non-null item is its ending position.
    158 //
    159 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    160 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    161     if (U_FAILURE(status)) {return;}
    162     if (data != NULL) {
    163         fDataToBreak.append(CharsToUnicodeString(data));
    164     }
    165     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    166     fExpectedTags.addElement(tag, status);
    167     fLineNum.addElement(lineNum, status);
    168 }
    169 
    170 
    171 //
    172 //  checkResults.   Compare the actual and expected break positions, report any differences.
    173 //
    174 void BITestData::checkResults(const char *heading, RBBITest *test) {
    175     int32_t   expectedIndex = 0;
    176     int32_t   actualIndex = 0;
    177 
    178     for (;;) {
    179         // If we've run through both the expected and actual results vectors, we're done.
    180         //   break out of the loop.
    181         if (expectedIndex >= fExpectedBreakPositions.size() &&
    182             actualIndex   >= fActualBreakPositions.size()) {
    183             break;
    184         }
    185 
    186 
    187         if (expectedIndex >= fExpectedBreakPositions.size()) {
    188             err(heading, test, expectedIndex-1, actualIndex);
    189             actualIndex++;
    190             continue;
    191         }
    192 
    193         if (actualIndex >= fActualBreakPositions.size()) {
    194             err(heading, test, expectedIndex, actualIndex-1);
    195             expectedIndex++;
    196             continue;
    197         }
    198 
    199         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    200             err(heading, test, expectedIndex, actualIndex);
    201             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    202             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    203                 actualIndex++;
    204             } else {
    205                 expectedIndex++;
    206             }
    207             continue;
    208         }
    209 
    210         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    211             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    212                 heading, fLineNum.elementAt(expectedIndex),
    213                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    214         }
    215 
    216         actualIndex++;
    217         expectedIndex++;
    218     }
    219 }
    220 
    221 //
    222 //  err   -  An error was found.  Report it, along with information about where the
    223 //                                incorrectly broken test data appeared in the source file.
    224 //
    225 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    226 {
    227     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    228     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    229     int32_t   o        = 0;
    230     int32_t   line     = fLineNum.elementAti(expectedIdx);
    231     if (expectedIdx > 0) {
    232         // The line numbers are off by one because a premature break occurs somewhere
    233         //    within the previous item, rather than at the start of the current (expected) item.
    234         //    We want to report the offset of the unexpected break from the start of
    235         //      this previous item.
    236         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    237     }
    238     if (actual < expected) {
    239         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    240     } else {
    241         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    242     }
    243 }
    244 
    245 
    246 void BITestData::clearResults() {
    247     fActualBreakPositions.removeAllElements();
    248     fActualTags.removeAllElements();
    249 }
    250 
    251 
    252 //--------------------------------------------------------------------------------------
    253 //
    254 //    RBBITest    constructor and destructor
    255 //
    256 //--------------------------------------------------------------------------------------
    257 
    258 RBBITest::RBBITest() {
    259     fTestParams = NULL;
    260 }
    261 
    262 
    263 RBBITest::~RBBITest() {
    264 }
    265 
    266 //-----------------------------------------------------------------------------------
    267 //
    268 //   Test for status {tag} return value from break rules.
    269 //        TODO:  a more thorough test.
    270 //
    271 //-----------------------------------------------------------------------------------
    272 void RBBITest::TestStatusReturn() {
    273      UnicodeString rulesString1("$Letters = [:L:];\n"
    274                                   "$Numbers = [:N:];\n"
    275                                   "$Letters+{1};\n"
    276                                   "$Numbers+{2};\n"
    277                                   "Help\\ /me\\!{4};\n"
    278                                   "[^$Letters $Numbers];\n"
    279                                   "!.*;\n", -1, US_INV);
    280      UnicodeString testString1  = "abc123..abc Help me Help me!";
    281                                 // 01234567890123456789012345678
    282      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    283      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    284 
    285      UErrorCode status=U_ZERO_ERROR;
    286      UParseError    parseError;
    287 
    288      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
    289      if(U_FAILURE(status)) {
    290          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
    291          return;
    292      }
    293      int32_t  pos;
    294      int32_t  i = 0;
    295      bi->setText(testString1);
    296      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    297          if (pos != bounds1[i]) {
    298              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
    299              break;
    300          }
    301 
    302          int tag = bi->getRuleStatus();
    303          if (tag != brkStatus[i]) {
    304              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
    305              break;
    306          }
    307          i++;
    308      }
    309 }
    310 
    311 
    312 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
    313     UErrorCode status = U_ZERO_ERROR;
    314     char name[100];
    315     printf("code    alpha extend alphanum type word sent line name\n");
    316     int nextExpectedIndex = 0;
    317     utext_setNativeIndex(tstr, 0);
    318     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
    319         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
    320             printf("------------------------------------------------ %d\n", j);
    321             ++nextExpectedIndex;
    322         }
    323 
    324         UChar32 c = utext_next32(tstr);
    325         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    326         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    327                            u_isUAlphabetic(c),
    328                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    329                            u_isalnum(c),
    330                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    331                                                   u_charType(c),
    332                                                   U_SHORT_PROPERTY_NAME),
    333                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    334                                                   u_getIntPropertyValue(c,
    335                                                           UCHAR_WORD_BREAK),
    336                                                   U_SHORT_PROPERTY_NAME),
    337                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    338                                    u_getIntPropertyValue(c,
    339                                            UCHAR_SENTENCE_BREAK),
    340                                    U_SHORT_PROPERTY_NAME),
    341                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    342                                    u_getIntPropertyValue(c,
    343                                            UCHAR_LINE_BREAK),
    344                                    U_SHORT_PROPERTY_NAME),
    345                            name);
    346     }
    347 }
    348 
    349 
    350 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
    351    UErrorCode status = U_ZERO_ERROR;
    352    UText *tstr = NULL;
    353    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
    354    if (U_FAILURE(status)) {
    355        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
    356        return;
    357     }
    358    printStringBreaks(tstr, expected, expectedCount);
    359    utext_close(tstr);
    360 }
    361 
    362 
    363 void RBBITest::TestBug3818() {
    364     UErrorCode  status = U_ZERO_ERROR;
    365 
    366     // Four Thai words...
    367     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    368                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    369     UnicodeString  thaiStr(thaiWordData);
    370 
    371     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
    372     if (U_FAILURE(status) || bi == NULL) {
    373         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    374         return;
    375     }
    376     bi->setText(thaiStr);
    377 
    378     int32_t  startOfSecondWord = bi->following(1);
    379     if (startOfSecondWord != 4) {
    380         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    381             __FILE__, __LINE__, startOfSecondWord);
    382     }
    383     startOfSecondWord = bi->following(0);
    384     if (startOfSecondWord != 4) {
    385         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    386             __FILE__, __LINE__, startOfSecondWord);
    387     }
    388     delete bi;
    389 }
    390 
    391 //----------------------------------------------------------------------------
    392 //
    393 // generalIteratorTest      Given a break iterator and a set of test data,
    394 //                          Run the tests and report the results.
    395 //
    396 //----------------------------------------------------------------------------
    397 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    398 {
    399 
    400     bi.setText(td.fDataToBreak);
    401 
    402     testFirstAndNext(bi, td);
    403 
    404     testLastAndPrevious(bi, td);
    405 
    406     testFollowing(bi, td);
    407     testPreceding(bi, td);
    408     testIsBoundary(bi, td);
    409     doMultipleSelectionTest(bi, td);
    410 }
    411 
    412 
    413 //
    414 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    415 //                       kind of loop.
    416 //
    417 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    418 {
    419     UErrorCode  status = U_ZERO_ERROR;
    420     int32_t     p;
    421     int32_t     lastP = -1;
    422     int32_t     tag;
    423 
    424     logln("Test first and next");
    425     bi.setText(td.fDataToBreak);
    426     td.clearResults();
    427 
    428     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    429         td.fActualBreakPositions.addElement(p, status);  // Save result.
    430         tag = bi.getRuleStatus();
    431         td.fActualTags.addElement(tag, status);
    432         if (p <= lastP) {
    433             // If the iterator is not making forward progress, stop.
    434             //  No need to raise an error here, it'll be detected in the normal check of results.
    435             break;
    436         }
    437         lastP = p;
    438     }
    439     td.checkResults("testFirstAndNext", this);
    440 }
    441 
    442 
    443 //
    444 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    445 //
    446 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    447 {
    448     UErrorCode  status = U_ZERO_ERROR;
    449     int32_t     p;
    450     int32_t     lastP  = 0x7ffffffe;
    451     int32_t     tag;
    452 
    453     logln("Test last and previous");
    454     bi.setText(td.fDataToBreak);
    455     td.clearResults();
    456 
    457     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    458         // Save break position.  Insert it at start of vector of results, shoving
    459         //    already-saved results further towards the end.
    460         td.fActualBreakPositions.insertElementAt(p, 0, status);
    461         // bi.previous();   // TODO:  Why does this fix things up????
    462         // bi.next();
    463         tag = bi.getRuleStatus();
    464         td.fActualTags.insertElementAt(tag, 0, status);
    465         if (p >= lastP) {
    466             // If the iterator is not making progress, stop.
    467             //  No need to raise an error here, it'll be detected in the normal check of results.
    468             break;
    469         }
    470         lastP = p;
    471     }
    472     td.checkResults("testLastAndPrevious", this);
    473 }
    474 
    475 
    476 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    477 {
    478     UErrorCode  status = U_ZERO_ERROR;
    479     int32_t     p;
    480     int32_t     tag;
    481     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    482                                  //   cannot be -1; that is returned for DONE.
    483     int         i;
    484 
    485     logln("testFollowing():");
    486     bi.setText(td.fDataToBreak);
    487     td.clearResults();
    488 
    489     // Save the starting point, since we won't get that out of following.
    490     p = bi.first();
    491     td.fActualBreakPositions.addElement(p, status);  // Save result.
    492     tag = bi.getRuleStatus();
    493     td.fActualTags.addElement(tag, status);
    494 
    495     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    496         p = bi.following(i);
    497         if (p != lastP) {
    498             if (p == RuleBasedBreakIterator::DONE) {
    499                 break;
    500             }
    501             // We've reached a new break position.  Save it.
    502             td.fActualBreakPositions.addElement(p, status);  // Save result.
    503             tag = bi.getRuleStatus();
    504             td.fActualTags.addElement(tag, status);
    505             lastP = p;
    506         }
    507     }
    508     // The loop normally exits by means of the break in the middle.
    509     // Make sure that the index was at the correct position for the break iterator to have
    510     //   returned DONE.
    511     if (i != td.fDataToBreak.length()) {
    512         errln("testFollowing():  iterator returned DONE prematurely.");
    513     }
    514 
    515     // Full check of all results.
    516     td.checkResults("testFollowing", this);
    517 }
    518 
    519 
    520 
    521 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    522     UErrorCode  status = U_ZERO_ERROR;
    523     int32_t     p;
    524     int32_t     tag;
    525     int32_t     lastP  = 0x7ffffffe;
    526     int         i;
    527 
    528     logln("testPreceding():");
    529     bi.setText(td.fDataToBreak);
    530     td.clearResults();
    531 
    532     p = bi.last();
    533     td.fActualBreakPositions.addElement(p, status);
    534     tag = bi.getRuleStatus();
    535     td.fActualTags.addElement(tag, status);
    536 
    537     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    538         p = bi.preceding(i);
    539         if (p != lastP) {
    540             if (p == RuleBasedBreakIterator::DONE) {
    541                 break;
    542             }
    543             // We've reached a new break position.  Save it.
    544             td.fActualBreakPositions.insertElementAt(p, 0, status);
    545             lastP = p;
    546             tag = bi.getRuleStatus();
    547             td.fActualTags.insertElementAt(tag, 0, status);
    548         }
    549     }
    550     // The loop normally exits by means of the break in the middle.
    551     // Make sure that the index was at the correct position for the break iterator to have
    552     //   returned DONE.
    553     if (i != 0) {
    554         errln("testPreceding():  iterator returned DONE prematurely.");
    555     }
    556 
    557     // Full check of all results.
    558     td.checkResults("testPreceding", this);
    559 }
    560 
    561 
    562 
    563 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    564     UErrorCode  status = U_ZERO_ERROR;
    565     int         i;
    566     int32_t     tag;
    567 
    568     logln("testIsBoundary():");
    569     bi.setText(td.fDataToBreak);
    570     td.clearResults();
    571 
    572     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    573         if (bi.isBoundary(i)) {
    574             td.fActualBreakPositions.addElement(i, status);  // Save result.
    575             tag = bi.getRuleStatus();
    576             td.fActualTags.addElement(tag, status);
    577         }
    578     }
    579     td.checkResults("testIsBoundary: ", this);
    580 }
    581 
    582 
    583 
    584 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    585 {
    586     iterator.setText(td.fDataToBreak);
    587 
    588     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    589     int32_t offset = iterator.first();
    590     int32_t testOffset;
    591     int32_t count = 0;
    592 
    593     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    594 
    595     if (*testIterator != iterator)
    596         errln("clone() or operator!= failed: two clones compared unequal");
    597 
    598     do {
    599         testOffset = testIterator->first();
    600         testOffset = testIterator->next(count);
    601         if (offset != testOffset)
    602             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    603 
    604         if (offset != RuleBasedBreakIterator::DONE) {
    605             count++;
    606             offset = iterator.next();
    607 
    608             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    609                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    610                 if (count > 10000 || offset == -1) {
    611                     errln("operator== failed too many times. Stopping test.");
    612                     if (offset == -1) {
    613                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    614                     }
    615                     return;
    616                 }
    617             }
    618         }
    619     } while (offset != RuleBasedBreakIterator::DONE);
    620 
    621     // now do it backwards...
    622     offset = iterator.last();
    623     count = 0;
    624 
    625     do {
    626         testOffset = testIterator->last();
    627         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    628         if (offset != testOffset)
    629             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    630 
    631         if (offset != RuleBasedBreakIterator::DONE) {
    632             count--;
    633             offset = iterator.previous();
    634         }
    635     } while (offset != RuleBasedBreakIterator::DONE);
    636 
    637     delete testIterator;
    638 }
    639 
    640 
    641 //---------------------------------------------
    642 //
    643 //     other tests
    644 //
    645 //---------------------------------------------
    646 void RBBITest::TestEmptyString()
    647 {
    648     UnicodeString text = "";
    649     UErrorCode status = U_ZERO_ERROR;
    650 
    651     BITestData x(status);
    652     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    653     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    654     if (U_FAILURE(status))
    655     {
    656         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    657         return;
    658     }
    659     generalIteratorTest(*bi, x);
    660     delete bi;
    661 }
    662 
    663 void RBBITest::TestGetAvailableLocales()
    664 {
    665     int32_t locCount = 0;
    666     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    667 
    668     if (locCount == 0)
    669         dataerrln("getAvailableLocales() returned an empty list!");
    670     // Just make sure that it's returning good memory.
    671     int32_t i;
    672     for (i = 0; i < locCount; ++i) {
    673         logln(locList[i].getName());
    674     }
    675 }
    676 
    677 //Testing the BreakIterator::getDisplayName() function
    678 void RBBITest::TestGetDisplayName()
    679 {
    680     UnicodeString   result;
    681 
    682     BreakIterator::getDisplayName(Locale::getUS(), result);
    683     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    684         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    685                 + result);
    686 
    687     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    688     if (result != "French (France)")
    689         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    690                 + result);
    691 }
    692 /**
    693  * Test End Behaviour
    694  * @bug 4068137
    695  */
    696 void RBBITest::TestEndBehaviour()
    697 {
    698     UErrorCode status = U_ZERO_ERROR;
    699     UnicodeString testString("boo.");
    700     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    701     if (U_FAILURE(status))
    702     {
    703         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    704         return;
    705     }
    706     wb->setText(testString);
    707 
    708     if (wb->first() != 0)
    709         errln("Didn't get break at beginning of string.");
    710     if (wb->next() != 3)
    711         errln("Didn't get break before period in \"boo.\"");
    712     if (wb->current() != 4 && wb->next() != 4)
    713         errln("Didn't get break at end of string.");
    714     delete wb;
    715 }
    716 /*
    717  * @bug 4153072
    718  */
    719 void RBBITest::TestBug4153072() {
    720     UErrorCode status = U_ZERO_ERROR;
    721     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    722     if (U_FAILURE(status))
    723     {
    724         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    725         return;
    726     }
    727     UnicodeString str("...Hello, World!...");
    728     int32_t begin = 3;
    729     int32_t end = str.length() - 3;
    730     UBool onBoundary;
    731 
    732     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    733     iter->adoptText(textIterator);
    734     int index;
    735     // Note: with the switch to UText, there is no way to restrict the
    736     //       iteration range to begin at an index other than zero.
    737     //       String character iterators created with a non-zero bound are
    738     //         treated by RBBI as being empty.
    739     for (index = -1; index < begin + 1; ++index) {
    740         onBoundary = iter->isBoundary(index);
    741         if (index == 0?  !onBoundary : onBoundary) {
    742             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    743                             " and begin index = " + begin);
    744         }
    745     }
    746     delete iter;
    747 }
    748 
    749 
    750 //
    751 // Test for problem reported by Ashok Matoria on 9 July 2007
    752 //    One.<kSoftHyphen><kSpace>Two.
    753 //
    754 //    Sentence break at start (0) and then on calling next() it breaks at
    755 //   'T' of "Two". Now, at this point if I do next() and
    756 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    757 //
    758 void RBBITest::TestBug5775() {
    759     UErrorCode status = U_ZERO_ERROR;
    760     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    761     TEST_ASSERT_SUCCESS(status);
    762     if (U_FAILURE(status)) {
    763         return;
    764     }
    765 // Check for status first for better handling of no data errors.
    766     TEST_ASSERT(bi != NULL);
    767     if (bi == NULL) {
    768         return;
    769     }
    770 
    771     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    772     //               01234      56789
    773     s = s.unescape();
    774     bi->setText(s);
    775     int pos = bi->next();
    776     TEST_ASSERT(pos == 6);
    777     pos = bi->next();
    778     TEST_ASSERT(pos == 10);
    779     pos = bi->previous();
    780     TEST_ASSERT(pos == 6);
    781     delete bi;
    782 }
    783 
    784 
    785 
    786 //------------------------------------------------------------------------------
    787 //
    788 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    789 //
    790 //------------------------------------------------------------------------------
    791 
    792 struct TestParams {
    793     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
    794                                            //   Changed out whenever test data changes break type.
    795 
    796     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
    797     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
    798     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
    799     UVector32       *srcCol;
    800 
    801     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
    802     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
    803     CharString       utf8String;           // UTF-8 form of text to break.
    804 
    805     TestParams(UErrorCode &status) : dataToBreak() {
    806         bi               = NULL;
    807         expectedBreaks   = new UVector32(status);
    808         srcLine          = new UVector32(status);
    809         srcCol           = new UVector32(status);
    810         textToBreak      = NULL;
    811         textMap          = new UVector32(status);
    812     }
    813 
    814     ~TestParams() {
    815         delete bi;
    816         delete expectedBreaks;
    817         delete srcLine;
    818         delete srcCol;
    819         utext_close(textToBreak);
    820         delete textMap;
    821     }
    822 
    823     int32_t getSrcLine(int32_t bp);
    824     int32_t getExpectedBreak(int32_t bp);
    825     int32_t getSrcCol(int32_t bp);
    826 
    827     void setUTF16(UErrorCode &status);
    828     void setUTF8(UErrorCode &status);
    829 };
    830 
    831 // Append a UnicodeString to a CharString with UTF-8 encoding.
    832 // Substitute any invalid chars.
    833 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
    834 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
    835     if (U_FAILURE(status)) {
    836         return;
    837     }
    838     int32_t utf8Length;
    839     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
    840                        src.getBuffer(), src.length(),   // UTF-16 data
    841                        0xfffd, NULL,                    // Substitution char, number of subs.
    842                        &status);
    843     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    844         return;
    845     }
    846     status = U_ZERO_ERROR;
    847     int32_t capacity;
    848     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
    849     u_strToUTF8WithSub(buffer, utf8Length, NULL,
    850                        src.getBuffer(), src.length(),
    851                        0xfffd, NULL, &status);
    852     dest.append(buffer, utf8Length, status);
    853 }
    854 
    855 
    856 void TestParams::setUTF16(UErrorCode &status) {
    857     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
    858     textMap->removeAllElements();
    859     for (int32_t i=0; i<dataToBreak.length(); i++) {
    860         if (i == dataToBreak.getChar32Start(i)) {
    861             textMap->addElement(i, status);
    862         } else {
    863             textMap->addElement(-1, status);
    864         }
    865     }
    866     textMap->addElement(dataToBreak.length(), status);
    867     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
    868 }
    869 
    870 
    871 void TestParams::setUTF8(UErrorCode &status) {
    872     if (U_FAILURE(status)) {
    873         return;
    874     }
    875     utf8String.clear();
    876     CharStringAppend(utf8String, dataToBreak, status);
    877     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
    878     if (U_FAILURE(status)) {
    879         return;
    880     }
    881 
    882     textMap->removeAllElements();
    883     int32_t utf16Index = 0;
    884     for (;;) {
    885         textMap->addElement(utf16Index, status);
    886         UChar32 c32 = utext_current32(textToBreak);
    887         if (c32 < 0) {
    888             break;
    889         }
    890         utf16Index += U16_LENGTH(c32);
    891         utext_next32(textToBreak);
    892         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
    893             textMap->addElement(-1, status);
    894         }
    895     }
    896     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
    897 }
    898 
    899 
    900 int32_t TestParams::getSrcLine(int32_t bp) {
    901     if (bp >= textMap->size()) {
    902         bp = textMap->size() - 1;
    903     }
    904     int32_t i = 0;
    905     for(; bp >= 0 ; --bp) {
    906         // Move to a character boundary if we are not on one already.
    907         i = textMap->elementAti(bp);
    908         if (i >= 0) {
    909             break;
    910         }
    911     }
    912     return srcLine->elementAti(i);
    913 }
    914 
    915 
    916 int32_t TestParams::getExpectedBreak(int32_t bp) {
    917     if (bp >= textMap->size()) {
    918         return 0;
    919     }
    920     int32_t i = textMap->elementAti(bp);
    921     int32_t retVal = 0;
    922     if (i >= 0) {
    923         retVal = expectedBreaks->elementAti(i);
    924     }
    925     return retVal;
    926 }
    927 
    928 
    929 int32_t TestParams::getSrcCol(int32_t bp) {
    930     if (bp >= textMap->size()) {
    931         bp = textMap->size() - 1;
    932     }
    933     int32_t i = 0;
    934     for(; bp >= 0; --bp) {
    935         // Move bp to a character boundary if we are not on one already.
    936         i = textMap->elementAti(bp);
    937         if (i >= 0) {
    938             break;
    939         }
    940     }
    941     return srcCol->elementAti(i);
    942 }
    943 
    944 
    945 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    946     int32_t    bp;
    947     int32_t    prevBP;
    948     int32_t    i;
    949 
    950     TEST_ASSERT_SUCCESS(status);
    951     if (U_FAILURE(status)) {
    952         return;
    953     }
    954 
    955     if (t->bi == NULL) {
    956         return;
    957     }
    958 
    959     t->bi->setText(t->textToBreak, status);
    960     //
    961     //  Run the iterator forward
    962     //
    963     prevBP = -1;
    964     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
    965         if (prevBP ==  bp) {
    966             // Fail for lack of forward progress.
    967             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
    968                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
    969             break;
    970         }
    971 
    972         // Check that there we didn't miss an expected break between the last one
    973         //  and this one.
    974         for (i=prevBP+1; i<bp; i++) {
    975             if (t->getExpectedBreak(i) != 0) {
    976                 int expected[] = {0, i};
    977                 printStringBreaks(t->dataToBreak, expected, 2);
    978                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    979                       i, t->getSrcLine(i), t->getSrcCol(i));
    980             }
    981         }
    982 
    983         // Check that the break we did find was expected
    984         if (t->getExpectedBreak(bp) == 0) {
    985             int expected[] = {0, bp};
    986             printStringBreaks(t->textToBreak, expected, 2);
    987             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    988                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
    989         } else {
    990             // The break was expected.
    991             //   Check that the {nnn} tag value is correct.
    992             int32_t expectedTagVal = t->getExpectedBreak(bp);
    993             if (expectedTagVal == -1) {
    994                 expectedTagVal = 0;
    995             }
    996             int32_t line = t->getSrcLine(bp);
    997             int32_t rs = t->bi->getRuleStatus();
    998             if (rs != expectedTagVal) {
    999                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1000                       "          Actual, Expected status = %4d, %4d",
   1001                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1002             }
   1003         }
   1004 
   1005         prevBP = bp;
   1006     }
   1007 
   1008     // Verify that there were no missed expected breaks after the last one found
   1009     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
   1010         if (t->getExpectedBreak(i) != 0) {
   1011             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1012                       i, t->getSrcLine(i), t->getSrcCol(i));
   1013         }
   1014     }
   1015 
   1016     //
   1017     //  Run the iterator backwards, verify that the same breaks are found.
   1018     //
   1019     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
   1020     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1021         if (prevBP ==  bp) {
   1022             // Fail for lack of progress.
   1023             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1024                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1025             break;
   1026         }
   1027 
   1028         // Check that we didn't miss an expected break between the last one
   1029         //  and this one.  (UVector returns zeros for index out of bounds.)
   1030         for (i=prevBP-1; i>bp; i--) {
   1031             if (t->getExpectedBreak(i) != 0) {
   1032                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1033                       i, t->getSrcLine(i), t->getSrcCol(i));
   1034             }
   1035         }
   1036 
   1037         // Check that the break we did find was expected
   1038         if (t->getExpectedBreak(bp) == 0) {
   1039             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1040                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1041         } else {
   1042             // The break was expected.
   1043             //   Check that the {nnn} tag value is correct.
   1044             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1045             if (expectedTagVal == -1) {
   1046                 expectedTagVal = 0;
   1047             }
   1048             int line = t->getSrcLine(bp);
   1049             int32_t rs = t->bi->getRuleStatus();
   1050             if (rs != expectedTagVal) {
   1051                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1052                       "          Actual, Expected status = %4d, %4d",
   1053                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1054             }
   1055         }
   1056 
   1057         prevBP = bp;
   1058     }
   1059 
   1060     // Verify that there were no missed breaks prior to the last one found
   1061     for (i=prevBP-1; i>=0; i--) {
   1062         if (t->getExpectedBreak(i) != 0) {
   1063             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1064                       i, t->getSrcLine(i), t->getSrcCol(i));
   1065         }
   1066     }
   1067 
   1068     // Check isBoundary()
   1069     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1070         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
   1071         UBool boundaryFound    = t->bi->isBoundary(i);
   1072         if (boundaryExpected != boundaryFound) {
   1073             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
   1074                   "        Expected, Actual= %s, %s",
   1075                   i, t->getSrcLine(i), t->getSrcCol(i),
   1076                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
   1077         }
   1078     }
   1079 
   1080     // Check following()
   1081     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1082         int32_t actualBreak = t->bi->following(i);
   1083         int32_t expectedBreak = BreakIterator::DONE;
   1084         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
   1085             if (t->getExpectedBreak(j) != 0) {
   1086                 expectedBreak = j;
   1087                 break;
   1088             }
   1089         }
   1090         if (expectedBreak != actualBreak) {
   1091             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
   1092                   "        Expected, Actual= %d, %d",
   1093                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1094         }
   1095     }
   1096 
   1097     // Check preceding()
   1098     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
   1099         int32_t actualBreak = t->bi->preceding(i);
   1100         int32_t expectedBreak = BreakIterator::DONE;
   1101 
   1102         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
   1103         // preceding(trailing byte) will return the index of some preceding code point,
   1104         // not the lead byte of the current code point, even though that has a smaller index.
   1105         // Therefore, start looking at the expected break data not at i-1, but at
   1106         // the start of code point index - 1.
   1107         utext_setNativeIndex(t->textToBreak, i);
   1108         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
   1109         for (; j >= 0; j--) {
   1110             if (t->getExpectedBreak(j) != 0) {
   1111                 expectedBreak = j;
   1112                 break;
   1113             }
   1114         }
   1115         if (expectedBreak != actualBreak) {
   1116             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1117                   "        Expected, Actual= %d, %d",
   1118                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1119         }
   1120     }
   1121 }
   1122 
   1123 
   1124 void RBBITest::TestExtended() {
   1125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1126     UErrorCode      status  = U_ZERO_ERROR;
   1127     Locale          locale("");
   1128 
   1129     UnicodeString       rules;
   1130     TestParams          tp(status);
   1131 
   1132     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
   1133     if (U_FAILURE(status)) {
   1134         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1135     }
   1136 
   1137 
   1138     //
   1139     //  Open and read the test data file.
   1140     //
   1141     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1142     char testFileName[1000];
   1143     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1144         errln("Can't open test data.  Path too long.");
   1145         return;
   1146     }
   1147     strcpy(testFileName, testDataDirectory);
   1148     strcat(testFileName, "rbbitst.txt");
   1149 
   1150     int    len;
   1151     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1152     if (U_FAILURE(status)) {
   1153         return; /* something went wrong, error already output */
   1154     }
   1155 
   1156 
   1157     bool skipTest = false; // Skip this test?
   1158 
   1159     //
   1160     //  Put the test data into a UnicodeString
   1161     //
   1162     UnicodeString testString(FALSE, testFile, len);
   1163 
   1164     enum EParseState{
   1165         PARSE_COMMENT,
   1166         PARSE_TAG,
   1167         PARSE_DATA,
   1168         PARSE_NUM
   1169     }
   1170     parseState = PARSE_TAG;
   1171 
   1172     EParseState savedState = PARSE_TAG;
   1173 
   1174     static const UChar CH_LF        = 0x0a;
   1175     static const UChar CH_CR        = 0x0d;
   1176     static const UChar CH_HASH      = 0x23;
   1177     /*static const UChar CH_PERIOD    = 0x2e;*/
   1178     static const UChar CH_LT        = 0x3c;
   1179     static const UChar CH_GT        = 0x3e;
   1180     static const UChar CH_BACKSLASH = 0x5c;
   1181     static const UChar CH_BULLET    = 0x2022;
   1182 
   1183     int32_t    lineNum  = 1;
   1184     int32_t    colStart = 0;
   1185     int32_t    column   = 0;
   1186     int32_t    charIdx  = 0;
   1187 
   1188     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1189 
   1190     for (charIdx = 0; charIdx < len; ) {
   1191         status = U_ZERO_ERROR;
   1192         UChar  c = testString.charAt(charIdx);
   1193         charIdx++;
   1194         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1195             // treat CRLF as a unit
   1196             c = CH_LF;
   1197             charIdx++;
   1198         }
   1199         if (c == CH_LF || c == CH_CR) {
   1200             lineNum++;
   1201             colStart = charIdx;
   1202         }
   1203         column = charIdx - colStart + 1;
   1204 
   1205         switch (parseState) {
   1206         case PARSE_COMMENT:
   1207             if (c == 0x0a || c == 0x0d) {
   1208                 parseState = savedState;
   1209             }
   1210             break;
   1211 
   1212         case PARSE_TAG:
   1213             {
   1214             if (c == CH_HASH) {
   1215                 parseState = PARSE_COMMENT;
   1216                 savedState = PARSE_TAG;
   1217                 break;
   1218             }
   1219             if (u_isUWhiteSpace(c)) {
   1220                 break;
   1221             }
   1222             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1223                 delete tp.bi;
   1224                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1225                 skipTest = false;
   1226                 charIdx += 5;
   1227                 break;
   1228             }
   1229             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1230                 delete tp.bi;
   1231                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1232                 skipTest = false;
   1233                 charIdx += 5;
   1234                 break;
   1235             }
   1236             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1237                 delete tp.bi;
   1238                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1239                 skipTest = false;
   1240                 charIdx += 5;
   1241                 break;
   1242             }
   1243             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1244                 delete tp.bi;
   1245                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1246                 skipTest = false;
   1247                 charIdx += 5;
   1248                 break;
   1249             }
   1250             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1251                 delete tp.bi;
   1252                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1253                 charIdx += 6;
   1254                 break;
   1255             }
   1256 
   1257             // <locale  loc_name>
   1258             localeMatcher.reset(testString);
   1259             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1260                 UnicodeString localeName = localeMatcher.group(1, status);
   1261                 char localeName8[100];
   1262                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1263                 locale = Locale::createFromName(localeName8);
   1264                 charIdx += localeMatcher.group(0, status).length() - 1;
   1265                 TEST_ASSERT_SUCCESS(status);
   1266                 break;
   1267             }
   1268             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1269                 parseState = PARSE_DATA;
   1270                 charIdx += 5;
   1271                 tp.dataToBreak = "";
   1272                 tp.expectedBreaks->removeAllElements();
   1273                 tp.srcCol ->removeAllElements();
   1274                 tp.srcLine->removeAllElements();
   1275                 break;
   1276             }
   1277 
   1278             errln("line %d: Tag expected in test file.", lineNum);
   1279             parseState = PARSE_COMMENT;
   1280             savedState = PARSE_DATA;
   1281             goto end_test; // Stop the test.
   1282             }
   1283             break;
   1284 
   1285         case PARSE_DATA:
   1286             if (c == CH_BULLET) {
   1287                 int32_t  breakIdx = tp.dataToBreak.length();
   1288                 tp.expectedBreaks->setSize(breakIdx+1);
   1289                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1290                 tp.srcLine->setSize(breakIdx+1);
   1291                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1292                 tp.srcCol ->setSize(breakIdx+1);
   1293                 tp.srcCol ->setElementAt(column, breakIdx);
   1294                 break;
   1295             }
   1296 
   1297             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1298                 // Add final entry to mappings from break location to source file position.
   1299                 //  Need one extra because last break position returned is after the
   1300                 //    last char in the data, not at the last char.
   1301                 tp.srcLine->addElement(lineNum, status);
   1302                 tp.srcCol ->addElement(column, status);
   1303 
   1304                 parseState = PARSE_TAG;
   1305                 charIdx += 6;
   1306 
   1307                 if (!skipTest) {
   1308                     // RUN THE TEST!
   1309                     status = U_ZERO_ERROR;
   1310                     tp.setUTF16(status);
   1311                     executeTest(&tp, status);
   1312                     TEST_ASSERT_SUCCESS(status);
   1313 
   1314                     // Run again, this time with UTF-8 text wrapped in a UText.
   1315                     status = U_ZERO_ERROR;
   1316                     tp.setUTF8(status);
   1317                     TEST_ASSERT_SUCCESS(status);
   1318                     executeTest(&tp, status);
   1319                 }
   1320                 break;
   1321             }
   1322 
   1323             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1324                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1325                 // Get the code point from the name and insert it into the test data.
   1326                 //   (Damn, no API takes names in Unicode  !!!
   1327                 //    we've got to take it back to char *)
   1328                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1329                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1330                 char charNameBuf[200];
   1331                 UChar32 theChar = -1;
   1332                 if (nameEndIdx != -1) {
   1333                     UErrorCode status = U_ZERO_ERROR;
   1334                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1335                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1336                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1337                     if (U_FAILURE(status)) {
   1338                         theChar = -1;
   1339                     }
   1340                 }
   1341                 if (theChar == -1) {
   1342                     errln("Error in named character in test file at line %d, col %d",
   1343                         lineNum, column);
   1344                 } else {
   1345                     // Named code point was recognized.  Insert it
   1346                     //   into the test data.
   1347                     tp.dataToBreak.append(theChar);
   1348                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1349                         tp.srcLine->addElement(lineNum, status);
   1350                         tp.srcCol ->addElement(column, status);
   1351                     }
   1352                 }
   1353                 if (nameEndIdx > charIdx) {
   1354                     charIdx = nameEndIdx+1;
   1355 
   1356                 }
   1357                 break;
   1358             }
   1359 
   1360 
   1361 
   1362 
   1363             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1364                 charIdx++;
   1365                 int32_t  breakIdx = tp.dataToBreak.length();
   1366                 tp.expectedBreaks->setSize(breakIdx+1);
   1367                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1368                 tp.srcLine->setSize(breakIdx+1);
   1369                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1370                 tp.srcCol ->setSize(breakIdx+1);
   1371                 tp.srcCol ->setElementAt(column, breakIdx);
   1372                 break;
   1373             }
   1374 
   1375             if (c == CH_LT) {
   1376                 tagValue   = 0;
   1377                 parseState = PARSE_NUM;
   1378                 break;
   1379             }
   1380 
   1381             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1382                 parseState = PARSE_COMMENT;
   1383                 savedState = PARSE_DATA;
   1384                 break;
   1385             }
   1386 
   1387             if (c == CH_BACKSLASH) {
   1388                 // Check for \ at end of line, a line continuation.
   1389                 //     Advance over (discard) the newline
   1390                 UChar32 cp = testString.char32At(charIdx);
   1391                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1392                     // We have a CR LF
   1393                     //  Need an extra increment of the input ptr to move over both of them
   1394                     charIdx++;
   1395                 }
   1396                 if (cp == CH_LF || cp == CH_CR) {
   1397                     lineNum++;
   1398                     colStart = charIdx;
   1399                     charIdx++;
   1400                     break;
   1401                 }
   1402 
   1403                 // Let unescape handle the back slash.
   1404                 cp = testString.unescapeAt(charIdx);
   1405                 if (cp != -1) {
   1406                     // Escape sequence was recognized.  Insert the char
   1407                     //   into the test data.
   1408                     tp.dataToBreak.append(cp);
   1409                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1410                         tp.srcLine->addElement(lineNum, status);
   1411                         tp.srcCol ->addElement(column, status);
   1412                     }
   1413                     break;
   1414                 }
   1415 
   1416 
   1417                 // Not a recognized backslash escape sequence.
   1418                 // Take the next char as a literal.
   1419                 //  TODO:  Should this be an error?
   1420                 c = testString.charAt(charIdx);
   1421                 charIdx = testString.moveIndex32(charIdx, 1);
   1422             }
   1423 
   1424             // Normal, non-escaped data char.
   1425             tp.dataToBreak.append(c);
   1426 
   1427             // Save the mapping from offset in the data to line/column numbers in
   1428             //   the original input file.  Will be used for better error messages only.
   1429             //   If there's an expected break before this char, the slot in the mapping
   1430             //     vector will already be set for this char; don't overwrite it.
   1431             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1432                 tp.srcLine->addElement(lineNum, status);
   1433                 tp.srcCol ->addElement(column, status);
   1434             }
   1435             break;
   1436 
   1437 
   1438         case PARSE_NUM:
   1439             // We are parsing an expected numeric tag value, like <1234>,
   1440             //   within a chunk of data.
   1441             if (u_isUWhiteSpace(c)) {
   1442                 break;
   1443             }
   1444 
   1445             if (c == CH_GT) {
   1446                 // Finished the number.  Add the info to the expected break data,
   1447                 //   and switch parse state back to doing plain data.
   1448                 parseState = PARSE_DATA;
   1449                 if (tagValue == 0) {
   1450                     tagValue = -1;
   1451                 }
   1452                 int32_t  breakIdx = tp.dataToBreak.length();
   1453                 tp.expectedBreaks->setSize(breakIdx+1);
   1454                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1455                 tp.srcLine->setSize(breakIdx+1);
   1456                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1457                 tp.srcCol ->setSize(breakIdx+1);
   1458                 tp.srcCol ->setElementAt(column, breakIdx);
   1459                 break;
   1460             }
   1461 
   1462             if (u_isdigit(c)) {
   1463                 tagValue = tagValue*10 + u_charDigitValue(c);
   1464                 break;
   1465             }
   1466 
   1467             errln("Syntax Error in test file at line %d, col %d",
   1468                 lineNum, column);
   1469             parseState = PARSE_COMMENT;
   1470             goto end_test; // Stop the test
   1471             break;
   1472         }
   1473 
   1474 
   1475         if (U_FAILURE(status)) {
   1476             dataerrln("ICU Error %s while parsing test file at line %d.",
   1477                 u_errorName(status), lineNum);
   1478             status = U_ZERO_ERROR;
   1479             goto end_test; // Stop the test
   1480         }
   1481 
   1482     }
   1483 
   1484 end_test:
   1485     delete [] testFile;
   1486 #endif
   1487 }
   1488 
   1489 
   1490 //-------------------------------------------------------------------------------
   1491 //
   1492 //  TestDictRules   create a break iterator from source rules that includes a
   1493 //                  dictionary range.   Regression for bug #7130.  Source rules
   1494 //                  do not declare a break iterator type (word, line, sentence, etc.
   1495 //                  but the dictionary code, without a type, would loop.
   1496 //
   1497 //-------------------------------------------------------------------------------
   1498 void RBBITest::TestDictRules() {
   1499     const char *rules =  "$dictionary = [a-z]; \n"
   1500                          "!!forward; \n"
   1501                          "$dictionary $dictionary; \n"
   1502                          "!!reverse; \n"
   1503                          "$dictionary $dictionary; \n";
   1504     const char *text = "aa";
   1505     UErrorCode status = U_ZERO_ERROR;
   1506     UParseError parseError;
   1507 
   1508     RuleBasedBreakIterator bi(rules, parseError, status);
   1509     if (U_SUCCESS(status)) {
   1510         UnicodeString utext = text;
   1511         bi.setText(utext);
   1512         int32_t position;
   1513         int32_t loops;
   1514         for (loops = 0; loops<10; loops++) {
   1515             position = bi.next();
   1516             if (position == RuleBasedBreakIterator::DONE) {
   1517                 break;
   1518             }
   1519         }
   1520         TEST_ASSERT(loops == 1);
   1521     } else {
   1522         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1523     }
   1524 }
   1525 
   1526 
   1527 
   1528 //-------------------------------------------------------------------------------
   1529 //
   1530 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1531 //    return the data in one big UChar * buffer, which the caller must delete.
   1532 //
   1533 //    parameters:
   1534 //          fileName:   the name of the file, with no directory part.  The test data directory
   1535 //                      is assumed.
   1536 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1537 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1538 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1539 //                      Pass NULL for the system default encoding.
   1540 //          status
   1541 //    returns:
   1542 //                      The file data, converted to UChar.
   1543 //                      The caller must delete this when done with
   1544 //                           delete [] theBuffer;
   1545 //
   1546 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1547 //           Move this function to some common place.
   1548 //
   1549 //--------------------------------------------------------------------------------
   1550 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1551     UChar       *retPtr  = NULL;
   1552     char        *fileBuf = NULL;
   1553     UConverter* conv     = NULL;
   1554     FILE        *f       = NULL;
   1555 
   1556     ulen = 0;
   1557     if (U_FAILURE(status)) {
   1558         return retPtr;
   1559     }
   1560 
   1561     //
   1562     //  Open the file.
   1563     //
   1564     f = fopen(fileName, "rb");
   1565     if (f == 0) {
   1566         dataerrln("Error opening test data file %s\n", fileName);
   1567         status = U_FILE_ACCESS_ERROR;
   1568         return NULL;
   1569     }
   1570     //
   1571     //  Read it in
   1572     //
   1573     int   fileSize;
   1574     int   amt_read;
   1575 
   1576     fseek( f, 0, SEEK_END);
   1577     fileSize = ftell(f);
   1578     fileBuf = new char[fileSize];
   1579     fseek(f, 0, SEEK_SET);
   1580     amt_read = fread(fileBuf, 1, fileSize, f);
   1581     if (amt_read != fileSize || fileSize <= 0) {
   1582         errln("Error reading test data file.");
   1583         goto cleanUpAndReturn;
   1584     }
   1585 
   1586     //
   1587     // Look for a Unicode Signature (BOM) on the data just read
   1588     //
   1589     int32_t        signatureLength;
   1590     const char *   fileBufC;
   1591     const char*    bomEncoding;
   1592 
   1593     fileBufC = fileBuf;
   1594     bomEncoding = ucnv_detectUnicodeSignature(
   1595         fileBuf, fileSize, &signatureLength, &status);
   1596     if(bomEncoding!=NULL ){
   1597         fileBufC  += signatureLength;
   1598         fileSize  -= signatureLength;
   1599         encoding = bomEncoding;
   1600     }
   1601 
   1602     //
   1603     // Open a converter to take the rule file to UTF-16
   1604     //
   1605     conv = ucnv_open(encoding, &status);
   1606     if (U_FAILURE(status)) {
   1607         goto cleanUpAndReturn;
   1608     }
   1609 
   1610     //
   1611     // Convert the rules to UChar.
   1612     //  Preflight first to determine required buffer size.
   1613     //
   1614     ulen = ucnv_toUChars(conv,
   1615         NULL,           //  dest,
   1616         0,              //  destCapacity,
   1617         fileBufC,
   1618         fileSize,
   1619         &status);
   1620     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1621         // Buffer Overflow is expected from the preflight operation.
   1622         status = U_ZERO_ERROR;
   1623 
   1624         retPtr = new UChar[ulen+1];
   1625         ucnv_toUChars(conv,
   1626             retPtr,       //  dest,
   1627             ulen+1,
   1628             fileBufC,
   1629             fileSize,
   1630             &status);
   1631     }
   1632 
   1633 cleanUpAndReturn:
   1634     fclose(f);
   1635     delete []fileBuf;
   1636     ucnv_close(conv);
   1637     if (U_FAILURE(status)) {
   1638         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1639         delete []retPtr;
   1640         retPtr = 0;
   1641         ulen   = 0;
   1642     };
   1643     return retPtr;
   1644 }
   1645 
   1646 
   1647 
   1648 //--------------------------------------------------------------------------------------------
   1649 //
   1650 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1651 //
   1652 //-------------------------------------------------------------------------------------------
   1653 void RBBITest::TestUnicodeFiles() {
   1654     RuleBasedBreakIterator  *bi;
   1655     UErrorCode               status = U_ZERO_ERROR;
   1656 
   1657     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1658     TEST_ASSERT_SUCCESS(status);
   1659     if (U_SUCCESS(status)) {
   1660         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1661     }
   1662     delete bi;
   1663 
   1664     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1665     TEST_ASSERT_SUCCESS(status);
   1666     if (U_SUCCESS(status)) {
   1667         runUnicodeTestData("WordBreakTest.txt", bi);
   1668     }
   1669     delete bi;
   1670 
   1671     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1672     TEST_ASSERT_SUCCESS(status);
   1673     if (U_SUCCESS(status)) {
   1674         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1675     }
   1676     delete bi;
   1677 
   1678     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1679     TEST_ASSERT_SUCCESS(status);
   1680     if (U_SUCCESS(status)) {
   1681         runUnicodeTestData("LineBreakTest.txt", bi);
   1682     }
   1683     delete bi;
   1684 }
   1685 
   1686 
   1687 // Check for test cases from the Unicode test data files that are known to fail
   1688 // and should be skipped because ICU is not yet able to fully implement the spec.
   1689 // See ticket #7270.
   1690 
   1691 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
   1692     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
   1693         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
   1694         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
   1695         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
   1696         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
   1697         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
   1698         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
   1699     };
   1700     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
   1701         return FALSE;
   1702     }
   1703 
   1704     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
   1705         if (testCase == UnicodeString(badTestCases[i])) {
   1706             return logKnownIssue("7270");
   1707         }
   1708     }
   1709     return FALSE;
   1710 }
   1711 
   1712 
   1713 //--------------------------------------------------------------------------------------------
   1714 //
   1715 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1716 //
   1717 //-------------------------------------------------------------------------------------------
   1718 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1720     UErrorCode  status = U_ZERO_ERROR;
   1721 
   1722     //
   1723     //  Open and read the test data file, put it into a UnicodeString.
   1724     //
   1725     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1726     char testFileName[1000];
   1727     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1728         dataerrln("Can't open test data.  Path too long.");
   1729         return;
   1730     }
   1731     strcpy(testFileName, testDataDirectory);
   1732     strcat(testFileName, fileName);
   1733 
   1734     logln("Opening data file %s\n", fileName);
   1735 
   1736     int    len;
   1737     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1738     if (status != U_FILE_ACCESS_ERROR) {
   1739         TEST_ASSERT_SUCCESS(status);
   1740         TEST_ASSERT(testFile != NULL);
   1741     }
   1742     if (U_FAILURE(status) || testFile == NULL) {
   1743         return; /* something went wrong, error already output */
   1744     }
   1745     UnicodeString testFileAsString(TRUE, testFile, len);
   1746 
   1747     //
   1748     //  Parse the test data file using a regular expression.
   1749     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1750     //     is identified by which group had a match.
   1751     //
   1752     //    Caputure Group #                  1          2            3            4           5
   1753     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1754     //
   1755     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1756     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1757     UnicodeString   testString;
   1758     UVector32       breakPositions(status);
   1759     int             lineNumber = 1;
   1760     TEST_ASSERT_SUCCESS(status);
   1761     if (U_FAILURE(status)) {
   1762         return;
   1763     }
   1764 
   1765     //
   1766     //  Scan through each test case, building up the string to be broken in testString,
   1767     //   and the positions that should be boundaries in the breakPositions vector.
   1768     //
   1769     int spin = 0;
   1770     while (tokenMatcher.find()) {
   1771       	if(tokenMatcher.hitEnd()) {
   1772           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1773              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1774              and caused an infinite loop here on EBCDIC systems!
   1775           */
   1776           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1777           //	   return;
   1778       	}
   1779         if (tokenMatcher.start(1, status) >= 0) {
   1780             // Scanned a divide sign, indicating a break position in the test data.
   1781             if (testString.length()>0) {
   1782                 breakPositions.addElement(testString.length(), status);
   1783             }
   1784         }
   1785         else if (tokenMatcher.start(2, status) >= 0) {
   1786             // Scanned an 'x', meaning no break at this position in the test data
   1787             //   Nothing to be done here.
   1788             }
   1789         else if (tokenMatcher.start(3, status) >= 0) {
   1790             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1791             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1792             int length = hexNumber.length();
   1793             if (length<=8) {
   1794                 char buf[10];
   1795                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1796                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1797                 if (c<=0x10ffff) {
   1798                     testString.append(c);
   1799                 } else {
   1800                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1801                        fileName, lineNumber);
   1802                 }
   1803             } else {
   1804                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1805                        fileName, lineNumber);
   1806              }
   1807         }
   1808         else if (tokenMatcher.start(4, status) >= 0) {
   1809             // Scanned to end of a line, possibly skipping over a comment in the process.
   1810             //   If the line from the file contained test data, run the test now.
   1811             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
   1812                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1813             }
   1814 
   1815             // Clear out this test case.
   1816             //    The string and breakPositions vector will be refilled as the next
   1817             //       test case is parsed.
   1818             testString.remove();
   1819             breakPositions.removeAllElements();
   1820             lineNumber++;
   1821         } else {
   1822             // Scanner catchall.  Something unrecognized appeared on the line.
   1823             char token[16];
   1824             UnicodeString uToken = tokenMatcher.group(0, status);
   1825             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1826             token[sizeof(token)-1] = 0;
   1827             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1828 
   1829             // Clean up, in preparation for continuing with the next line.
   1830             testString.remove();
   1831             breakPositions.removeAllElements();
   1832             lineNumber++;
   1833         }
   1834         TEST_ASSERT_SUCCESS(status);
   1835         if (U_FAILURE(status)) {
   1836             break;
   1837         }
   1838     }
   1839 
   1840     delete [] testFile;
   1841  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1842 }
   1843 
   1844 //--------------------------------------------------------------------------------------------
   1845 //
   1846 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1847 //                            test data files.  Do only a simple, forward-only check -
   1848 //                            this test is mostly to check that ICU and the Unicode
   1849 //                            data agree with each other.
   1850 //
   1851 //--------------------------------------------------------------------------------------------
   1852 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1853                          const UnicodeString &testString,   // Text data to be broken
   1854                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1855                          RuleBasedBreakIterator *bi) {
   1856     int32_t pos;                 // Break Position in the test string
   1857     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1858     int32_t expectedPos;         // Expected break position (index into test string)
   1859 
   1860     bi->setText(testString);
   1861     pos = bi->first();
   1862     pos = bi->next();
   1863 
   1864     while (pos != BreakIterator::DONE) {
   1865         if (expectedI >= breakPositions->size()) {
   1866             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1867                 testFileName, lineNumber, pos);
   1868             break;
   1869         }
   1870         expectedPos = breakPositions->elementAti(expectedI);
   1871         if (pos < expectedPos) {
   1872             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1873                 testFileName, lineNumber, pos);
   1874             break;
   1875         }
   1876         if (pos > expectedPos) {
   1877             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1878                 testFileName, lineNumber, expectedPos);
   1879             break;
   1880         }
   1881         pos = bi->next();
   1882         expectedI++;
   1883     }
   1884 
   1885     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1886         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1887             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1888     }
   1889 }
   1890 
   1891 
   1892 
   1893 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1894 //---------------------------------------------------------------------------------------
   1895 //
   1896 //   classs RBBIMonkeyKind
   1897 //
   1898 //      Monkey Test for Break Iteration
   1899 //      Abstract interface class.   Concrete derived classes independently
   1900 //      implement the break rules for different iterator types.
   1901 //
   1902 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   1903 //      testing, but works purely in terms of the interface defined here.
   1904 //
   1905 //---------------------------------------------------------------------------------------
   1906 class RBBIMonkeyKind {
   1907 public:
   1908     // Return a UVector of UnicodeSets, representing the character classes used
   1909     //   for this type of iterator.
   1910     virtual  UVector  *charClasses() = 0;
   1911 
   1912     // Set the test text on which subsequent calls to next() will operate
   1913     virtual  void      setText(const UnicodeString &s) = 0;
   1914 
   1915     // Find the next break postion, starting from the prev break position, or from zero.
   1916     // Return -1 after reaching end of string.
   1917     virtual  int32_t   next(int32_t i) = 0;
   1918 
   1919     virtual ~RBBIMonkeyKind();
   1920     UErrorCode       deferredStatus;
   1921 
   1922 
   1923 protected:
   1924     RBBIMonkeyKind();
   1925 
   1926 private:
   1927 };
   1928 
   1929 RBBIMonkeyKind::RBBIMonkeyKind() {
   1930     deferredStatus = U_ZERO_ERROR;
   1931 }
   1932 
   1933 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1934 }
   1935 
   1936 
   1937 //----------------------------------------------------------------------------------------
   1938 //
   1939 //   Random Numbers.  Similar to standard lib rand() and srand()
   1940 //                    Not using library to
   1941 //                      1.  Get same results on all platforms.
   1942 //                      2.  Get access to current seed, to more easily reproduce failures.
   1943 //
   1944 //---------------------------------------------------------------------------------------
   1945 static uint32_t m_seed = 1;
   1946 
   1947 static uint32_t m_rand()
   1948 {
   1949     m_seed = m_seed * 1103515245 + 12345;
   1950     return (uint32_t)(m_seed/65536) % 32768;
   1951 }
   1952 
   1953 
   1954 //
   1955 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
   1956 //
   1957 static const char *gExtended_Pict = "["
   1958     "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
   1959     "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
   1960     "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
   1961     "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
   1962     "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
   1963     "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
   1964     "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
   1965     "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
   1966     "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
   1967     "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
   1968     "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
   1969     "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
   1970     "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
   1971     "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
   1972     "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
   1973     "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
   1974     "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
   1975     "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
   1976     "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
   1977     "]";
   1978 
   1979 //------------------------------------------------------------------------------------------
   1980 //
   1981 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1982 //                             of RBBIMonkeyKind.
   1983 //
   1984 //------------------------------------------------------------------------------------------
   1985 class RBBICharMonkey: public RBBIMonkeyKind {
   1986 public:
   1987     RBBICharMonkey();
   1988     virtual          ~RBBICharMonkey();
   1989     virtual  UVector *charClasses();
   1990     virtual  void     setText(const UnicodeString &s);
   1991     virtual  int32_t  next(int32_t i);
   1992 private:
   1993     UVector   *fSets;
   1994 
   1995     UnicodeSet  *fCRLFSet;
   1996     UnicodeSet  *fControlSet;
   1997     UnicodeSet  *fExtendSet;
   1998     UnicodeSet  *fZWJSet;
   1999     UnicodeSet  *fRegionalIndicatorSet;
   2000     UnicodeSet  *fPrependSet;
   2001     UnicodeSet  *fSpacingSet;
   2002     UnicodeSet  *fLSet;
   2003     UnicodeSet  *fVSet;
   2004     UnicodeSet  *fTSet;
   2005     UnicodeSet  *fLVSet;
   2006     UnicodeSet  *fLVTSet;
   2007     UnicodeSet  *fHangulSet;
   2008     UnicodeSet  *fEmojiBaseSet;
   2009     UnicodeSet  *fEmojiModifierSet;
   2010     UnicodeSet  *fExtendedPictSet;
   2011     UnicodeSet  *fEBGSet;
   2012     UnicodeSet  *fEmojiNRKSet;
   2013     UnicodeSet  *fAnySet;
   2014 
   2015     const UnicodeString *fText;
   2016 };
   2017 
   2018 
   2019 RBBICharMonkey::RBBICharMonkey() {
   2020     UErrorCode  status = U_ZERO_ERROR;
   2021 
   2022     fText = NULL;
   2023 
   2024     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2025     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
   2026     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
   2027     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
   2028     fRegionalIndicatorSet =
   2029                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   2030     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2031     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2032     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2033     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2034     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2035     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2036     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2037     fHangulSet  = new UnicodeSet();
   2038     fHangulSet->addAll(*fLSet);
   2039     fHangulSet->addAll(*fVSet);
   2040     fHangulSet->addAll(*fTSet);
   2041     fHangulSet->addAll(*fLVSet);
   2042     fHangulSet->addAll(*fLVTSet);
   2043 
   2044     fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
   2045     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
   2046     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
   2047     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
   2048     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
   2049                 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
   2050     fAnySet           = new UnicodeSet(0, 0x10ffff);
   2051 
   2052     fSets             = new UVector(status);
   2053     fSets->addElement(fCRLFSet,    status);
   2054     fSets->addElement(fControlSet, status);
   2055     fSets->addElement(fExtendSet,  status);
   2056     fSets->addElement(fRegionalIndicatorSet, status);
   2057     if (!fPrependSet->isEmpty()) {
   2058         fSets->addElement(fPrependSet, status);
   2059     }
   2060     fSets->addElement(fSpacingSet, status);
   2061     fSets->addElement(fHangulSet,  status);
   2062     fSets->addElement(fAnySet,     status);
   2063     fSets->addElement(fEmojiBaseSet, status);
   2064     fSets->addElement(fEmojiModifierSet, status);
   2065     fSets->addElement(fZWJSet,     status);
   2066     fSets->addElement(fExtendedPictSet, status);
   2067     fSets->addElement(fEBGSet,     status);
   2068     fSets->addElement(fEmojiNRKSet,status);
   2069     if (U_FAILURE(status)) {
   2070         deferredStatus = status;
   2071     }
   2072 }
   2073 
   2074 
   2075 void RBBICharMonkey::setText(const UnicodeString &s) {
   2076     fText = &s;
   2077 }
   2078 
   2079 
   2080 
   2081 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2082     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2083                               //   break position being tested.  The candidate break
   2084                               //   location is before p2.
   2085 
   2086     int     breakPos = -1;
   2087 
   2088     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2089     UChar32 cBase;            // for (X Extend*) patterns, the X character.
   2090 
   2091     if (U_FAILURE(deferredStatus)) {
   2092         return -1;
   2093     }
   2094 
   2095     // Previous break at end of string.  return DONE.
   2096     if (prevPos >= fText->length()) {
   2097         return -1;
   2098     }
   2099     p0 = p1 = p2 = p3 = prevPos;
   2100     c3 =  fText->char32At(prevPos);
   2101     c0 = c1 = c2 = cBase = 0;
   2102     (void)p0;   // suppress set but not used warning.
   2103     (void)c0;
   2104 
   2105     // Loop runs once per "significant" character position in the input text.
   2106     for (;;) {
   2107         // Move all of the positions forward in the input string.
   2108         p0 = p1;  c0 = c1;
   2109         p1 = p2;  c1 = c2;
   2110         p2 = p3;  c2 = c3;
   2111 
   2112         // Advancd p3 by one codepoint
   2113         p3 = fText->moveIndex32(p3, 1);
   2114         c3 = fText->char32At(p3);
   2115 
   2116         if (p1 == p2) {
   2117             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2118             continue;
   2119         }
   2120         if (p2 == fText->length()) {
   2121             // Reached end of string.  Always a break position.
   2122             break;
   2123         }
   2124 
   2125         // Rule  GB3   CR x LF
   2126         //     No Extend or Format characters may appear between the CR and LF,
   2127         //     which requires the additional check for p2 immediately following p1.
   2128         //
   2129         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2130             continue;
   2131         }
   2132 
   2133         // Rule (GB4).   ( Control | CR | LF ) <break>
   2134         if (fControlSet->contains(c1) ||
   2135             c1 == 0x0D ||
   2136             c1 == 0x0A)  {
   2137             break;
   2138         }
   2139 
   2140         // Rule (GB5)    <break>  ( Control | CR | LF )
   2141         //
   2142         if (fControlSet->contains(c2) ||
   2143             c2 == 0x0D ||
   2144             c2 == 0x0A)  {
   2145             break;
   2146         }
   2147 
   2148 
   2149         // Rule (GB6)  L x ( L | V | LV | LVT )
   2150         if (fLSet->contains(c1) &&
   2151                (fLSet->contains(c2)  ||
   2152                 fVSet->contains(c2)  ||
   2153                 fLVSet->contains(c2) ||
   2154                 fLVTSet->contains(c2))) {
   2155             continue;
   2156         }
   2157 
   2158         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2159         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2160             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2161             continue;
   2162         }
   2163 
   2164         // Rule (GB8)    ( LVT | T)  x T
   2165         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2166             fTSet->contains(c2))  {
   2167             continue;
   2168         }
   2169 
   2170         // Rule (GB9)    x (Extend | ZWJ)
   2171         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
   2172             if (!fExtendSet->contains(c1)) {
   2173                 cBase = c1;
   2174             }
   2175             continue;
   2176         }
   2177 
   2178         // Rule (GB9a)   x  SpacingMark
   2179         if (fSpacingSet->contains(c2)) {
   2180             continue;
   2181         }
   2182 
   2183         // Rule (GB9b)   Prepend x
   2184         if (fPrependSet->contains(c1)) {
   2185             continue;
   2186         }
   2187 
   2188         // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
   2189         if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
   2190             continue;
   2191         }
   2192         if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
   2193                 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
   2194             continue;
   2195         }
   2196 
   2197         // Rule (GB11)   (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
   2198         if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
   2199                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
   2200             continue;
   2201         }
   2202 
   2203         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
   2204         //                   Note: The first if condition is a little tricky. We only need to force
   2205         //                      a break if there are three or more contiguous RIs. If there are
   2206         //                      only two, a break following will occur via other rules, and will include
   2207         //                      any trailing extend characters, which is needed behavior.
   2208         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
   2209                 && fRegionalIndicatorSet->contains(c2)) {
   2210             break;
   2211         }
   2212         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2213             continue;
   2214         }
   2215 
   2216         // Rule (GB999)  Any  <break>  Any
   2217         break;
   2218     }
   2219 
   2220     breakPos = p2;
   2221     return breakPos;
   2222 }
   2223 
   2224 
   2225 
   2226 UVector  *RBBICharMonkey::charClasses() {
   2227     return fSets;
   2228 }
   2229 
   2230 
   2231 RBBICharMonkey::~RBBICharMonkey() {
   2232     delete fSets;
   2233     delete fCRLFSet;
   2234     delete fControlSet;
   2235     delete fExtendSet;
   2236     delete fRegionalIndicatorSet;
   2237     delete fPrependSet;
   2238     delete fSpacingSet;
   2239     delete fLSet;
   2240     delete fVSet;
   2241     delete fTSet;
   2242     delete fLVSet;
   2243     delete fLVTSet;
   2244     delete fHangulSet;
   2245     delete fAnySet;
   2246     delete fEmojiBaseSet;
   2247     delete fEmojiModifierSet;
   2248     delete fZWJSet;
   2249     delete fExtendedPictSet;
   2250     delete fEBGSet;
   2251     delete fEmojiNRKSet;
   2252 }
   2253 
   2254 //------------------------------------------------------------------------------------------
   2255 //
   2256 //   class RBBIWordMonkey      Word Break specific implementation
   2257 //                             of RBBIMonkeyKind.
   2258 //
   2259 //------------------------------------------------------------------------------------------
   2260 class RBBIWordMonkey: public RBBIMonkeyKind {
   2261 public:
   2262     RBBIWordMonkey();
   2263     virtual          ~RBBIWordMonkey();
   2264     virtual  UVector *charClasses();
   2265     virtual  void     setText(const UnicodeString &s);
   2266     virtual int32_t   next(int32_t i);
   2267 private:
   2268     UVector      *fSets;
   2269 
   2270     UnicodeSet  *fCRSet;
   2271     UnicodeSet  *fLFSet;
   2272     UnicodeSet  *fNewlineSet;
   2273     UnicodeSet  *fRegionalIndicatorSet;
   2274     UnicodeSet  *fKatakanaSet;
   2275     UnicodeSet  *fHebrew_LetterSet;
   2276     UnicodeSet  *fALetterSet;
   2277     UnicodeSet  *fSingle_QuoteSet;
   2278     UnicodeSet  *fDouble_QuoteSet;
   2279     UnicodeSet  *fMidNumLetSet;
   2280     UnicodeSet  *fMidLetterSet;
   2281     UnicodeSet  *fMidNumSet;
   2282     UnicodeSet  *fNumericSet;
   2283     UnicodeSet  *fFormatSet;
   2284     UnicodeSet  *fOtherSet;
   2285     UnicodeSet  *fExtendSet;
   2286     UnicodeSet  *fExtendNumLetSet;
   2287     UnicodeSet  *fDictionarySet;
   2288     UnicodeSet  *fEBaseSet;
   2289     UnicodeSet  *fEBGSet;
   2290     UnicodeSet  *fEModifierSet;
   2291     UnicodeSet  *fZWJSet;
   2292     UnicodeSet  *fExtendedPictSet;
   2293     UnicodeSet  *fEmojiNRKSet;
   2294 
   2295     const UnicodeString  *fText;
   2296 };
   2297 
   2298 
   2299 RBBIWordMonkey::RBBIWordMonkey()
   2300 {
   2301     UErrorCode  status = U_ZERO_ERROR;
   2302 
   2303     fSets            = new UVector(status);
   2304 
   2305     fCRSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2306     fLFSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2307     fNewlineSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2308     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2309     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2310     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
   2311     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2312     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
   2313     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
   2314     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2315     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2316     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2317     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2318     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2319     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2320     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2321 
   2322     fEBaseSet         = new UnicodeSet(UNICODE_STRING_SIMPLE(
   2323             "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
   2324     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"),          status);
   2325     fEModifierSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"),           status);
   2326     fZWJSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"),          status);
   2327     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
   2328     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
   2329             "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
   2330 
   2331     fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
   2332     fDictionarySet->addAll(*fKatakanaSet);
   2333     fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2334 
   2335     fALetterSet->removeAll(*fDictionarySet);
   2336 
   2337     fOtherSet        = new UnicodeSet();
   2338     if(U_FAILURE(status)) {
   2339       deferredStatus = status;
   2340       return;
   2341     }
   2342 
   2343     fOtherSet->complement();
   2344     fOtherSet->removeAll(*fCRSet);
   2345     fOtherSet->removeAll(*fLFSet);
   2346     fOtherSet->removeAll(*fNewlineSet);
   2347     fOtherSet->removeAll(*fKatakanaSet);
   2348     fOtherSet->removeAll(*fHebrew_LetterSet);
   2349     fOtherSet->removeAll(*fALetterSet);
   2350     fOtherSet->removeAll(*fSingle_QuoteSet);
   2351     fOtherSet->removeAll(*fDouble_QuoteSet);
   2352     fOtherSet->removeAll(*fMidLetterSet);
   2353     fOtherSet->removeAll(*fMidNumSet);
   2354     fOtherSet->removeAll(*fNumericSet);
   2355     fOtherSet->removeAll(*fExtendNumLetSet);
   2356     fOtherSet->removeAll(*fFormatSet);
   2357     fOtherSet->removeAll(*fExtendSet);
   2358     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2359     fOtherSet->removeAll(*fEBaseSet);
   2360     fOtherSet->removeAll(*fEBGSet);
   2361     fOtherSet->removeAll(*fEModifierSet);
   2362     fOtherSet->removeAll(*fZWJSet);
   2363     fOtherSet->removeAll(*fExtendedPictSet);
   2364     fOtherSet->removeAll(*fEmojiNRKSet);
   2365 
   2366     // Inhibit dictionary characters from being tested at all.
   2367     fOtherSet->removeAll(*fDictionarySet);
   2368 
   2369     fSets->addElement(fCRSet,                status);
   2370     fSets->addElement(fLFSet,                status);
   2371     fSets->addElement(fNewlineSet,           status);
   2372     fSets->addElement(fRegionalIndicatorSet, status);
   2373     fSets->addElement(fHebrew_LetterSet,     status);
   2374     fSets->addElement(fALetterSet,           status);
   2375     fSets->addElement(fSingle_QuoteSet,      status);
   2376     fSets->addElement(fDouble_QuoteSet,      status);
   2377     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
   2378                                                         // from the test data. They are all in the dictionary set,
   2379                                                         // which this (old, to be retired) monkey test cannot handle.
   2380     fSets->addElement(fMidLetterSet,         status);
   2381     fSets->addElement(fMidNumLetSet,         status);
   2382     fSets->addElement(fMidNumSet,            status);
   2383     fSets->addElement(fNumericSet,           status);
   2384     fSets->addElement(fFormatSet,            status);
   2385     fSets->addElement(fExtendSet,            status);
   2386     fSets->addElement(fOtherSet,             status);
   2387     fSets->addElement(fExtendNumLetSet,      status);
   2388 
   2389     fSets->addElement(fEBaseSet,             status);
   2390     fSets->addElement(fEBGSet,               status);
   2391     fSets->addElement(fEModifierSet,         status);
   2392     fSets->addElement(fZWJSet,               status);
   2393     fSets->addElement(fExtendedPictSet,      status);
   2394     fSets->addElement(fEmojiNRKSet,          status);
   2395 
   2396     if (U_FAILURE(status)) {
   2397         deferredStatus = status;
   2398     }
   2399 }
   2400 
   2401 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2402     fText       = &s;
   2403 }
   2404 
   2405 
   2406 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2407     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2408                               //   break position being tested.  The candidate break
   2409                               //   location is before p2.
   2410 
   2411     int     breakPos = -1;
   2412 
   2413     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2414 
   2415     if (U_FAILURE(deferredStatus)) {
   2416         return -1;
   2417     }
   2418 
   2419     // Prev break at end of string.  return DONE.
   2420     if (prevPos >= fText->length()) {
   2421         return -1;
   2422     }
   2423     p0 = p1 = p2 = p3 = prevPos;
   2424     c3 =  fText->char32At(prevPos);
   2425     c0 = c1 = c2 = 0;
   2426     (void)p0;       // Suppress set but not used warning.
   2427 
   2428     // Loop runs once per "significant" character position in the input text.
   2429     for (;;) {
   2430         // Move all of the positions forward in the input string.
   2431         p0 = p1;  c0 = c1;
   2432         p1 = p2;  c1 = c2;
   2433         p2 = p3;  c2 = c3;
   2434 
   2435         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2436         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2437         do {
   2438             p3 = fText->moveIndex32(p3, 1);
   2439             c3 = fText->char32At(p3);
   2440             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2441                break;
   2442             };
   2443         }
   2444         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
   2445 
   2446 
   2447         if (p1 == p2) {
   2448             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2449             continue;
   2450         }
   2451         if (p2 == fText->length()) {
   2452             // Reached end of string.  Always a break position.
   2453             break;
   2454         }
   2455 
   2456         // Rule  (3)   CR x LF
   2457         //     No Extend or Format characters may appear between the CR and LF,
   2458         //     which requires the additional check for p2 immediately following p1.
   2459         //
   2460         if (c1==0x0D && c2==0x0A) {
   2461             continue;
   2462         }
   2463 
   2464         // Rule (3a)  Break before and after newlines (including CR and LF)
   2465         //
   2466         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2467             break;
   2468         };
   2469         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2470             break;
   2471         };
   2472 
   2473         // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
   2474         //              Not ignoring extend chars, so peek into input text to
   2475         //              get the potential ZWJ, the character immediately preceding c2.
   2476         //              Sloppy UChar32 indexing: p2-1 may reference trail half
   2477         //              but char32At will get the full code point.
   2478         if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
   2479             continue;
   2480         }
   2481 
   2482         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
   2483         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2484             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2485             continue;
   2486         }
   2487 
   2488         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
   2489         //
   2490         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
   2491              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
   2492              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
   2493             continue;
   2494         }
   2495 
   2496         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
   2497         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
   2498             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
   2499             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
   2500             continue;
   2501         }
   2502 
   2503         // Rule (7a)     Hebrew_Letter x Single_Quote
   2504         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
   2505             continue;
   2506         }
   2507 
   2508         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
   2509         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
   2510             continue;
   2511         }
   2512 
   2513         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
   2514         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
   2515             continue;
   2516         }
   2517 
   2518         // Rule (8)    Numeric x Numeric
   2519         if (fNumericSet->contains(c1) &&
   2520             fNumericSet->contains(c2))  {
   2521             continue;
   2522         }
   2523 
   2524         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
   2525         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2526             fNumericSet->contains(c2))  {
   2527             continue;
   2528         }
   2529 
   2530         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
   2531         if (fNumericSet->contains(c1) &&
   2532             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2533             continue;
   2534         }
   2535 
   2536         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
   2537         if (fNumericSet->contains(c0) &&
   2538             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
   2539             fNumericSet->contains(c2)) {
   2540             continue;
   2541         }
   2542 
   2543         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
   2544         if (fNumericSet->contains(c1) &&
   2545             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
   2546             fNumericSet->contains(c3)) {
   2547             continue;
   2548         }
   2549 
   2550         // Rule (13)  Katakana x Katakana
   2551         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
   2552         //                  all Katakana are handled by the dictionary breaker.
   2553         if (fKatakanaSet->contains(c1) &&
   2554             fKatakanaSet->contains(c2))  {
   2555             continue;
   2556         }
   2557 
   2558         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
   2559         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
   2560              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2561              fExtendNumLetSet->contains(c2)) {
   2562                 continue;
   2563         }
   2564 
   2565         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
   2566         if (fExtendNumLetSet->contains(c1) &&
   2567                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
   2568                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
   2569             continue;
   2570         }
   2571 
   2572         // WB 14  (E_Base | EBG) x E_Modifier
   2573         if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
   2574             continue;
   2575         }
   2576 
   2577         // Rule 15 - 17   Group pairs of Regional Indicators.
   2578         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
   2579             break;
   2580         }
   2581         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2582             continue;
   2583         }
   2584 
   2585         // Rule 999.  Break found here.
   2586         break;
   2587     }
   2588 
   2589     breakPos = p2;
   2590     return breakPos;
   2591 }
   2592 
   2593 
   2594 UVector  *RBBIWordMonkey::charClasses() {
   2595     return fSets;
   2596 }
   2597 
   2598 
   2599 RBBIWordMonkey::~RBBIWordMonkey() {
   2600     delete fSets;
   2601     delete fCRSet;
   2602     delete fLFSet;
   2603     delete fNewlineSet;
   2604     delete fKatakanaSet;
   2605     delete fHebrew_LetterSet;
   2606     delete fALetterSet;
   2607     delete fSingle_QuoteSet;
   2608     delete fDouble_QuoteSet;
   2609     delete fMidNumLetSet;
   2610     delete fMidLetterSet;
   2611     delete fMidNumSet;
   2612     delete fNumericSet;
   2613     delete fFormatSet;
   2614     delete fExtendSet;
   2615     delete fExtendNumLetSet;
   2616     delete fRegionalIndicatorSet;
   2617     delete fDictionarySet;
   2618     delete fOtherSet;
   2619     delete fEBaseSet;
   2620     delete fEBGSet;
   2621     delete fEModifierSet;
   2622     delete fZWJSet;
   2623     delete fExtendedPictSet;
   2624     delete fEmojiNRKSet;
   2625 }
   2626 
   2627 
   2628 
   2629 
   2630 //------------------------------------------------------------------------------------------
   2631 //
   2632 //   class RBBISentMonkey      Sentence Break specific implementation
   2633 //                             of RBBIMonkeyKind.
   2634 //
   2635 //------------------------------------------------------------------------------------------
   2636 class RBBISentMonkey: public RBBIMonkeyKind {
   2637 public:
   2638     RBBISentMonkey();
   2639     virtual          ~RBBISentMonkey();
   2640     virtual  UVector *charClasses();
   2641     virtual  void     setText(const UnicodeString &s);
   2642     virtual int32_t   next(int32_t i);
   2643 private:
   2644     int               moveBack(int posFrom);
   2645     int               moveForward(int posFrom);
   2646     UChar32           cAt(int pos);
   2647 
   2648     UVector      *fSets;
   2649 
   2650     UnicodeSet  *fSepSet;
   2651     UnicodeSet  *fFormatSet;
   2652     UnicodeSet  *fSpSet;
   2653     UnicodeSet  *fLowerSet;
   2654     UnicodeSet  *fUpperSet;
   2655     UnicodeSet  *fOLetterSet;
   2656     UnicodeSet  *fNumericSet;
   2657     UnicodeSet  *fATermSet;
   2658     UnicodeSet  *fSContinueSet;
   2659     UnicodeSet  *fSTermSet;
   2660     UnicodeSet  *fCloseSet;
   2661     UnicodeSet  *fOtherSet;
   2662     UnicodeSet  *fExtendSet;
   2663 
   2664     const UnicodeString  *fText;
   2665 
   2666 };
   2667 
   2668 RBBISentMonkey::RBBISentMonkey()
   2669 {
   2670     UErrorCode  status = U_ZERO_ERROR;
   2671 
   2672     fSets            = new UVector(status);
   2673 
   2674     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2675     //                       set and made into character classes of their own.  For the monkey impl,
   2676     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2677     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2678     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2679     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2680     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2681     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2682     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2683     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2684     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2685     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2686     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2687     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2688     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2689     fOtherSet        = new UnicodeSet();
   2690 
   2691     if(U_FAILURE(status)) {
   2692       deferredStatus = status;
   2693       return;
   2694     }
   2695 
   2696     fOtherSet->complement();
   2697     fOtherSet->removeAll(*fSepSet);
   2698     fOtherSet->removeAll(*fFormatSet);
   2699     fOtherSet->removeAll(*fSpSet);
   2700     fOtherSet->removeAll(*fLowerSet);
   2701     fOtherSet->removeAll(*fUpperSet);
   2702     fOtherSet->removeAll(*fOLetterSet);
   2703     fOtherSet->removeAll(*fNumericSet);
   2704     fOtherSet->removeAll(*fATermSet);
   2705     fOtherSet->removeAll(*fSContinueSet);
   2706     fOtherSet->removeAll(*fSTermSet);
   2707     fOtherSet->removeAll(*fCloseSet);
   2708     fOtherSet->removeAll(*fExtendSet);
   2709 
   2710     fSets->addElement(fSepSet,       status);
   2711     fSets->addElement(fFormatSet,    status);
   2712     fSets->addElement(fSpSet,        status);
   2713     fSets->addElement(fLowerSet,     status);
   2714     fSets->addElement(fUpperSet,     status);
   2715     fSets->addElement(fOLetterSet,   status);
   2716     fSets->addElement(fNumericSet,   status);
   2717     fSets->addElement(fATermSet,     status);
   2718     fSets->addElement(fSContinueSet, status);
   2719     fSets->addElement(fSTermSet,     status);
   2720     fSets->addElement(fCloseSet,     status);
   2721     fSets->addElement(fOtherSet,     status);
   2722     fSets->addElement(fExtendSet,    status);
   2723 
   2724     if (U_FAILURE(status)) {
   2725         deferredStatus = status;
   2726     }
   2727 }
   2728 
   2729 
   2730 
   2731 void RBBISentMonkey::setText(const UnicodeString &s) {
   2732     fText       = &s;
   2733 }
   2734 
   2735 UVector  *RBBISentMonkey::charClasses() {
   2736     return fSets;
   2737 }
   2738 
   2739 
   2740 //  moveBack()   Find the "significant" code point preceding the index i.
   2741 //               Skips over ($Extend | $Format)* .
   2742 //
   2743 int RBBISentMonkey::moveBack(int i) {
   2744     if (i <= 0) {
   2745         return -1;
   2746     }
   2747     UChar32   c;
   2748     int32_t   j = i;
   2749     do {
   2750         j = fText->moveIndex32(j, -1);
   2751         c = fText->char32At(j);
   2752     }
   2753     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2754     return j;
   2755 
   2756  }
   2757 
   2758 
   2759 int RBBISentMonkey::moveForward(int i) {
   2760     if (i>=fText->length()) {
   2761         return fText->length();
   2762     }
   2763     UChar32   c;
   2764     int32_t   j = i;
   2765     do {
   2766         j = fText->moveIndex32(j, 1);
   2767         c = cAt(j);
   2768     }
   2769     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2770     return j;
   2771 }
   2772 
   2773 UChar32 RBBISentMonkey::cAt(int pos) {
   2774     if (pos<0 || pos>=fText->length()) {
   2775         return -1;
   2776     } else {
   2777         return fText->char32At(pos);
   2778     }
   2779 }
   2780 
   2781 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2782     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2783                               //   break position being tested.  The candidate break
   2784                               //   location is before p2.
   2785 
   2786     int     breakPos = -1;
   2787 
   2788     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2789     UChar32 c;
   2790 
   2791     if (U_FAILURE(deferredStatus)) {
   2792         return -1;
   2793     }
   2794 
   2795     // Prev break at end of string.  return DONE.
   2796     if (prevPos >= fText->length()) {
   2797         return -1;
   2798     }
   2799     p0 = p1 = p2 = p3 = prevPos;
   2800     c3 =  fText->char32At(prevPos);
   2801     c0 = c1 = c2 = 0;
   2802     (void)p0;     // Suppress set but not used warning.
   2803 
   2804     // Loop runs once per "significant" character position in the input text.
   2805     for (;;) {
   2806         // Move all of the positions forward in the input string.
   2807         p0 = p1;  c0 = c1;
   2808         p1 = p2;  c1 = c2;
   2809         p2 = p3;  c2 = c3;
   2810 
   2811         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2812         p3 = moveForward(p3);
   2813         c3 = cAt(p3);
   2814 
   2815         // Rule (3)  CR x LF
   2816         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2817             continue;
   2818         }
   2819 
   2820         // Rule (4).   Sep  <break>
   2821         if (fSepSet->contains(c1)) {
   2822             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2823             break;
   2824         }
   2825 
   2826         if (p2 >= fText->length()) {
   2827             // Reached end of string.  Always a break position.
   2828             break;
   2829         }
   2830 
   2831         if (p2 == prevPos) {
   2832             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2833             continue;
   2834         }
   2835 
   2836         // Rule (6).   ATerm x Numeric
   2837         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2838             continue;
   2839         }
   2840 
   2841         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
   2842         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
   2843                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2844             continue;
   2845         }
   2846 
   2847         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2848         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2849         //                  note to the Unicode 5.0 documents.
   2850         int p8 = p1;
   2851         while (fSpSet->contains(cAt(p8))) {
   2852             p8 = moveBack(p8);
   2853         }
   2854         while (fCloseSet->contains(cAt(p8))) {
   2855             p8 = moveBack(p8);
   2856         }
   2857         if (fATermSet->contains(cAt(p8))) {
   2858             p8=p2;
   2859             for (;;) {
   2860                 c = cAt(p8);
   2861                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2862                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2863                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2864                     break;
   2865                 }
   2866                 p8 = moveForward(p8);
   2867             }
   2868             if (fLowerSet->contains(cAt(p8))) {
   2869                 continue;
   2870             }
   2871         }
   2872 
   2873         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2874         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2875             p8 = p1;
   2876             while (fSpSet->contains(cAt(p8))) {
   2877                 p8 = moveBack(p8);
   2878             }
   2879             while (fCloseSet->contains(cAt(p8))) {
   2880                 p8 = moveBack(p8);
   2881             }
   2882             c = cAt(p8);
   2883             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2884                 continue;
   2885             }
   2886         }
   2887 
   2888         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2889         int p9 = p1;
   2890         while (fCloseSet->contains(cAt(p9))) {
   2891             p9 = moveBack(p9);
   2892         }
   2893         c = cAt(p9);
   2894         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2895             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2896                 continue;
   2897             }
   2898         }
   2899 
   2900         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2901         int p10 = p1;
   2902         while (fSpSet->contains(cAt(p10))) {
   2903             p10 = moveBack(p10);
   2904         }
   2905         while (fCloseSet->contains(cAt(p10))) {
   2906             p10 = moveBack(p10);
   2907         }
   2908         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2909             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2910                 continue;
   2911             }
   2912         }
   2913 
   2914         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2915         int p11 = p1;
   2916         if (fSepSet->contains(cAt(p11))) {
   2917             p11 = moveBack(p11);
   2918         }
   2919         while (fSpSet->contains(cAt(p11))) {
   2920             p11 = moveBack(p11);
   2921         }
   2922         while (fCloseSet->contains(cAt(p11))) {
   2923             p11 = moveBack(p11);
   2924         }
   2925         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2926             break;
   2927         }
   2928 
   2929         //  Rule (12)  Any x Any
   2930         continue;
   2931     }
   2932     breakPos = p2;
   2933     return breakPos;
   2934 }
   2935 
   2936 RBBISentMonkey::~RBBISentMonkey() {
   2937     delete fSets;
   2938     delete fSepSet;
   2939     delete fFormatSet;
   2940     delete fSpSet;
   2941     delete fLowerSet;
   2942     delete fUpperSet;
   2943     delete fOLetterSet;
   2944     delete fNumericSet;
   2945     delete fATermSet;
   2946     delete fSContinueSet;
   2947     delete fSTermSet;
   2948     delete fCloseSet;
   2949     delete fOtherSet;
   2950     delete fExtendSet;
   2951 }
   2952 
   2953 
   2954 
   2955 //-------------------------------------------------------------------------------------------
   2956 //
   2957 //  RBBILineMonkey
   2958 //
   2959 //-------------------------------------------------------------------------------------------
   2960 
   2961 class RBBILineMonkey: public RBBIMonkeyKind {
   2962 public:
   2963     RBBILineMonkey();
   2964     virtual          ~RBBILineMonkey();
   2965     virtual  UVector *charClasses();
   2966     virtual  void     setText(const UnicodeString &s);
   2967     virtual  int32_t  next(int32_t i);
   2968     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2969 private:
   2970     UVector      *fSets;
   2971 
   2972     UnicodeSet  *fBK;
   2973     UnicodeSet  *fCR;
   2974     UnicodeSet  *fLF;
   2975     UnicodeSet  *fCM;
   2976     UnicodeSet  *fNL;
   2977     UnicodeSet  *fSG;
   2978     UnicodeSet  *fWJ;
   2979     UnicodeSet  *fZW;
   2980     UnicodeSet  *fGL;
   2981     UnicodeSet  *fCB;
   2982     UnicodeSet  *fSP;
   2983     UnicodeSet  *fB2;
   2984     UnicodeSet  *fBA;
   2985     UnicodeSet  *fBB;
   2986     UnicodeSet  *fHY;
   2987     UnicodeSet  *fH2;
   2988     UnicodeSet  *fH3;
   2989     UnicodeSet  *fCL;
   2990     UnicodeSet  *fCP;
   2991     UnicodeSet  *fEX;
   2992     UnicodeSet  *fIN;
   2993     UnicodeSet  *fJL;
   2994     UnicodeSet  *fJV;
   2995     UnicodeSet  *fJT;
   2996     UnicodeSet  *fNS;
   2997     UnicodeSet  *fOP;
   2998     UnicodeSet  *fQU;
   2999     UnicodeSet  *fIS;
   3000     UnicodeSet  *fNU;
   3001     UnicodeSet  *fPO;
   3002     UnicodeSet  *fPR;
   3003     UnicodeSet  *fSY;
   3004     UnicodeSet  *fAI;
   3005     UnicodeSet  *fAL;
   3006     UnicodeSet  *fCJ;
   3007     UnicodeSet  *fHL;
   3008     UnicodeSet  *fID;
   3009     UnicodeSet  *fRI;
   3010     UnicodeSet  *fXX;
   3011     UnicodeSet  *fEB;
   3012     UnicodeSet  *fEM;
   3013     UnicodeSet  *fZJ;
   3014     UnicodeSet  *fExtendedPict;
   3015     UnicodeSet  *fEmojiNRK;
   3016 
   3017     BreakIterator        *fCharBI;
   3018     const UnicodeString  *fText;
   3019     RegexMatcher         *fNumberMatcher;
   3020 };
   3021 
   3022 RBBILineMonkey::RBBILineMonkey() :
   3023     RBBIMonkeyKind(),
   3024     fSets(NULL),
   3025 
   3026     fCharBI(NULL),
   3027     fText(NULL),
   3028     fNumberMatcher(NULL)
   3029 
   3030 {
   3031     if (U_FAILURE(deferredStatus)) {
   3032         return;
   3033     }
   3034 
   3035     UErrorCode  status = U_ZERO_ERROR;
   3036 
   3037     fSets  = new UVector(status);
   3038 
   3039     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3040     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3041     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3042     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3043     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3044     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3045     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3046     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3047     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3048     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3049     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3050     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3051     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3052     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3053     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3054     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3055     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3056     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3057     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3058     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3059     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3060     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3061     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3062     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3063     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3064     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3065     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3066     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3067     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3068     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3069     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3070     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3071     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3072     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   3073     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   3074     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3075     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   3076     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3077     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3078     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE(
   3079             "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
   3080     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
   3081     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
   3082     fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
   3083     fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
   3084 
   3085     if (U_FAILURE(status)) {
   3086         deferredStatus = status;
   3087         return;
   3088     }
   3089 
   3090     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3091     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3092     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3093 
   3094     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   3095     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
   3096 
   3097     fSets->addElement(fBK, status);
   3098     fSets->addElement(fCR, status);
   3099     fSets->addElement(fLF, status);
   3100     fSets->addElement(fCM, status);
   3101     fSets->addElement(fNL, status);
   3102     fSets->addElement(fWJ, status);
   3103     fSets->addElement(fZW, status);
   3104     fSets->addElement(fGL, status);
   3105     fSets->addElement(fCB, status);
   3106     fSets->addElement(fSP, status);
   3107     fSets->addElement(fB2, status);
   3108     fSets->addElement(fBA, status);
   3109     fSets->addElement(fBB, status);
   3110     fSets->addElement(fHY, status);
   3111     fSets->addElement(fH2, status);
   3112     fSets->addElement(fH3, status);
   3113     fSets->addElement(fCL, status);
   3114     fSets->addElement(fCP, status);
   3115     fSets->addElement(fEX, status);
   3116     fSets->addElement(fIN, status);
   3117     fSets->addElement(fJL, status);
   3118     fSets->addElement(fJT, status);
   3119     fSets->addElement(fJV, status);
   3120     fSets->addElement(fNS, status);
   3121     fSets->addElement(fOP, status);
   3122     fSets->addElement(fQU, status);
   3123     fSets->addElement(fIS, status);
   3124     fSets->addElement(fNU, status);
   3125     fSets->addElement(fPO, status);
   3126     fSets->addElement(fPR, status);
   3127     fSets->addElement(fSY, status);
   3128     fSets->addElement(fAI, status);
   3129     fSets->addElement(fAL, status);
   3130     fSets->addElement(fHL, status);
   3131     fSets->addElement(fID, status);
   3132     fSets->addElement(fWJ, status);
   3133     fSets->addElement(fRI, status);
   3134     fSets->addElement(fSG, status);
   3135     fSets->addElement(fEB, status);
   3136     fSets->addElement(fEM, status);
   3137     fSets->addElement(fZJ, status);
   3138     fSets->addElement(fExtendedPict, status);
   3139     fSets->addElement(fEmojiNRK, status);
   3140 
   3141 
   3142     const char *rules =
   3143             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
   3144             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
   3145             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
   3146             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
   3147             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
   3148             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
   3149 
   3150     fNumberMatcher = new RegexMatcher(
   3151         UnicodeString(rules, -1, US_INV), 0, status);
   3152 
   3153     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3154 
   3155     if (U_FAILURE(status)) {
   3156         deferredStatus = status;
   3157     }
   3158 }
   3159 
   3160 
   3161 void RBBILineMonkey::setText(const UnicodeString &s) {
   3162     fText       = &s;
   3163     fCharBI->setText(s);
   3164     fNumberMatcher->reset(s);
   3165 }
   3166 
   3167 //
   3168 //  rule9Adjust
   3169 //     Line Break TR rules 9 and 10 implementation.
   3170 //     This deals with combining marks and other sequences that
   3171 //     that must be treated as if they were something other than what they actually are.
   3172 //
   3173 //     This is factored out into a separate function because it must be applied twice for
   3174 //     each potential break, once to the chars before the position being checked, then
   3175 //     again to the text following the possible break.
   3176 //
   3177 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3178     if (pos == -1) {
   3179         // Invalid initial position.  Happens during the warmup iteration of the
   3180         //   main loop in next().
   3181         return;
   3182     }
   3183 
   3184     int32_t  nPos = *nextPos;
   3185 
   3186     // LB 9  Keep combining sequences together.
   3187     //  advance over any CM class chars.  Note that Line Break CM is different
   3188     //  from the normal Grapheme Extend property.
   3189     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3190           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3191         for (;;) {
   3192             *nextChar = fText->char32At(nPos);
   3193             if (!fCM->contains(*nextChar)) {
   3194                 break;
   3195             }
   3196             nPos = fText->moveIndex32(nPos, 1);
   3197         }
   3198     }
   3199 
   3200 
   3201     // LB 9 Treat X CM* as if it were x.
   3202     //       No explicit action required.
   3203 
   3204     // LB 10  Treat any remaining combining mark as AL
   3205     if (fCM->contains(*posChar)) {
   3206         *posChar = 0x41;   // thisChar = 'A';
   3207     }
   3208 
   3209     // Push the updated nextPos and nextChar back to our caller.
   3210     // This only makes a difference if posChar got bigger by consuming a
   3211     // combining sequence.
   3212     *nextPos  = nPos;
   3213     *nextChar = fText->char32At(nPos);
   3214 }
   3215 
   3216 
   3217 
   3218 int32_t RBBILineMonkey::next(int32_t startPos) {
   3219     UErrorCode status = U_ZERO_ERROR;
   3220     int32_t    pos;       //  Index of the char following a potential break position
   3221     UChar32    thisChar;  //  Character at above position "pos"
   3222 
   3223     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3224     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3225                           //   and thisChar may not be adjacent because combining
   3226                           //   characters between them will be ignored.
   3227 
   3228     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   3229     UChar32    prevCharX2;
   3230 
   3231     int32_t    nextPos;   //  Index of the next character following pos.
   3232                           //     Usually skips over combining marks.
   3233     int32_t    nextCPPos; //  Index of the code point following "pos."
   3234                           //     May point to a combining mark.
   3235     int32_t    tPos;      //  temp value.
   3236     UChar32    c;
   3237 
   3238     if (U_FAILURE(deferredStatus)) {
   3239         return -1;
   3240     }
   3241 
   3242     if (startPos >= fText->length()) {
   3243         return -1;
   3244     }
   3245 
   3246 
   3247     // Initial values for loop.  Loop will run the first time without finding breaks,
   3248     //                           while the invalid values shift out and the "this" and
   3249     //                           "prev" positions are filled in with good values.
   3250     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   3251     thisChar = prevChar  = prevCharX2 = 0;
   3252     nextPos  = nextCPPos = startPos;
   3253 
   3254 
   3255     // Loop runs once per position in the test text, until a break position
   3256     //  is found.
   3257     for (;;) {
   3258         prevPosX2 = prevPos;
   3259         prevCharX2 = prevChar;
   3260 
   3261         prevPos   = pos;
   3262         prevChar  = thisChar;
   3263 
   3264         pos       = nextPos;
   3265         thisChar  = fText->char32At(pos);
   3266 
   3267         nextCPPos = fText->moveIndex32(pos, 1);
   3268         nextPos   = nextCPPos;
   3269 
   3270         // Rule LB2 - Break at end of text.
   3271         if (pos >= fText->length()) {
   3272             break;
   3273         }
   3274 
   3275         // Rule LB 9 - adjust for combining sequences.
   3276         //             We do this one out-of-order because the adjustment does not change anything
   3277         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3278         //             be applied.
   3279         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3280         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3281         c = fText->char32At(nextPos);
   3282         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3283 
   3284         // If the loop is still warming up - if we haven't shifted the initial
   3285         //   -1 positions out of prevPos yet - loop back to advance the
   3286         //    position in the input without any further looking for breaks.
   3287         if (prevPos == -1) {
   3288             continue;
   3289         }
   3290 
   3291         // LB 4  Always break after hard line breaks,
   3292         if (fBK->contains(prevChar)) {
   3293             break;
   3294         }
   3295 
   3296         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3297         if (prevChar == 0x0d && thisChar == 0x0a) {
   3298             continue;
   3299         }
   3300         if (prevChar == 0x0d ||
   3301             prevChar == 0x0a ||
   3302             prevChar == 0x85)  {
   3303             break;
   3304         }
   3305 
   3306         // LB 6  Don't break before hard line breaks
   3307         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3308             fBK->contains(thisChar)) {
   3309                 continue;
   3310         }
   3311 
   3312 
   3313         // LB 7  Don't break before spaces or zero-width space.
   3314         if (fSP->contains(thisChar)) {
   3315             continue;
   3316         }
   3317 
   3318         if (fZW->contains(thisChar)) {
   3319             continue;
   3320         }
   3321 
   3322         // LB 8  Break after zero width space
   3323         if (fZW->contains(prevChar)) {
   3324             break;
   3325         }
   3326 
   3327         // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
   3328         //       The monkey test's way of ignoring combining characters doesn't work
   3329         //       for this rule. ZJ is also a CM. Need to get the actual character
   3330         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
   3331         {
   3332             int32_t prevIdx = fText->moveIndex32(pos, -1);
   3333             UChar32 prevC = fText->char32At(prevIdx);
   3334             if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
   3335                 continue;
   3336             }
   3337         }
   3338 
   3339         // LB 9, 10  Already done, at top of loop.
   3340         //
   3341 
   3342 
   3343         // LB 11  Do not break before or after WORD JOINER and related characters.
   3344         //    x  WJ
   3345         //    WJ  x
   3346         //
   3347         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3348             continue;
   3349         }
   3350 
   3351         // LB 12
   3352         //    GL  x
   3353         if (fGL->contains(prevChar)) {
   3354             continue;
   3355         }
   3356 
   3357         // LB 12a
   3358         //    [^SP BA HY] x GL
   3359         if (!(fSP->contains(prevChar) ||
   3360               fBA->contains(prevChar) ||
   3361               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3362             continue;
   3363         }
   3364 
   3365 
   3366 
   3367         // LB 13  Don't break before closings.
   3368         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3369         //        fall into LB 17 and the more general number regular expression.
   3370         //
   3371         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3372             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3373                                          fEX->contains(thisChar)  ||
   3374             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3375             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3376             continue;
   3377         }
   3378 
   3379         // LB 14 Don't break after OP SP*
   3380         //       Scan backwards, checking for this sequence.
   3381         //       The OP char could include combining marks, so we actually check for
   3382         //           OP CM* SP*
   3383         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3384         //       sequence into a ID char, so before scanning back through spaces,
   3385         //       verify that prevChar is indeed a space.  The prevChar variable
   3386         //       may differ from fText[prevPos]
   3387         tPos = prevPos;
   3388         if (fSP->contains(prevChar)) {
   3389             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3390                 tPos=fText->moveIndex32(tPos, -1);
   3391             }
   3392         }
   3393         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3394             tPos=fText->moveIndex32(tPos, -1);
   3395         }
   3396         if (fOP->contains(fText->char32At(tPos))) {
   3397             continue;
   3398         }
   3399 
   3400 
   3401         // LB 15    QU SP* x OP
   3402         if (fOP->contains(thisChar)) {
   3403             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3404             int tPos = prevPos;
   3405             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3406                 tPos = fText->moveIndex32(tPos, -1);
   3407             }
   3408             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3409                 tPos = fText->moveIndex32(tPos, -1);
   3410             }
   3411             if (fQU->contains(fText->char32At(tPos))) {
   3412                 continue;
   3413             }
   3414         }
   3415 
   3416 
   3417 
   3418         // LB 16   (CL | CP) SP* x NS
   3419         //    Scan backwards for SP* CM* (CL | CP)
   3420         if (fNS->contains(thisChar)) {
   3421             int tPos = prevPos;
   3422             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3423                 tPos = fText->moveIndex32(tPos, -1);
   3424             }
   3425             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3426                 tPos = fText->moveIndex32(tPos, -1);
   3427             }
   3428             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3429                 continue;
   3430             }
   3431         }
   3432 
   3433 
   3434         // LB 17        B2 SP* x B2
   3435         if (fB2->contains(thisChar)) {
   3436             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3437             tPos = prevPos;
   3438             if (fSP->contains(prevChar)) {
   3439                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3440                     tPos=fText->moveIndex32(tPos, -1);
   3441                 }
   3442             }
   3443             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3444                 tPos=fText->moveIndex32(tPos, -1);
   3445             }
   3446             if (fB2->contains(fText->char32At(tPos))) {
   3447                 continue;
   3448             }
   3449         }
   3450 
   3451 
   3452         // LB 18    break after space
   3453         if (fSP->contains(prevChar)) {
   3454             break;
   3455         }
   3456 
   3457         // LB 19
   3458         //    x   QU
   3459         //    QU  x
   3460         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3461             continue;
   3462         }
   3463 
   3464         // LB 20  Break around a CB
   3465         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3466             break;
   3467         }
   3468 
   3469         // LB 21
   3470         if (fBA->contains(thisChar) ||
   3471             fHY->contains(thisChar) ||
   3472             fNS->contains(thisChar) ||
   3473             fBB->contains(prevChar) )   {
   3474             continue;
   3475         }
   3476 
   3477         // LB 21a
   3478         //   HL (HY | BA) x
   3479         if (fHL->contains(prevCharX2) &&
   3480                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3481             continue;
   3482         }
   3483 
   3484         // LB 21b
   3485         //   SY x HL
   3486         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
   3487             continue;
   3488         }
   3489 
   3490         // LB 22
   3491         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3492             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
   3493             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3494             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
   3495             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3496             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3497             continue;
   3498         }
   3499 
   3500 
   3501         // LB 23    (AL | HL) x NU
   3502         //          NU x (AL | HL)
   3503         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
   3504             continue;
   3505         }
   3506         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3507             continue;
   3508         }
   3509 
   3510         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
   3511         //      PR x (ID | EB | EM)
   3512         //     (ID | EB | EM) x PO
   3513         if (fPR->contains(prevChar) &&
   3514                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
   3515             continue;
   3516         }
   3517         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
   3518                 fPO->contains(thisChar)) {
   3519             continue;
   3520         }
   3521 
   3522         // LB 24  Do not break between prefix and letters or ideographs.
   3523         //         (PR | PO) x (AL | HL)
   3524         //         (AL | HL) x (PR | PO)
   3525         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
   3526                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3527             continue;
   3528         }
   3529         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
   3530                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
   3531             continue;
   3532         }
   3533 
   3534 
   3535 
   3536         // LB 25    Numbers
   3537         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3538             if (U_FAILURE(status)) {
   3539                 break;
   3540             }
   3541             // Matched a number.  But could have been just a single digit, which would
   3542             //    not represent a "no break here" between prevChar and thisChar
   3543             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3544             if (numEndIdx > pos) {
   3545                 // Number match includes at least our two chars being checked
   3546                 if (numEndIdx > nextPos) {
   3547                     // Number match includes additional chars.  Update pos and nextPos
   3548                     //   so that next loop iteration will continue at the end of the number,
   3549                     //   checking for breaks between last char in number & whatever follows.
   3550                     pos = nextPos = numEndIdx;
   3551                     do {
   3552                         pos = fText->moveIndex32(pos, -1);
   3553                         thisChar = fText->char32At(pos);
   3554                     } while (fCM->contains(thisChar));
   3555                 }
   3556                 continue;
   3557             }
   3558         }
   3559 
   3560 
   3561         // LB 26 Do not break a Korean syllable.
   3562         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3563                                         fJV->contains(thisChar) ||
   3564                                         fH2->contains(thisChar) ||
   3565                                         fH3->contains(thisChar))) {
   3566                                             continue;
   3567                                         }
   3568 
   3569         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3570             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3571                 continue;
   3572         }
   3573 
   3574         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3575             fJT->contains(thisChar)) {
   3576                 continue;
   3577         }
   3578 
   3579         // LB 27 Treat a Korean Syllable Block the same as ID.
   3580         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3581             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3582             fIN->contains(thisChar)) {
   3583                 continue;
   3584             }
   3585         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3586             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3587             fPO->contains(thisChar)) {
   3588                 continue;
   3589             }
   3590         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3591             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3592                 continue;
   3593             }
   3594 
   3595 
   3596 
   3597         // LB 28  Do not break between alphabetics ("at").
   3598         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3599             continue;
   3600         }
   3601 
   3602         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3603         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3604             continue;
   3605         }
   3606 
   3607         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3608         //          (AL | NU) x OP
   3609         //          CP x (AL | NU)
   3610         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3611             continue;
   3612         }
   3613         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3614             continue;
   3615         }
   3616 
   3617         // LB30a    RI RI <break> RI
   3618         //             RI    x    RI
   3619         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3620             break;
   3621         }
   3622         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3623             continue;
   3624         }
   3625 
   3626         // LB30b    Emoji Base x Emoji Modifier
   3627         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
   3628             continue;
   3629         }
   3630 
   3631         // LB 31    Break everywhere else
   3632         break;
   3633 
   3634     }
   3635 
   3636     return pos;
   3637 }
   3638 
   3639 
   3640 UVector  *RBBILineMonkey::charClasses() {
   3641     return fSets;
   3642 }
   3643 
   3644 
   3645 RBBILineMonkey::~RBBILineMonkey() {
   3646     delete fSets;
   3647 
   3648     delete fBK;
   3649     delete fCR;
   3650     delete fLF;
   3651     delete fCM;
   3652     delete fNL;
   3653     delete fWJ;
   3654     delete fZW;
   3655     delete fGL;
   3656     delete fCB;
   3657     delete fSP;
   3658     delete fB2;
   3659     delete fBA;
   3660     delete fBB;
   3661     delete fHY;
   3662     delete fH2;
   3663     delete fH3;
   3664     delete fCL;
   3665     delete fCP;
   3666     delete fEX;
   3667     delete fIN;
   3668     delete fJL;
   3669     delete fJV;
   3670     delete fJT;
   3671     delete fNS;
   3672     delete fOP;
   3673     delete fQU;
   3674     delete fIS;
   3675     delete fNU;
   3676     delete fPO;
   3677     delete fPR;
   3678     delete fSY;
   3679     delete fAI;
   3680     delete fAL;
   3681     delete fCJ;
   3682     delete fHL;
   3683     delete fID;
   3684     delete fRI;
   3685     delete fSG;
   3686     delete fXX;
   3687     delete fEB;
   3688     delete fEM;
   3689     delete fZJ;
   3690     delete fExtendedPict;
   3691     delete fEmojiNRK;
   3692 
   3693     delete fCharBI;
   3694     delete fNumberMatcher;
   3695 }
   3696 
   3697 
   3698 //-------------------------------------------------------------------------------------------
   3699 //
   3700 //   TestMonkey
   3701 //
   3702 //     params
   3703 //       seed=nnnnn        Random number starting seed.
   3704 //                         Setting the seed allows errors to be reproduced.
   3705 //       loop=nnn          Looping count.  Controls running time.
   3706 //                         -1:  run forever.
   3707 //                          0 or greater:  run length.
   3708 //
   3709 //       type = char | word | line | sent | title
   3710 //
   3711 //  Example:
   3712 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
   3713 //
   3714 //-------------------------------------------------------------------------------------------
   3715 
   3716 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3717     int32_t val = defaultVal;
   3718     name.append(" *= *(-?\\d+)");
   3719     UErrorCode status = U_ZERO_ERROR;
   3720     RegexMatcher m(name, params, 0, status);
   3721     if (m.find()) {
   3722         // The param exists.  Convert the string to an int.
   3723         char valString[100];
   3724         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3725         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3726             paramLength = (int32_t)(sizeof(valString)-2);
   3727         }
   3728         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3729         val = strtol(valString,  NULL, 10);
   3730 
   3731         // Delete this parameter from the params string.
   3732         m.reset();
   3733         params = m.replaceFirst("", status);
   3734     }
   3735     U_ASSERT(U_SUCCESS(status));
   3736     return val;
   3737 }
   3738 #endif
   3739 
   3740 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3741 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3742                                     BreakIterator *bi,
   3743                                     int expected[],
   3744                                     int expectedcount)
   3745 {
   3746     int count = 0;
   3747     int i = 0;
   3748     int forward[50];
   3749     bi->setText(ustr);
   3750     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3751         forward[count] = i;
   3752         if (count < expectedcount && expected[count] != i) {
   3753             test->errln("break forward test failed: expected %d but got %d",
   3754                         expected[count], i);
   3755             break;
   3756         }
   3757         count ++;
   3758     }
   3759     if (count != expectedcount) {
   3760         printStringBreaks(ustr, expected, expectedcount);
   3761         test->errln("break forward test failed: missed %d match",
   3762                     expectedcount - count);
   3763         return;
   3764     }
   3765     // testing boundaries
   3766     for (i = 1; i < expectedcount; i ++) {
   3767         int j = expected[i - 1];
   3768         if (!bi->isBoundary(j)) {
   3769             printStringBreaks(ustr, expected, expectedcount);
   3770             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3771             return;
   3772         }
   3773         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3774             if (bi->isBoundary(j)) {
   3775                 printStringBreaks(ustr, expected, expectedcount);
   3776                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3777                 return;
   3778             }
   3779         }
   3780     }
   3781 
   3782     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3783         count --;
   3784         if (forward[count] != i) {
   3785             printStringBreaks(ustr, expected, expectedcount);
   3786             test->errln("happy break test previous() failed: expected %d but got %d",
   3787                         forward[count], i);
   3788             break;
   3789         }
   3790     }
   3791     if (count != 0) {
   3792         printStringBreaks(ustr, expected, expectedcount);
   3793         test->errln("break test previous() failed: missed a match");
   3794         return;
   3795     }
   3796 
   3797     // testing preceding
   3798     for (i = 0; i < expectedcount - 1; i ++) {
   3799         // int j = expected[i] + 1;
   3800         int j = ustr.moveIndex32(expected[i], 1);
   3801         for (; j <= expected[i + 1]; j ++) {
   3802             if (bi->preceding(j) != expected[i]) {
   3803                 printStringBreaks(ustr, expected, expectedcount);
   3804                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3805                 return;
   3806             }
   3807         }
   3808     }
   3809 }
   3810 #endif
   3811 
   3812 void RBBITest::TestWordBreaks(void)
   3813 {
   3814 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3815 
   3816     Locale        locale("en");
   3817     UErrorCode    status = U_ZERO_ERROR;
   3818     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3819     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3820     // Replaced any C+J characters in a row with a random sequence of characters
   3821     // of the same length to make our C+J segmentation not get in the way.
   3822     static const char *strlist[] =
   3823     {
   3824     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3825     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3826     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3827     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3828     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3829     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3830     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3831     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3832     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3833     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3834     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3835     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3836     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3837     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3838     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3839     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3840     "\\u0027\\u11af\\U000e0057\\u0602",
   3841     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3842     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3843     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3844     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3845     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3846     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3847     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3848     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3849     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3850     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3851     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3852     "\\ua183\\u102d\\u0bec\\u003a",
   3853     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3854     "\\u003a\\u0e57\\u0fad\\u002e",
   3855     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3856     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3857     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3858     "\\u003a\\u0664\\u00b7\\u1fba",
   3859     "\\u003b\\u0027\\u00b7\\u47a3",
   3860     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3861     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3862     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3863     };
   3864     int loop;
   3865     if (U_FAILURE(status)) {
   3866         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3867         return;
   3868     }
   3869     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   3870         // printf("looping %d\n", loop);
   3871         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3872         // RBBICharMonkey monkey;
   3873         RBBIWordMonkey monkey;
   3874 
   3875         int expected[50];
   3876         int expectedcount = 0;
   3877 
   3878         monkey.setText(ustr);
   3879         int i;
   3880         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3881             expected[expectedcount ++] = i;
   3882         }
   3883 
   3884         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3885     }
   3886     delete bi;
   3887 #endif
   3888 }
   3889 
   3890 void RBBITest::TestWordBoundary(void)
   3891 {
   3892     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3893     Locale        locale("en");
   3894     UErrorCode    status = U_ZERO_ERROR;
   3895     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3896     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3897     UChar         str[50];
   3898     static const char *strlist[] =
   3899     {
   3900     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3901     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3902     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3903     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3904     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3905     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3906     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3907     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3908     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3909     "\\u0027\\u11af\\U000e0057\\u0602",
   3910     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3911     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3912     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3913     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3914     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3915     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3916     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3917     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3918     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3919     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3920     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3921     "\\ua183\\u102d\\u0bec\\u003a",
   3922     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3923     "\\u003a\\u0e57\\u0fad\\u002e",
   3924     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3925     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3926     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3927     "\\u003a\\u0664\\u00b7\\u1fba",
   3928     "\\u003b\\u0027\\u00b7\\u47a3",
   3929     };
   3930     int loop;
   3931     if (U_FAILURE(status)) {
   3932         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3933         return;
   3934     }
   3935     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   3936         // printf("looping %d\n", loop);
   3937         u_unescape(strlist[loop], str, 20);
   3938         UnicodeString ustr(str);
   3939         int forward[50];
   3940         int count = 0;
   3941 
   3942         bi->setText(ustr);
   3943         int prev = 0;
   3944         int i;
   3945         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3946             forward[count ++] = i;
   3947             if (i > prev) {
   3948                 int j;
   3949                 for (j = prev + 1; j < i; j ++) {
   3950                     if (bi->isBoundary(j)) {
   3951                         printStringBreaks(ustr, forward, count);
   3952                         errln("happy boundary test failed: expected %d not a boundary",
   3953                                j);
   3954                         return;
   3955                     }
   3956                 }
   3957             }
   3958             if (!bi->isBoundary(i)) {
   3959                 printStringBreaks(ustr, forward, count);
   3960                 errln("happy boundary test failed: expected %d a boundary",
   3961                        i);
   3962                 return;
   3963             }
   3964             prev = i;
   3965         }
   3966     }
   3967     delete bi;
   3968 }
   3969 
   3970 void RBBITest::TestLineBreaks(void)
   3971 {
   3972 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3973     Locale        locale("en");
   3974     UErrorCode    status = U_ZERO_ERROR;
   3975     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3976     const int32_t  STRSIZE = 50;
   3977     UChar         str[STRSIZE];
   3978     static const char *strlist[] =
   3979     {
   3980      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3981      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3982              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3983      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3984              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3985      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3986      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3987      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3988      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3989      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3990      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3991      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3992      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3993      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3994      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3995      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3996      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3997      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3998      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3999      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4000      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4001      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4002      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4003      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4004      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4005      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4006      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4007      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4008      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4009      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4010      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4011      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4012      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4013      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4014      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4015      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4016      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4017      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4018      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4019          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4020     };
   4021     int loop;
   4022     TEST_ASSERT_SUCCESS(status);
   4023     if (U_FAILURE(status)) {
   4024         return;
   4025     }
   4026     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   4027         // printf("looping %d\n", loop);
   4028         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4029         if (t >= STRSIZE) {
   4030             TEST_ASSERT(FALSE);
   4031             continue;
   4032         }
   4033 
   4034 
   4035         UnicodeString ustr(str);
   4036         RBBILineMonkey monkey;
   4037         if (U_FAILURE(monkey.deferredStatus)) {
   4038             continue;
   4039         }
   4040 
   4041         const int EXPECTEDSIZE = 50;
   4042         int expected[EXPECTEDSIZE];
   4043         int expectedcount = 0;
   4044 
   4045         monkey.setText(ustr);
   4046         int i;
   4047         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4048             if (expectedcount >= EXPECTEDSIZE) {
   4049                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4050                 return;
   4051             }
   4052             expected[expectedcount ++] = i;
   4053         }
   4054 
   4055         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4056     }
   4057     delete bi;
   4058 #endif
   4059 }
   4060 
   4061 void RBBITest::TestSentBreaks(void)
   4062 {
   4063 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4064     Locale        locale("en");
   4065     UErrorCode    status = U_ZERO_ERROR;
   4066     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4067     UChar         str[200];
   4068     static const char *strlist[] =
   4069     {
   4070      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4071      "This\n",
   4072      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4073      "\"Sentence ending with a quote.\" Bye.",
   4074      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4075      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4076      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4077      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4078      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4079      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4080      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4081              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4082              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4083              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4084      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4085              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4086              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4087              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4088              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4089              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4090     };
   4091     int loop;
   4092     if (U_FAILURE(status)) {
   4093         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4094         return;
   4095     }
   4096     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
   4097         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
   4098         UnicodeString ustr(str);
   4099 
   4100         RBBISentMonkey monkey;
   4101         if (U_FAILURE(monkey.deferredStatus)) {
   4102             continue;
   4103         }
   4104 
   4105         const int EXPECTEDSIZE = 50;
   4106         int expected[EXPECTEDSIZE];
   4107         int expectedcount = 0;
   4108 
   4109         monkey.setText(ustr);
   4110         int i;
   4111         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4112             if (expectedcount >= EXPECTEDSIZE) {
   4113                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4114                 return;
   4115             }
   4116             expected[expectedcount ++] = i;
   4117         }
   4118 
   4119         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4120     }
   4121     delete bi;
   4122 #endif
   4123 }
   4124 
   4125 void RBBITest::TestMonkey() {
   4126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4127 
   4128     UErrorCode     status    = U_ZERO_ERROR;
   4129     int32_t        loopCount = 500;
   4130     int32_t        seed      = 1;
   4131     UnicodeString  breakType = "all";
   4132     Locale         locale("en");
   4133     UBool          useUText  = FALSE;
   4134 
   4135     if (quick == FALSE) {
   4136         loopCount = 10000;
   4137     }
   4138 
   4139     if (fTestParams) {
   4140         UnicodeString p(fTestParams);
   4141         loopCount = getIntParam("loop", p, loopCount);
   4142         seed      = getIntParam("seed", p, seed);
   4143 
   4144         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4145         if (m.find()) {
   4146             breakType = m.group(1, status);
   4147             m.reset();
   4148             p = m.replaceFirst("", status);
   4149         }
   4150 
   4151         RegexMatcher u(" *utext", p, 0, status);
   4152         if (u.find()) {
   4153             useUText = TRUE;
   4154             u.reset();
   4155             p = u.replaceFirst("", status);
   4156         }
   4157 
   4158 
   4159         // m.reset(p);
   4160         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4161             // Each option is stripped out of the option string as it is processed.
   4162             // All options have been checked.  The option string should have been completely emptied..
   4163             char buf[100];
   4164             p.extract(buf, sizeof(buf), NULL, status);
   4165             buf[sizeof(buf)-1] = 0;
   4166             errln("Unrecognized or extra parameter:  %s\n", buf);
   4167             return;
   4168         }
   4169 
   4170     }
   4171 
   4172     if (breakType == "char" || breakType == "all") {
   4173         RBBICharMonkey  m;
   4174         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4175         if (U_SUCCESS(status)) {
   4176             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4177             if (breakType == "all" && useUText==FALSE) {
   4178                 // Also run a quick test with UText when "all" is specified
   4179                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4180             }
   4181         }
   4182         else {
   4183             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4184         }
   4185         delete bi;
   4186     }
   4187 
   4188     if (breakType == "word" || breakType == "all") {
   4189         logln("Word Break Monkey Test");
   4190         RBBIWordMonkey  m;
   4191         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4192         if (U_SUCCESS(status)) {
   4193             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4194         }
   4195         else {
   4196             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4197         }
   4198         delete bi;
   4199     }
   4200 
   4201     if (breakType == "line" || breakType == "all") {
   4202         logln("Line Break Monkey Test");
   4203         RBBILineMonkey  m;
   4204         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4205         if (loopCount >= 10) {
   4206             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4207         }
   4208         if (U_SUCCESS(status)) {
   4209             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4210         }
   4211         else {
   4212             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4213         }
   4214         delete bi;
   4215     }
   4216 
   4217     if (breakType == "sent" || breakType == "all"  ) {
   4218         logln("Sentence Break Monkey Test");
   4219         RBBISentMonkey  m;
   4220         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4221         if (loopCount >= 10) {
   4222             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4223         }
   4224         if (U_SUCCESS(status)) {
   4225             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4226         }
   4227         else {
   4228             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4229         }
   4230         delete bi;
   4231     }
   4232 
   4233 #endif
   4234 }
   4235 
   4236 //
   4237 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4238 //    Parameters:
   4239 //       bi      - the break iterator to use
   4240 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4241 //       name    - Name of test (char, word, etc.) for use in error messages
   4242 //       seed    - Seed for starting random number generator (parameter from user)
   4243 //       numIterations
   4244 //
   4245 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4246                          int32_t numIterations, UBool useUText) {
   4247 
   4248 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4249 
   4250     const int32_t    TESTSTRINGLEN = 500;
   4251     UnicodeString    testText;
   4252     int32_t          numCharClasses;
   4253     UVector          *chClasses;
   4254     int              expected[TESTSTRINGLEN*2 + 1];
   4255     int              expectedCount = 0;
   4256     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4257     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4258     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4259     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4260     char             followingBreaks[TESTSTRINGLEN*2+1];
   4261     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4262     int              i;
   4263     int              loopCount = 0;
   4264 
   4265     m_seed = seed;
   4266 
   4267     numCharClasses = mk.charClasses()->size();
   4268     chClasses      = mk.charClasses();
   4269 
   4270     // Check for errors that occured during the construction of the MonkeyKind object.
   4271     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4272     //  and is not visible outside of RBBITest :-(
   4273     if (U_FAILURE(mk.deferredStatus)) {
   4274         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4275         return;
   4276     }
   4277 
   4278     // Verify that the character classes all have at least one member.
   4279     for (i=0; i<numCharClasses; i++) {
   4280         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4281         if (s == NULL || s->size() == 0) {
   4282             errln("Character Class #%d is null or of zero size.", i);
   4283             return;
   4284         }
   4285     }
   4286 
   4287     while (loopCount < numIterations || numIterations == -1) {
   4288         if (numIterations == -1 && loopCount % 10 == 0) {
   4289             // If test is running in an infinite loop, display a periodic tic so
   4290             //   we can tell that it is making progress.
   4291             fprintf(stderr, ".");
   4292         }
   4293         // Save current random number seed, so that we can recreate the random numbers
   4294         //   for this loop iteration in event of an error.
   4295         seed = m_seed;
   4296 
   4297         // Populate a test string with data.
   4298         testText.truncate(0);
   4299         for (i=0; i<TESTSTRINGLEN; i++) {
   4300             int32_t  aClassNum = m_rand() % numCharClasses;
   4301             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4302             int32_t   charIdx = m_rand() % classSet->size();
   4303             UChar32   c = classSet->charAt(charIdx);
   4304             if (c < 0) {   // TODO:  deal with sets containing strings.
   4305                 errln("%s:%d c < 0", __FILE__, __LINE__);
   4306                 break;
   4307             }
   4308             // Do not assemble a supplementary character from randomly generated separate surrogates.
   4309             //   (It could be a dictionary character)
   4310             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
   4311                 continue;
   4312             }
   4313 
   4314             testText.append(c);
   4315         }
   4316 
   4317         // Calculate the expected results for this test string.
   4318         mk.setText(testText);
   4319         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4320         expectedBreaks[0] = 1;
   4321         int32_t breakPos = 0;
   4322         expectedCount = 0;
   4323         for (;;) {
   4324             breakPos = mk.next(breakPos);
   4325             if (breakPos == -1) {
   4326                 break;
   4327             }
   4328             if (breakPos > testText.length()) {
   4329                 errln("breakPos > testText.length()");
   4330             }
   4331             expectedBreaks[breakPos] = 1;
   4332             U_ASSERT(expectedCount<testText.length());
   4333             expected[expectedCount ++] = breakPos;
   4334             (void)expected;   // Set but not used warning.
   4335                               // TODO (andy): check it out.
   4336         }
   4337 
   4338         // Find the break positions using forward iteration
   4339         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4340         if (useUText) {
   4341             UErrorCode status = U_ZERO_ERROR;
   4342             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4343             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4344             bi->setText(testUText, status);
   4345             TEST_ASSERT_SUCCESS(status);
   4346             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4347                                       //  This UText can be closed immediately, so long as the
   4348                                       //  testText string continues to exist.
   4349         } else {
   4350             bi->setText(testText);
   4351         }
   4352 
   4353         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4354             if (i < 0 || i > testText.length()) {
   4355                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4356                 break;
   4357             }
   4358             forwardBreaks[i] = 1;
   4359         }
   4360 
   4361         // Find the break positions using reverse iteration
   4362         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4363         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4364             if (i < 0 || i > testText.length()) {
   4365                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4366                 break;
   4367             }
   4368             reverseBreaks[i] = 1;
   4369         }
   4370 
   4371         // Find the break positions using isBoundary() tests.
   4372         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4373         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4374         for (i=0; i<=testText.length(); i++) {
   4375             isBoundaryBreaks[i] = bi->isBoundary(i);
   4376         }
   4377 
   4378 
   4379         // Find the break positions using the following() function.
   4380         // printf(".");
   4381         memset(followingBreaks, 0, sizeof(followingBreaks));
   4382         int32_t   lastBreakPos = 0;
   4383         followingBreaks[0] = 1;
   4384         for (i=0; i<testText.length(); i++) {
   4385             breakPos = bi->following(i);
   4386             if (breakPos <= i ||
   4387                 breakPos < lastBreakPos ||
   4388                 breakPos > testText.length() ||
   4389                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4390                 errln("%s break monkey test: "
   4391                     "Out of range value returned by BreakIterator::following().\n"
   4392                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4393                          name, seed, i, breakPos, lastBreakPos);
   4394                 break;
   4395             }
   4396             followingBreaks[breakPos] = 1;
   4397             lastBreakPos = breakPos;
   4398         }
   4399 
   4400         // Find the break positions using the preceding() function.
   4401         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4402         lastBreakPos = testText.length();
   4403         precedingBreaks[testText.length()] = 1;
   4404         for (i=testText.length(); i>0; i--) {
   4405             breakPos = bi->preceding(i);
   4406             if (breakPos >= i ||
   4407                 breakPos > lastBreakPos ||
   4408                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4409                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4410                 errln("%s break monkey test: "
   4411                     "Out of range value returned by BreakIterator::preceding().\n"
   4412                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4413                     name,  i, breakPos, lastBreakPos);
   4414                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4415                     precedingBreaks[i] = 2;   // Forces an error.
   4416                 }
   4417             } else {
   4418                 if (breakPos >= 0) {
   4419                     precedingBreaks[breakPos] = 1;
   4420                 }
   4421                 lastBreakPos = breakPos;
   4422             }
   4423         }
   4424 
   4425         // Compare the expected and actual results.
   4426         for (i=0; i<=testText.length(); i++) {
   4427             const char *errorType = NULL;
   4428             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4429                 errorType = "next()";
   4430             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4431                 errorType = "previous()";
   4432             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4433                 errorType = "isBoundary()";
   4434             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4435                 errorType = "following()";
   4436             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4437                 errorType = "preceding()";
   4438             }
   4439 
   4440 
   4441             if (errorType != NULL) {
   4442                 // Format a range of the test text that includes the failure as
   4443                 //  a data item that can be included in the rbbi test data file.
   4444 
   4445                 // Start of the range is the last point where expected and actual results
   4446                 //   both agreed that there was a break position.
   4447                 int startContext = i;
   4448                 int32_t count = 0;
   4449                 for (;;) {
   4450                     if (startContext==0) { break; }
   4451                     startContext --;
   4452                     if (expectedBreaks[startContext] != 0) {
   4453                         if (count == 2) break;
   4454                         count ++;
   4455                     }
   4456                 }
   4457 
   4458                 // End of range is two expected breaks past the start position.
   4459                 int endContext = i + 1;
   4460                 int ci;
   4461                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4462                     for (;;) {
   4463                         if (endContext >= testText.length()) {break;}
   4464                         if (expectedBreaks[endContext-1] != 0) {
   4465                             if (count == 0) break;
   4466                             count --;
   4467                         }
   4468                         endContext ++;
   4469                     }
   4470                 }
   4471 
   4472                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4473                 UnicodeString errorText = "<data>";
   4474                 /***if (strcmp(errorType, "next()") == 0) {
   4475                     startContext = 0;
   4476                     endContext = testText.length();
   4477 
   4478                     printStringBreaks(testText, expected, expectedCount);
   4479                 }***/
   4480 
   4481                 for (ci=startContext; ci<endContext;) {
   4482                     UnicodeString hexChars("0123456789abcdef");
   4483                     UChar32  c;
   4484                     int      bn;
   4485                     c = testText.char32At(ci);
   4486                     if (ci == i) {
   4487                         // This is the location of the error.
   4488                         errorText.append("<?>");
   4489                     } else if (expectedBreaks[ci] != 0) {
   4490                         // This a non-error expected break position.
   4491                         errorText.append("\\");
   4492                     }
   4493                     if (c < 0x10000) {
   4494                         errorText.append("\\u");
   4495                         for (bn=12; bn>=0; bn-=4) {
   4496                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4497                         }
   4498                     } else {
   4499                         errorText.append("\\U");
   4500                         for (bn=28; bn>=0; bn-=4) {
   4501                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4502                         }
   4503                     }
   4504                     ci = testText.moveIndex32(ci, 1);
   4505                 }
   4506                 errorText.append("\\");
   4507                 errorText.append("</data>\n");
   4508 
   4509                 // Output the error
   4510                 char  charErrorTxt[500];
   4511                 UErrorCode status = U_ZERO_ERROR;
   4512                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4513                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4514                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4515 
   4516                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4517                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4518                     errorType, seed, i, charErrorTxt);
   4519                 break;
   4520             }
   4521         }
   4522 
   4523         loopCount++;
   4524     }
   4525 #endif
   4526 }
   4527 
   4528 
   4529 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4530 //             This test checks the initial patch,
   4531 //             which is to just keep it from crashing.  Correct word boundaries
   4532 //             await a proper fix to the dictionary code.
   4533 //
   4534 void RBBITest::TestBug5532(void)  {
   4535    // Text includes a mixture of Thai and Latin.
   4536    const unsigned char utf8Data[] = {
   4537            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4538            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4539            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4540            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4541            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4542            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4543            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4544            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4545            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4546            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4547            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4548 
   4549     UErrorCode status = U_ZERO_ERROR;
   4550     UText utext=UTEXT_INITIALIZER;
   4551     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4552     TEST_ASSERT_SUCCESS(status);
   4553 
   4554     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4555     TEST_ASSERT_SUCCESS(status);
   4556     if (U_SUCCESS(status)) {
   4557         bi->setText(&utext, status);
   4558         TEST_ASSERT_SUCCESS(status);
   4559 
   4560         int32_t breakCount = 0;
   4561         int32_t previousBreak = -1;
   4562         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4563             // For now, just make sure that the break iterator doesn't hang.
   4564             TEST_ASSERT(previousBreak < bi->current());
   4565             previousBreak = bi->current();
   4566         }
   4567         TEST_ASSERT(breakCount > 0);
   4568     }
   4569     delete bi;
   4570     utext_close(&utext);
   4571 }
   4572 
   4573 
   4574 void RBBITest::TestBug9983(void)  {
   4575     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4576                                        "\\uFF65"  //   Other
   4577                                        "\\u309C"  //   Katakana
   4578                                        "\\uFF9F"  //   Extend
   4579                                        "\\uFF65"  //   Other
   4580                                        "\\u0020"  //   Other
   4581                                        "\\u0000").unescape();
   4582 
   4583     UErrorCode status = U_ZERO_ERROR;
   4584     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4585         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4586     TEST_ASSERT_SUCCESS(status);
   4587     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
   4588         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
   4589     TEST_ASSERT_SUCCESS(status);
   4590     if (U_FAILURE(status)) {
   4591         return;
   4592     }
   4593     int32_t offset, rstatus, iterationCount;
   4594 
   4595     brkiter->setText(text);
   4596     brkiter->last();
   4597     iterationCount = 0;
   4598     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4599         iterationCount++;
   4600         rstatus = brkiter->getRuleStatus();
   4601         (void)rstatus;     // Suppress set but not used warning.
   4602         if (iterationCount >= 10) {
   4603            break;
   4604         }
   4605     }
   4606     TEST_ASSERT(iterationCount == 6);
   4607 
   4608     brkiterPOSIX->setText(text);
   4609     brkiterPOSIX->last();
   4610     iterationCount = 0;
   4611     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
   4612         iterationCount++;
   4613         rstatus = brkiterPOSIX->getRuleStatus();
   4614         (void)rstatus;     // Suppress set but not used warning.
   4615         if (iterationCount >= 10) {
   4616            break;
   4617         }
   4618     }
   4619     TEST_ASSERT(iterationCount == 6);
   4620 }
   4621 
   4622 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
   4623 //
   4624 void RBBITest::TestBug7547() {
   4625     UnicodeString rules;
   4626     UErrorCode status = U_ZERO_ERROR;
   4627     UParseError parseError;
   4628     RuleBasedBreakIterator breakIterator(rules, parseError, status);
   4629     if (status != U_BRK_RULE_SYNTAX) {
   4630         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
   4631     }
   4632     if (parseError.line != 1 || parseError.offset != 0) {
   4633         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
   4634     }
   4635 }
   4636 
   4637 
   4638 void RBBITest::TestBug12797() {
   4639     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
   4640     UErrorCode status = U_ZERO_ERROR;
   4641     UParseError parseError;
   4642     RuleBasedBreakIterator bi(rules, parseError, status);
   4643     if (U_FAILURE(status)) {
   4644         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
   4645         return;
   4646     }
   4647     UnicodeString text = "abc";
   4648     bi.setText(text);
   4649     bi.first();
   4650     int32_t boundary = bi.next();
   4651     if (boundary != 3) {
   4652         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
   4653     }
   4654 }
   4655 
   4656 void RBBITest::TestBug12918() {
   4657     // This test triggers an assertion failure in dictbe.cpp
   4658     const UChar crasherString[] = { 0x3325, 0x4a16, 0 };
   4659     UErrorCode status = U_ZERO_ERROR;
   4660     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
   4661     if (U_FAILURE(status)) {
   4662         errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
   4663         return;
   4664     }
   4665     ubrk_first(iter);
   4666     int32_t pos = 0;
   4667     int32_t lastPos = -1;
   4668     while((pos = ubrk_next(iter)) != UBRK_DONE) {
   4669         if (pos <= lastPos) {
   4670             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
   4671             break;
   4672         }
   4673     }
   4674     ubrk_close(iter);
   4675 }
   4676 
   4677 //
   4678 //  TestDebug    -  A place-holder test for debugging purposes.
   4679 //                  For putting in fragments of other tests that can be invoked
   4680 //                  for tracing  without a lot of unwanted extra stuff happening.
   4681 //
   4682 void RBBITest::TestDebug(void) {
   4683 
   4684 }
   4685 
   4686 void RBBITest::TestProperties() {
   4687     UErrorCode errorCode = U_ZERO_ERROR;
   4688     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4689     if (!prependSet.isEmpty()) {
   4690         errln(
   4691             "[:GCB=Prepend:] is not empty any more. "
   4692             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4693             "change this test to the opposite condition.");
   4694     }
   4695 }
   4696 
   4697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4698