Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2013, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 #include "unicode/regex.h"
     28 #endif
     29 #include "unicode/ustring.h"
     30 #include "unicode/utext.h"
     31 #include "intltest.h"
     32 #include "rbbitst.h"
     33 #include <string.h>
     34 #include "uvector.h"
     35 #include "uvectr32.h"
     36 #include <string.h>
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include "unicode/numfmt.h"
     40 #include "unicode/uscript.h"
     41 
     42 #define TEST_ASSERT(x) {if (!(x)) { \
     43     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     44 
     45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     46     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     47 
     48 
     49 //---------------------------------------------
     50 // runIndexedTest
     51 //---------------------------------------------
     52 
     53 
     54 //  Note:  Before adding new tests to this file, check whether the desired test data can
     55 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
     56 //         it's much less work than writing a new test, diagnostic output in the event of failures
     57 //         is good, and the test data file is shared with ICU4J, so eventually the test
     58 //         will run there as well, without additional effort.
     59 
     60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     61 {
     62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     63 
     64     switch (index) {
     65 #if !UCONFIG_NO_FILE_IO
     66         case 0: name = "TestBug4153072";
     67             if(exec) TestBug4153072();                         break;
     68 #else
     69         case 0: name = "skip";
     70             break;
     71 #endif
     72 
     73         case 1: name = "skip";
     74             break;
     75         case 2: name = "TestStatusReturn";
     76             if(exec) TestStatusReturn();                       break;
     77 
     78 #if !UCONFIG_NO_FILE_IO
     79         case 3: name = "TestUnicodeFiles";
     80             if(exec) TestUnicodeFiles();                       break;
     81         case 4: name = "TestEmptyString";
     82             if(exec) TestEmptyString();                        break;
     83 #else
     84         case 3: case 4: name = "skip";
     85             break;
     86 #endif
     87 
     88         case 5: name = "TestGetAvailableLocales";
     89             if(exec) TestGetAvailableLocales();                break;
     90 
     91         case 6: name = "TestGetDisplayName";
     92             if(exec) TestGetDisplayName();                     break;
     93 
     94 #if !UCONFIG_NO_FILE_IO
     95         case 7: name = "TestEndBehaviour";
     96             if(exec) TestEndBehaviour();                       break;
     97         case 8: case 9: case 10: name = "skip";
     98              break;
     99         case 11: name = "TestWordBreaks";
    100              if(exec) TestWordBreaks();                        break;
    101         case 12: name = "TestWordBoundary";
    102              if(exec) TestWordBoundary();                      break;
    103         case 13: name = "TestLineBreaks";
    104              if(exec) TestLineBreaks();                        break;
    105         case 14: name = "TestSentBreaks";
    106              if(exec) TestSentBreaks();                        break;
    107         case 15: name = "TestExtended";
    108              if(exec) TestExtended();                          break;
    109 #else
    110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    111              break;
    112 #endif
    113 
    114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    115         case 16:
    116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
    117 #else
    118         case 16:
    119              name = "skip";                                    break;
    120 #endif
    121 
    122 #if !UCONFIG_NO_FILE_IO
    123         case 17: name = "TestBug3818";
    124             if(exec) TestBug3818();                            break;
    125 #else
    126         case 17: name = "skip";
    127             break;
    128 #endif
    129 
    130         case 18: name = "skip";
    131             break;
    132         case 19: name = "TestDebug";
    133             if(exec) TestDebug();                              break;
    134         case 20: name = "skip";
    135             break;
    136 
    137 #if !UCONFIG_NO_FILE_IO
    138         case 21: name = "TestBug5775";
    139             if (exec) TestBug5775();                           break;
    140 #else
    141         case 21: name = "skip";
    142             break;
    143 #endif
    144 
    145         case 22: name = "TestBug9983";
    146             if (exec) TestBug9983();                           break;
    147         case 23: name = "TestDictRules";
    148             if (exec) TestDictRules();                         break;
    149         case 24: name = "TestBug5532";
    150             if (exec) TestBug5532();                           break;
    151         default: name = ""; break; //needed to end loop
    152     }
    153 }
    154 
    155 
    156 //---------------------------------------------------------------------------
    157 //
    158 //   class BITestData   Holds a set of Break iterator test data and results
    159 //                      Includes
    160 //                         - the string data to be broken
    161 //                         - a vector of the expected break positions.
    162 //                         - a vector of source line numbers for the data,
    163 //                               (to help see where errors occured.)
    164 //                         - The expected break tag values.
    165 //                         - Vectors of actual break positions and tag values.
    166 //                         - Functions for comparing actual with expected and
    167 //                            reporting errors.
    168 //
    169 //----------------------------------------------------------------------------
    170 class BITestData {
    171 public:
    172     UnicodeString    fDataToBreak;
    173     UVector          fExpectedBreakPositions;
    174     UVector          fExpectedTags;
    175     UVector          fLineNum;
    176     UVector          fActualBreakPositions;   // Test Results.
    177     UVector          fActualTags;
    178 
    179     BITestData(UErrorCode &status);
    180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    181     void             checkResults(const char *heading, RBBITest *test);
    182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    183     void             clearResults();
    184 };
    185 
    186 //
    187 // Constructor.
    188 //
    189 BITestData::BITestData(UErrorCode &status)
    190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    191   fActualTags(status)
    192 {
    193 }
    194 
    195 //
    196 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    197 //                 The macro form collects the line number, which is helpful
    198 //                 when tracking down failures.
    199 //
    200 //                 A null data item is inserted at the start of each test's data
    201 //                  to put the starting zero into the data list.  The position saved for
    202 //                  each non-null item is its ending position.
    203 //
    204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    206     if (U_FAILURE(status)) {return;}
    207     if (data != NULL) {
    208         fDataToBreak.append(CharsToUnicodeString(data));
    209     }
    210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    211     fExpectedTags.addElement(tag, status);
    212     fLineNum.addElement(lineNum, status);
    213 }
    214 
    215 
    216 //
    217 //  checkResults.   Compare the actual and expected break positions, report any differences.
    218 //
    219 void BITestData::checkResults(const char *heading, RBBITest *test) {
    220     int32_t   expectedIndex = 0;
    221     int32_t   actualIndex = 0;
    222 
    223     for (;;) {
    224         // If we've run through both the expected and actual results vectors, we're done.
    225         //   break out of the loop.
    226         if (expectedIndex >= fExpectedBreakPositions.size() &&
    227             actualIndex   >= fActualBreakPositions.size()) {
    228             break;
    229         }
    230 
    231 
    232         if (expectedIndex >= fExpectedBreakPositions.size()) {
    233             err(heading, test, expectedIndex-1, actualIndex);
    234             actualIndex++;
    235             continue;
    236         }
    237 
    238         if (actualIndex >= fActualBreakPositions.size()) {
    239             err(heading, test, expectedIndex, actualIndex-1);
    240             expectedIndex++;
    241             continue;
    242         }
    243 
    244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    245             err(heading, test, expectedIndex, actualIndex);
    246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    248                 actualIndex++;
    249             } else {
    250                 expectedIndex++;
    251             }
    252             continue;
    253         }
    254 
    255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    257                 heading, fLineNum.elementAt(expectedIndex),
    258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    259         }
    260 
    261         actualIndex++;
    262         expectedIndex++;
    263     }
    264 }
    265 
    266 //
    267 //  err   -  An error was found.  Report it, along with information about where the
    268 //                                incorrectly broken test data appeared in the source file.
    269 //
    270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    271 {
    272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    274     int32_t   o        = 0;
    275     int32_t   line     = fLineNum.elementAti(expectedIdx);
    276     if (expectedIdx > 0) {
    277         // The line numbers are off by one because a premature break occurs somewhere
    278         //    within the previous item, rather than at the start of the current (expected) item.
    279         //    We want to report the offset of the unexpected break from the start of
    280         //      this previous item.
    281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    282     }
    283     if (actual < expected) {
    284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    285     } else {
    286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    287     }
    288 }
    289 
    290 
    291 void BITestData::clearResults() {
    292     fActualBreakPositions.removeAllElements();
    293     fActualTags.removeAllElements();
    294 }
    295 
    296 
    297 //--------------------------------------------------------------------------------------
    298 //
    299 //    RBBITest    constructor and destructor
    300 //
    301 //--------------------------------------------------------------------------------------
    302 
    303 RBBITest::RBBITest() {
    304 }
    305 
    306 
    307 RBBITest::~RBBITest() {
    308 }
    309 
    310 //-----------------------------------------------------------------------------------
    311 //
    312 //   Test for status {tag} return value from break rules.
    313 //        TODO:  a more thorough test.
    314 //
    315 //-----------------------------------------------------------------------------------
    316 void RBBITest::TestStatusReturn() {
    317      UnicodeString rulesString1("$Letters = [:L:];\n"
    318                                   "$Numbers = [:N:];\n"
    319                                   "$Letters+{1};\n"
    320                                   "$Numbers+{2};\n"
    321                                   "Help\\ {4}/me\\!;\n"
    322                                   "[^$Letters $Numbers];\n"
    323                                   "!.*;\n", -1, US_INV);
    324      UnicodeString testString1  = "abc123..abc Help me Help me!";
    325                                 // 01234567890123456789012345678
    326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    328 
    329      UErrorCode status=U_ZERO_ERROR;
    330      UParseError    parseError;
    331 
    332      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    333      if(U_FAILURE(status)) {
    334          dataerrln("FAIL : in construction - %s", u_errorName(status));
    335      } else {
    336          int32_t  pos;
    337          int32_t  i = 0;
    338          bi->setText(testString1);
    339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    340              if (pos != bounds1[i]) {
    341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    342                  break;
    343              }
    344 
    345              int tag = bi->getRuleStatus();
    346              if (tag != brkStatus[i]) {
    347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    348                  break;
    349              }
    350              i++;
    351          }
    352      }
    353      delete bi;
    354 }
    355 
    356 
    357 static void printStringBreaks(UnicodeString ustr, int expected[],
    358                               int expectedcount)
    359 {
    360     UErrorCode status = U_ZERO_ERROR;
    361     char name[100];
    362     printf("code    alpha extend alphanum type word sent line name\n");
    363     int j;
    364     for (j = 0; j < ustr.length(); j ++) {
    365         if (expectedcount > 0) {
    366             int k;
    367             for (k = 0; k < expectedcount; k ++) {
    368                 if (j == expected[k]) {
    369                     printf("------------------------------------------------ %d\n",
    370                            j);
    371                 }
    372             }
    373         }
    374         UChar32 c = ustr.char32At(j);
    375         if (c > 0xffff) {
    376             j ++;
    377         }
    378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    380                            u_isUAlphabetic(c),
    381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    382                            u_isalnum(c),
    383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    384                                                   u_charType(c),
    385                                                   U_SHORT_PROPERTY_NAME),
    386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    387                                                   u_getIntPropertyValue(c,
    388                                                           UCHAR_WORD_BREAK),
    389                                                   U_SHORT_PROPERTY_NAME),
    390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    391                                    u_getIntPropertyValue(c,
    392                                            UCHAR_SENTENCE_BREAK),
    393                                    U_SHORT_PROPERTY_NAME),
    394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    395                                    u_getIntPropertyValue(c,
    396                                            UCHAR_LINE_BREAK),
    397                                    U_SHORT_PROPERTY_NAME),
    398                            name);
    399     }
    400 }
    401 
    402 
    403 void RBBITest::TestBug3818() {
    404     UErrorCode  status = U_ZERO_ERROR;
    405 
    406     // Four Thai words...
    407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    409     UnicodeString  thaiStr(thaiWordData);
    410 
    411     RuleBasedBreakIterator* bi =
    412         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    413     if (U_FAILURE(status) || bi == NULL) {
    414         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    415         return;
    416     }
    417     bi->setText(thaiStr);
    418 
    419     int32_t  startOfSecondWord = bi->following(1);
    420     if (startOfSecondWord != 4) {
    421         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    422             __FILE__, __LINE__, startOfSecondWord);
    423     }
    424     startOfSecondWord = bi->following(0);
    425     if (startOfSecondWord != 4) {
    426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    427             __FILE__, __LINE__, startOfSecondWord);
    428     }
    429     delete bi;
    430 }
    431 
    432 //----------------------------------------------------------------------------
    433 //
    434 // generalIteratorTest      Given a break iterator and a set of test data,
    435 //                          Run the tests and report the results.
    436 //
    437 //----------------------------------------------------------------------------
    438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    439 {
    440 
    441     bi.setText(td.fDataToBreak);
    442 
    443     testFirstAndNext(bi, td);
    444 
    445     testLastAndPrevious(bi, td);
    446 
    447     testFollowing(bi, td);
    448     testPreceding(bi, td);
    449     testIsBoundary(bi, td);
    450     doMultipleSelectionTest(bi, td);
    451 }
    452 
    453 
    454 //
    455 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    456 //                       kind of loop.
    457 //
    458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    459 {
    460     UErrorCode  status = U_ZERO_ERROR;
    461     int32_t     p;
    462     int32_t     lastP = -1;
    463     int32_t     tag;
    464 
    465     logln("Test first and next");
    466     bi.setText(td.fDataToBreak);
    467     td.clearResults();
    468 
    469     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    470         td.fActualBreakPositions.addElement(p, status);  // Save result.
    471         tag = bi.getRuleStatus();
    472         td.fActualTags.addElement(tag, status);
    473         if (p <= lastP) {
    474             // If the iterator is not making forward progress, stop.
    475             //  No need to raise an error here, it'll be detected in the normal check of results.
    476             break;
    477         }
    478         lastP = p;
    479     }
    480     td.checkResults("testFirstAndNext", this);
    481 }
    482 
    483 
    484 //
    485 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    486 //
    487 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    488 {
    489     UErrorCode  status = U_ZERO_ERROR;
    490     int32_t     p;
    491     int32_t     lastP  = 0x7ffffffe;
    492     int32_t     tag;
    493 
    494     logln("Test last and previous");
    495     bi.setText(td.fDataToBreak);
    496     td.clearResults();
    497 
    498     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    499         // Save break position.  Insert it at start of vector of results, shoving
    500         //    already-saved results further towards the end.
    501         td.fActualBreakPositions.insertElementAt(p, 0, status);
    502         // bi.previous();   // TODO:  Why does this fix things up????
    503         // bi.next();
    504         tag = bi.getRuleStatus();
    505         td.fActualTags.insertElementAt(tag, 0, status);
    506         if (p >= lastP) {
    507             // If the iterator is not making progress, stop.
    508             //  No need to raise an error here, it'll be detected in the normal check of results.
    509             break;
    510         }
    511         lastP = p;
    512     }
    513     td.checkResults("testLastAndPrevious", this);
    514 }
    515 
    516 
    517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    518 {
    519     UErrorCode  status = U_ZERO_ERROR;
    520     int32_t     p;
    521     int32_t     tag;
    522     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    523                                  //   cannot be -1; that is returned for DONE.
    524     int         i;
    525 
    526     logln("testFollowing():");
    527     bi.setText(td.fDataToBreak);
    528     td.clearResults();
    529 
    530     // Save the starting point, since we won't get that out of following.
    531     p = bi.first();
    532     td.fActualBreakPositions.addElement(p, status);  // Save result.
    533     tag = bi.getRuleStatus();
    534     td.fActualTags.addElement(tag, status);
    535 
    536     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    537         p = bi.following(i);
    538         if (p != lastP) {
    539             if (p == RuleBasedBreakIterator::DONE) {
    540                 break;
    541             }
    542             // We've reached a new break position.  Save it.
    543             td.fActualBreakPositions.addElement(p, status);  // Save result.
    544             tag = bi.getRuleStatus();
    545             td.fActualTags.addElement(tag, status);
    546             lastP = p;
    547         }
    548     }
    549     // The loop normally exits by means of the break in the middle.
    550     // Make sure that the index was at the correct position for the break iterator to have
    551     //   returned DONE.
    552     if (i != td.fDataToBreak.length()) {
    553         errln("testFollowing():  iterator returned DONE prematurely.");
    554     }
    555 
    556     // Full check of all results.
    557     td.checkResults("testFollowing", this);
    558 }
    559 
    560 
    561 
    562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    563     UErrorCode  status = U_ZERO_ERROR;
    564     int32_t     p;
    565     int32_t     tag;
    566     int32_t     lastP  = 0x7ffffffe;
    567     int         i;
    568 
    569     logln("testPreceding():");
    570     bi.setText(td.fDataToBreak);
    571     td.clearResults();
    572 
    573     p = bi.last();
    574     td.fActualBreakPositions.addElement(p, status);
    575     tag = bi.getRuleStatus();
    576     td.fActualTags.addElement(tag, status);
    577 
    578     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    579         p = bi.preceding(i);
    580         if (p != lastP) {
    581             if (p == RuleBasedBreakIterator::DONE) {
    582                 break;
    583             }
    584             // We've reached a new break position.  Save it.
    585             td.fActualBreakPositions.insertElementAt(p, 0, status);
    586             lastP = p;
    587             tag = bi.getRuleStatus();
    588             td.fActualTags.insertElementAt(tag, 0, status);
    589         }
    590     }
    591     // The loop normally exits by means of the break in the middle.
    592     // Make sure that the index was at the correct position for the break iterator to have
    593     //   returned DONE.
    594     if (i != 0) {
    595         errln("testPreceding():  iterator returned DONE prematurely.");
    596     }
    597 
    598     // Full check of all results.
    599     td.checkResults("testPreceding", this);
    600 }
    601 
    602 
    603 
    604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    605     UErrorCode  status = U_ZERO_ERROR;
    606     int         i;
    607     int32_t     tag;
    608 
    609     logln("testIsBoundary():");
    610     bi.setText(td.fDataToBreak);
    611     td.clearResults();
    612 
    613     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    614         if (bi.isBoundary(i)) {
    615             td.fActualBreakPositions.addElement(i, status);  // Save result.
    616             tag = bi.getRuleStatus();
    617             td.fActualTags.addElement(tag, status);
    618         }
    619     }
    620     td.checkResults("testIsBoundary: ", this);
    621 }
    622 
    623 
    624 
    625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    626 {
    627     iterator.setText(td.fDataToBreak);
    628 
    629     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    630     int32_t offset = iterator.first();
    631     int32_t testOffset;
    632     int32_t count = 0;
    633 
    634     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    635 
    636     if (*testIterator != iterator)
    637         errln("clone() or operator!= failed: two clones compared unequal");
    638 
    639     do {
    640         testOffset = testIterator->first();
    641         testOffset = testIterator->next(count);
    642         if (offset != testOffset)
    643             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    644 
    645         if (offset != RuleBasedBreakIterator::DONE) {
    646             count++;
    647             offset = iterator.next();
    648 
    649             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    650                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    651                 if (count > 10000 || offset == -1) {
    652                     errln("operator== failed too many times. Stopping test.");
    653                     if (offset == -1) {
    654                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    655                     }
    656                     return;
    657                 }
    658             }
    659         }
    660     } while (offset != RuleBasedBreakIterator::DONE);
    661 
    662     // now do it backwards...
    663     offset = iterator.last();
    664     count = 0;
    665 
    666     do {
    667         testOffset = testIterator->last();
    668         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    669         if (offset != testOffset)
    670             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    671 
    672         if (offset != RuleBasedBreakIterator::DONE) {
    673             count--;
    674             offset = iterator.previous();
    675         }
    676     } while (offset != RuleBasedBreakIterator::DONE);
    677 
    678     delete testIterator;
    679 }
    680 
    681 
    682 //---------------------------------------------
    683 //
    684 //     other tests
    685 //
    686 //---------------------------------------------
    687 void RBBITest::TestEmptyString()
    688 {
    689     UnicodeString text = "";
    690     UErrorCode status = U_ZERO_ERROR;
    691 
    692     BITestData x(status);
    693     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    694     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    695     if (U_FAILURE(status))
    696     {
    697         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    698         return;
    699     }
    700     generalIteratorTest(*bi, x);
    701     delete bi;
    702 }
    703 
    704 void RBBITest::TestGetAvailableLocales()
    705 {
    706     int32_t locCount = 0;
    707     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    708 
    709     if (locCount == 0)
    710         dataerrln("getAvailableLocales() returned an empty list!");
    711     // Just make sure that it's returning good memory.
    712     int32_t i;
    713     for (i = 0; i < locCount; ++i) {
    714         logln(locList[i].getName());
    715     }
    716 }
    717 
    718 //Testing the BreakIterator::getDisplayName() function
    719 void RBBITest::TestGetDisplayName()
    720 {
    721     UnicodeString   result;
    722 
    723     BreakIterator::getDisplayName(Locale::getUS(), result);
    724     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    725         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    726                 + result);
    727 
    728     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    729     if (result != "French (France)")
    730         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    731                 + result);
    732 }
    733 /**
    734  * Test End Behaviour
    735  * @bug 4068137
    736  */
    737 void RBBITest::TestEndBehaviour()
    738 {
    739     UErrorCode status = U_ZERO_ERROR;
    740     UnicodeString testString("boo.");
    741     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    742     if (U_FAILURE(status))
    743     {
    744         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    745         return;
    746     }
    747     wb->setText(testString);
    748 
    749     if (wb->first() != 0)
    750         errln("Didn't get break at beginning of string.");
    751     if (wb->next() != 3)
    752         errln("Didn't get break before period in \"boo.\"");
    753     if (wb->current() != 4 && wb->next() != 4)
    754         errln("Didn't get break at end of string.");
    755     delete wb;
    756 }
    757 /*
    758  * @bug 4153072
    759  */
    760 void RBBITest::TestBug4153072() {
    761     UErrorCode status = U_ZERO_ERROR;
    762     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    763     if (U_FAILURE(status))
    764     {
    765         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    766         return;
    767     }
    768     UnicodeString str("...Hello, World!...");
    769     int32_t begin = 3;
    770     int32_t end = str.length() - 3;
    771     UBool onBoundary;
    772 
    773     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    774     iter->adoptText(textIterator);
    775     int index;
    776     // Note: with the switch to UText, there is no way to restrict the
    777     //       iteration range to begin at an index other than zero.
    778     //       String character iterators created with a non-zero bound are
    779     //         treated by RBBI as being empty.
    780     for (index = -1; index < begin + 1; ++index) {
    781         onBoundary = iter->isBoundary(index);
    782         if (index == 0?  !onBoundary : onBoundary) {
    783             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    784                             " and begin index = " + begin);
    785         }
    786     }
    787     delete iter;
    788 }
    789 
    790 
    791 //
    792 // Test for problem reported by Ashok Matoria on 9 July 2007
    793 //    One.<kSoftHyphen><kSpace>Two.
    794 //
    795 //    Sentence break at start (0) and then on calling next() it breaks at
    796 //   'T' of "Two". Now, at this point if I do next() and
    797 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    798 //
    799 void RBBITest::TestBug5775() {
    800     UErrorCode status = U_ZERO_ERROR;
    801     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    802     TEST_ASSERT_SUCCESS(status);
    803     if (U_FAILURE(status)) {
    804         return;
    805     }
    806 // Check for status first for better handling of no data errors.
    807     TEST_ASSERT(bi != NULL);
    808     if (bi == NULL) {
    809         return;
    810     }
    811 
    812     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    813     //               01234      56789
    814     s = s.unescape();
    815     bi->setText(s);
    816     int pos = bi->next();
    817     TEST_ASSERT(pos == 6);
    818     pos = bi->next();
    819     TEST_ASSERT(pos == 10);
    820     pos = bi->previous();
    821     TEST_ASSERT(pos == 6);
    822     delete bi;
    823 }
    824 
    825 
    826 
    827 //------------------------------------------------------------------------------
    828 //
    829 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    830 //
    831 //------------------------------------------------------------------------------
    832 
    833 struct TestParams {
    834     BreakIterator   *bi;
    835     UnicodeString    dataToBreak;
    836     UVector32       *expectedBreaks;
    837     UVector32       *srcLine;
    838     UVector32       *srcCol;
    839 };
    840 
    841 void RBBITest::executeTest(TestParams *t) {
    842     int32_t    bp;
    843     int32_t    prevBP;
    844     int32_t    i;
    845 
    846     if (t->bi == NULL) {
    847         return;
    848     }
    849 
    850     t->bi->setText(t->dataToBreak);
    851     //
    852     //  Run the iterator forward
    853     //
    854     prevBP = -1;
    855     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
    856         if (prevBP ==  bp) {
    857             // Fail for lack of forward progress.
    858             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
    859                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    860             break;
    861         }
    862 
    863         // Check that there were we didn't miss an expected break between the last one
    864         //  and this one.
    865         for (i=prevBP+1; i<bp; i++) {
    866             if (t->expectedBreaks->elementAti(i) != 0) {
    867                 int expected[] = {0, i};
    868                 printStringBreaks(t->dataToBreak, expected, 2);
    869                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    870                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    871             }
    872         }
    873 
    874         // Check that the break we did find was expected
    875         if (t->expectedBreaks->elementAti(bp) == 0) {
    876             int expected[] = {0, bp};
    877             printStringBreaks(t->dataToBreak, expected, 2);
    878             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    879                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    880         } else {
    881             // The break was expected.
    882             //   Check that the {nnn} tag value is correct.
    883             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    884             if (expectedTagVal == -1) {
    885                 expectedTagVal = 0;
    886             }
    887             int32_t line = t->srcLine->elementAti(bp);
    888             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    889             if (rs != expectedTagVal) {
    890                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
    891                       "          Actual, Expected status = %4d, %4d",
    892                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    893             }
    894         }
    895 
    896 
    897         prevBP = bp;
    898     }
    899 
    900     // Verify that there were no missed expected breaks after the last one found
    901     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
    902         if (t->expectedBreaks->elementAti(i) != 0) {
    903             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    904                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    905         }
    906     }
    907 
    908     //
    909     //  Run the iterator backwards, verify that the same breaks are found.
    910     //
    911     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    912     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
    913         if (prevBP ==  bp) {
    914             // Fail for lack of progress.
    915             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
    916                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    917             break;
    918         }
    919 
    920         // Check that there were we didn't miss an expected break between the last one
    921         //  and this one.  (UVector returns zeros for index out of bounds.)
    922         for (i=prevBP-1; i>bp; i--) {
    923             if (t->expectedBreaks->elementAti(i) != 0) {
    924                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    925                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    926             }
    927         }
    928 
    929         // Check that the break we did find was expected
    930         if (t->expectedBreaks->elementAti(bp) == 0) {
    931             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    932                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    933         } else {
    934             // The break was expected.
    935             //   Check that the {nnn} tag value is correct.
    936             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    937             if (expectedTagVal == -1) {
    938                 expectedTagVal = 0;
    939             }
    940             int line = t->srcLine->elementAti(bp);
    941             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    942             if (rs != expectedTagVal) {
    943                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
    944                       "          Actual, Expected status = %4d, %4d",
    945                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    946             }
    947         }
    948 
    949         prevBP = bp;
    950     }
    951 
    952     // Verify that there were no missed breaks prior to the last one found
    953     for (i=prevBP-1; i>=0; i--) {
    954         if (t->expectedBreaks->elementAti(i) != 0) {
    955             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    956                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    957         }
    958     }
    959 
    960     // Check isBoundary()
    961     for (i=0; i<t->expectedBreaks->size(); i++) {
    962         UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
    963         UBool boundaryFound    = t->bi->isBoundary(i);
    964         if (boundaryExpected != boundaryFound) {
    965             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
    966                   "        Expected, Actual= %s, %s",
    967                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
    968                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
    969         }
    970     }
    971 
    972     // Check following()
    973     for (i=0; i<t->expectedBreaks->size(); i++) {
    974         int32_t actualBreak = t->bi->following(i);
    975         int32_t expectedBreak = BreakIterator::DONE;
    976         for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
    977             if (t->expectedBreaks->elementAti(j) != 0) {
    978                 expectedBreak = j;
    979                 break;
    980             }
    981         }
    982         if (expectedBreak != actualBreak) {
    983             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
    984                   "        Expected, Actual= %d, %d",
    985                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
    986         }
    987     }
    988 
    989     // Check preceding()
    990     for (i=t->expectedBreaks->size(); i>=0; i--) {
    991         int32_t actualBreak = t->bi->preceding(i);
    992         int32_t expectedBreak = BreakIterator::DONE;
    993 
    994         for (int32_t j=i-1; j >= 0; j--) {
    995             if (t->expectedBreaks->elementAti(j) != 0) {
    996                 expectedBreak = j;
    997                 break;
    998             }
    999         }
   1000         if (expectedBreak != actualBreak) {
   1001             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1002                   "        Expected, Actual= %d, %d",
   1003                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
   1004         }
   1005     }
   1006 }
   1007 
   1008 
   1009 void RBBITest::TestExtended() {
   1010 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1011     UErrorCode      status  = U_ZERO_ERROR;
   1012     Locale          locale("");
   1013 
   1014     UnicodeString       rules;
   1015     TestParams          tp;
   1016     tp.bi             = NULL;
   1017     tp.expectedBreaks = new UVector32(status);
   1018     tp.srcLine        = new UVector32(status);
   1019     tp.srcCol         = new UVector32(status);
   1020 
   1021     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1022     if (U_FAILURE(status)) {
   1023         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1024     }
   1025 
   1026 
   1027     //
   1028     //  Open and read the test data file.
   1029     //
   1030     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1031     char testFileName[1000];
   1032     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1033         errln("Can't open test data.  Path too long.");
   1034         return;
   1035     }
   1036     strcpy(testFileName, testDataDirectory);
   1037     strcat(testFileName, "rbbitst.txt");
   1038 
   1039     int    len;
   1040     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1041     if (U_FAILURE(status)) {
   1042         return; /* something went wrong, error already output */
   1043     }
   1044 
   1045 
   1046 
   1047 
   1048     //
   1049     //  Put the test data into a UnicodeString
   1050     //
   1051     UnicodeString testString(FALSE, testFile, len);
   1052 
   1053     enum EParseState{
   1054         PARSE_COMMENT,
   1055         PARSE_TAG,
   1056         PARSE_DATA,
   1057         PARSE_NUM
   1058     }
   1059     parseState = PARSE_TAG;
   1060 
   1061     EParseState savedState = PARSE_TAG;
   1062 
   1063     static const UChar CH_LF        = 0x0a;
   1064     static const UChar CH_CR        = 0x0d;
   1065     static const UChar CH_HASH      = 0x23;
   1066     /*static const UChar CH_PERIOD    = 0x2e;*/
   1067     static const UChar CH_LT        = 0x3c;
   1068     static const UChar CH_GT        = 0x3e;
   1069     static const UChar CH_BACKSLASH = 0x5c;
   1070     static const UChar CH_BULLET    = 0x2022;
   1071 
   1072     int32_t    lineNum  = 1;
   1073     int32_t    colStart = 0;
   1074     int32_t    column   = 0;
   1075     int32_t    charIdx  = 0;
   1076 
   1077     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1078 
   1079     for (charIdx = 0; charIdx < len; ) {
   1080         status = U_ZERO_ERROR;
   1081         UChar  c = testString.charAt(charIdx);
   1082         charIdx++;
   1083         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1084             // treat CRLF as a unit
   1085             c = CH_LF;
   1086             charIdx++;
   1087         }
   1088         if (c == CH_LF || c == CH_CR) {
   1089             lineNum++;
   1090             colStart = charIdx;
   1091         }
   1092         column = charIdx - colStart + 1;
   1093 
   1094         switch (parseState) {
   1095         case PARSE_COMMENT:
   1096             if (c == 0x0a || c == 0x0d) {
   1097                 parseState = savedState;
   1098             }
   1099             break;
   1100 
   1101         case PARSE_TAG:
   1102             {
   1103             if (c == CH_HASH) {
   1104                 parseState = PARSE_COMMENT;
   1105                 savedState = PARSE_TAG;
   1106                 break;
   1107             }
   1108             if (u_isUWhiteSpace(c)) {
   1109                 break;
   1110             }
   1111             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1112                 delete tp.bi;
   1113                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1114                 charIdx += 5;
   1115                 break;
   1116             }
   1117             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1118                 delete tp.bi;
   1119                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1120                 charIdx += 5;
   1121                 break;
   1122             }
   1123             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1124                 delete tp.bi;
   1125                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1126                 charIdx += 5;
   1127                 break;
   1128             }
   1129             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1130                 delete tp.bi;
   1131                 tp.bi = NULL;
   1132                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1133                 charIdx += 5;
   1134                 break;
   1135             }
   1136             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1137                 delete tp.bi;
   1138                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1139                 charIdx += 6;
   1140                 break;
   1141             }
   1142 
   1143             // <locale  loc_name>
   1144             localeMatcher.reset(testString);
   1145             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1146                 UnicodeString localeName = localeMatcher.group(1, status);
   1147                 char localeName8[100];
   1148                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1149                 locale = Locale::createFromName(localeName8);
   1150                 charIdx += localeMatcher.group(0, status).length() - 1;
   1151                 TEST_ASSERT_SUCCESS(status);
   1152                 break;
   1153             }
   1154             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1155                 parseState = PARSE_DATA;
   1156                 charIdx += 5;
   1157                 tp.dataToBreak = "";
   1158                 tp.expectedBreaks->removeAllElements();
   1159                 tp.srcCol ->removeAllElements();
   1160                 tp.srcLine->removeAllElements();
   1161                 break;
   1162             }
   1163 
   1164             errln("line %d: Tag expected in test file.", lineNum);
   1165             parseState = PARSE_COMMENT;
   1166             savedState = PARSE_DATA;
   1167             goto end_test; // Stop the test.
   1168             }
   1169             break;
   1170 
   1171         case PARSE_DATA:
   1172             if (c == CH_BULLET) {
   1173                 int32_t  breakIdx = tp.dataToBreak.length();
   1174                 tp.expectedBreaks->setSize(breakIdx+1);
   1175                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1176                 tp.srcLine->setSize(breakIdx+1);
   1177                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1178                 tp.srcCol ->setSize(breakIdx+1);
   1179                 tp.srcCol ->setElementAt(column, breakIdx);
   1180                 break;
   1181             }
   1182 
   1183             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1184                 // Add final entry to mappings from break location to source file position.
   1185                 //  Need one extra because last break position returned is after the
   1186                 //    last char in the data, not at the last char.
   1187                 tp.srcLine->addElement(lineNum, status);
   1188                 tp.srcCol ->addElement(column, status);
   1189 
   1190                 parseState = PARSE_TAG;
   1191                 charIdx += 6;
   1192 
   1193                 // RUN THE TEST!
   1194                 executeTest(&tp);
   1195                 break;
   1196             }
   1197 
   1198             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1199                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1200                 // Get the code point from the name and insert it into the test data.
   1201                 //   (Damn, no API takes names in Unicode  !!!
   1202                 //    we've got to take it back to char *)
   1203                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1204                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1205                 char charNameBuf[200];
   1206                 UChar32 theChar = -1;
   1207                 if (nameEndIdx != -1) {
   1208                     UErrorCode status = U_ZERO_ERROR;
   1209                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1210                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1211                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1212                     if (U_FAILURE(status)) {
   1213                         theChar = -1;
   1214                     }
   1215                 }
   1216                 if (theChar == -1) {
   1217                     errln("Error in named character in test file at line %d, col %d",
   1218                         lineNum, column);
   1219                 } else {
   1220                     // Named code point was recognized.  Insert it
   1221                     //   into the test data.
   1222                     tp.dataToBreak.append(theChar);
   1223                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1224                         tp.srcLine->addElement(lineNum, status);
   1225                         tp.srcCol ->addElement(column, status);
   1226                     }
   1227                 }
   1228                 if (nameEndIdx > charIdx) {
   1229                     charIdx = nameEndIdx+1;
   1230 
   1231                 }
   1232                 break;
   1233             }
   1234 
   1235 
   1236 
   1237 
   1238             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1239                 charIdx++;
   1240                 int32_t  breakIdx = tp.dataToBreak.length();
   1241                 tp.expectedBreaks->setSize(breakIdx+1);
   1242                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1243                 tp.srcLine->setSize(breakIdx+1);
   1244                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1245                 tp.srcCol ->setSize(breakIdx+1);
   1246                 tp.srcCol ->setElementAt(column, breakIdx);
   1247                 break;
   1248             }
   1249 
   1250             if (c == CH_LT) {
   1251                 tagValue   = 0;
   1252                 parseState = PARSE_NUM;
   1253                 break;
   1254             }
   1255 
   1256             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1257                 parseState = PARSE_COMMENT;
   1258                 savedState = PARSE_DATA;
   1259                 break;
   1260             }
   1261 
   1262             if (c == CH_BACKSLASH) {
   1263                 // Check for \ at end of line, a line continuation.
   1264                 //     Advance over (discard) the newline
   1265                 UChar32 cp = testString.char32At(charIdx);
   1266                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1267                     // We have a CR LF
   1268                     //  Need an extra increment of the input ptr to move over both of them
   1269                     charIdx++;
   1270                 }
   1271                 if (cp == CH_LF || cp == CH_CR) {
   1272                     lineNum++;
   1273                     colStart = charIdx;
   1274                     charIdx++;
   1275                     break;
   1276                 }
   1277 
   1278                 // Let unescape handle the back slash.
   1279                 cp = testString.unescapeAt(charIdx);
   1280                 if (cp != -1) {
   1281                     // Escape sequence was recognized.  Insert the char
   1282                     //   into the test data.
   1283                     tp.dataToBreak.append(cp);
   1284                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1285                         tp.srcLine->addElement(lineNum, status);
   1286                         tp.srcCol ->addElement(column, status);
   1287                     }
   1288                     break;
   1289                 }
   1290 
   1291 
   1292                 // Not a recognized backslash escape sequence.
   1293                 // Take the next char as a literal.
   1294                 //  TODO:  Should this be an error?
   1295                 c = testString.charAt(charIdx);
   1296                 charIdx = testString.moveIndex32(charIdx, 1);
   1297             }
   1298 
   1299             // Normal, non-escaped data char.
   1300             tp.dataToBreak.append(c);
   1301 
   1302             // Save the mapping from offset in the data to line/column numbers in
   1303             //   the original input file.  Will be used for better error messages only.
   1304             //   If there's an expected break before this char, the slot in the mapping
   1305             //     vector will already be set for this char; don't overwrite it.
   1306             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1307                 tp.srcLine->addElement(lineNum, status);
   1308                 tp.srcCol ->addElement(column, status);
   1309             }
   1310             break;
   1311 
   1312 
   1313         case PARSE_NUM:
   1314             // We are parsing an expected numeric tag value, like <1234>,
   1315             //   within a chunk of data.
   1316             if (u_isUWhiteSpace(c)) {
   1317                 break;
   1318             }
   1319 
   1320             if (c == CH_GT) {
   1321                 // Finished the number.  Add the info to the expected break data,
   1322                 //   and switch parse state back to doing plain data.
   1323                 parseState = PARSE_DATA;
   1324                 if (tagValue == 0) {
   1325                     tagValue = -1;
   1326                 }
   1327                 int32_t  breakIdx = tp.dataToBreak.length();
   1328                 tp.expectedBreaks->setSize(breakIdx+1);
   1329                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1330                 tp.srcLine->setSize(breakIdx+1);
   1331                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1332                 tp.srcCol ->setSize(breakIdx+1);
   1333                 tp.srcCol ->setElementAt(column, breakIdx);
   1334                 break;
   1335             }
   1336 
   1337             if (u_isdigit(c)) {
   1338                 tagValue = tagValue*10 + u_charDigitValue(c);
   1339                 break;
   1340             }
   1341 
   1342             errln("Syntax Error in test file at line %d, col %d",
   1343                 lineNum, column);
   1344             parseState = PARSE_COMMENT;
   1345             goto end_test; // Stop the test
   1346             break;
   1347         }
   1348 
   1349 
   1350         if (U_FAILURE(status)) {
   1351             dataerrln("ICU Error %s while parsing test file at line %d.",
   1352                 u_errorName(status), lineNum);
   1353             status = U_ZERO_ERROR;
   1354             goto end_test; // Stop the test
   1355         }
   1356 
   1357     }
   1358 
   1359 end_test:
   1360     delete tp.bi;
   1361     delete tp.expectedBreaks;
   1362     delete tp.srcLine;
   1363     delete tp.srcCol;
   1364     delete [] testFile;
   1365 #endif
   1366 }
   1367 
   1368 
   1369 //-------------------------------------------------------------------------------
   1370 //
   1371 //  TestDictRules   create a break iterator from source rules that includes a
   1372 //                  dictionary range.   Regression for bug #7130.  Source rules
   1373 //                  do not declare a break iterator type (word, line, sentence, etc.
   1374 //                  but the dictionary code, without a type, would loop.
   1375 //
   1376 //-------------------------------------------------------------------------------
   1377 void RBBITest::TestDictRules() {
   1378     const char *rules =  "$dictionary = [a-z]; \n"
   1379                          "!!forward; \n"
   1380                          "$dictionary $dictionary; \n"
   1381                          "!!reverse; \n"
   1382                          "$dictionary $dictionary; \n";
   1383     const char *text = "aa";
   1384     UErrorCode status = U_ZERO_ERROR;
   1385     UParseError parseError;
   1386 
   1387     RuleBasedBreakIterator bi(rules, parseError, status);
   1388     if (U_SUCCESS(status)) {
   1389         UnicodeString utext = text;
   1390         bi.setText(utext);
   1391         int32_t position;
   1392         int32_t loops;
   1393         for (loops = 0; loops<10; loops++) {
   1394             position = bi.next();
   1395             if (position == RuleBasedBreakIterator::DONE) {
   1396                 break;
   1397             }
   1398         }
   1399         TEST_ASSERT(loops == 1);
   1400     } else {
   1401         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1402     }
   1403 }
   1404 
   1405 
   1406 
   1407 //-------------------------------------------------------------------------------
   1408 //
   1409 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1410 //    return the data in one big UChar * buffer, which the caller must delete.
   1411 //
   1412 //    parameters:
   1413 //          fileName:   the path of the file to read.  It is passed to fopen() unchanged,
   1414 //                      so it must include any directory part.
   1415 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1416 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1417 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1418 //                      Pass NULL for the system default encoding.
   1419 //          status
   1420 //    returns:
   1421 //                      The file data, converted to UChar.
   1422 //                      The caller must delete this when done with
   1423 //                           delete [] theBuffer;
   1424 //
   1425 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1426 //           Move this function to some common place.
   1427 //
   1428 //--------------------------------------------------------------------------------
   1429 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1430     UChar       *retPtr  = NULL;
   1431     char        *fileBuf = NULL;
   1432     UConverter* conv     = NULL;
   1433     FILE        *f       = NULL;
   1434 
   1435     ulen = 0;
   1436     if (U_FAILURE(status)) {
   1437         return retPtr;
   1438     }
   1439 
   1440     //
   1441     //  Open the file.
   1442     //
   1443     f = fopen(fileName, "rb");
   1444     if (f == 0) {
   1445         dataerrln("Error opening test data file %s\n", fileName);
   1446         status = U_FILE_ACCESS_ERROR;
   1447         return NULL;
   1448     }
   1449     //
   1450     //  Read it in
   1451     //
   1452     int   fileSize;
   1453     int   amt_read;
   1454 
   1455     fseek( f, 0, SEEK_END);
   1456     fileSize = ftell(f);
   1457     fileBuf = new char[fileSize];
   1458     fseek(f, 0, SEEK_SET);
   1459     amt_read = fread(fileBuf, 1, fileSize, f);
   1460     if (amt_read != fileSize || fileSize <= 0) {
   1461         errln("Error reading test data file.");
   1462         goto cleanUpAndReturn;
   1463     }
   1464 
   1465     //
   1466     // Look for a Unicode Signature (BOM) on the data just read
   1467     //
   1468     int32_t        signatureLength;
   1469     const char *   fileBufC;
   1470     const char*    bomEncoding;
   1471 
   1472     fileBufC = fileBuf;
   1473     bomEncoding = ucnv_detectUnicodeSignature(
   1474         fileBuf, fileSize, &signatureLength, &status);
   1475     if(bomEncoding!=NULL ){
   1476         fileBufC  += signatureLength;
   1477         fileSize  -= signatureLength;
   1478         encoding = bomEncoding;
   1479     }
   1480 
   1481     //
   1482     // Open a converter to take the rule file to UTF-16
   1483     //
   1484     conv = ucnv_open(encoding, &status);
   1485     if (U_FAILURE(status)) {
   1486         goto cleanUpAndReturn;
   1487     }
   1488 
   1489     //
   1490     // Convert the rules to UChar.
   1491     //  Preflight first to determine required buffer size.
   1492     //
   1493     ulen = ucnv_toUChars(conv,
   1494         NULL,           //  dest,
   1495         0,              //  destCapacity,
   1496         fileBufC,
   1497         fileSize,
   1498         &status);
   1499     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1500         // Buffer Overflow is expected from the preflight operation.
   1501         status = U_ZERO_ERROR;
   1502 
   1503         retPtr = new UChar[ulen+1];
   1504         ucnv_toUChars(conv,
   1505             retPtr,       //  dest,
   1506             ulen+1,
   1507             fileBufC,
   1508             fileSize,
   1509             &status);
   1510     }
   1511 
   1512 cleanUpAndReturn:
   1513     fclose(f);
   1514     delete []fileBuf;
   1515     ucnv_close(conv);
   1516     if (U_FAILURE(status)) {
   1517         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1518         delete []retPtr;
   1519         retPtr = 0;
   1520         ulen   = 0;
   1521     };
   1522     return retPtr;
   1523 }
   1524 
   1525 
   1526 
   1527 //--------------------------------------------------------------------------------------------
   1528 //
   1529 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1530 //
   1531 //-------------------------------------------------------------------------------------------
   1532 void RBBITest::TestUnicodeFiles() {
   1533     RuleBasedBreakIterator  *bi;
   1534     UErrorCode               status = U_ZERO_ERROR;
   1535 
   1536     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1537     TEST_ASSERT_SUCCESS(status);
   1538     if (U_SUCCESS(status)) {
   1539         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1540     }
   1541     delete bi;
   1542 
   1543     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1544     TEST_ASSERT_SUCCESS(status);
   1545     if (U_SUCCESS(status)) {
   1546         runUnicodeTestData("WordBreakTest.txt", bi);
   1547     }
   1548     delete bi;
   1549 
   1550     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1551     TEST_ASSERT_SUCCESS(status);
   1552     if (U_SUCCESS(status)) {
   1553         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1554     }
   1555     delete bi;
   1556 
   1557     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1558     TEST_ASSERT_SUCCESS(status);
   1559     if (U_SUCCESS(status)) {
   1560         runUnicodeTestData("LineBreakTest.txt", bi);
   1561     }
   1562     delete bi;
   1563 }
   1564 
   1565 
   1566 //--------------------------------------------------------------------------------------------
   1567 //
   1568 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1569 //
   1570 //-------------------------------------------------------------------------------------------
   1571 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1573     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
   1574     UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
   1575     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   1576     UErrorCode  status = U_ZERO_ERROR;
   1577 
   1578     //
   1579     //  Open and read the test data file, put it into a UnicodeString.
   1580     //
   1581     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1582     char testFileName[1000];
   1583     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1584         dataerrln("Can't open test data.  Path too long.");
   1585         return;
   1586     }
   1587     strcpy(testFileName, testDataDirectory);
   1588     strcat(testFileName, fileName);
   1589 
   1590     logln("Opening data file %s\n", fileName);
   1591 
   1592     int    len;
   1593     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1594     if (status != U_FILE_ACCESS_ERROR) {
   1595         TEST_ASSERT_SUCCESS(status);
   1596         TEST_ASSERT(testFile != NULL);
   1597     }
   1598     if (U_FAILURE(status) || testFile == NULL) {
   1599         return; /* something went wrong, error already output */
   1600     }
   1601     UnicodeString testFileAsString(TRUE, testFile, len);
   1602 
   1603     //
   1604     //  Parse the test data file using a regular expression.
   1605     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1606     //     is identified by which group had a match.
   1607     //
   1608     //    Caputure Group #                  1          2            3            4           5
   1609     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1610     //
   1611     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1612     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1613     UnicodeString   testString;
   1614     UVector32       breakPositions(status);
   1615     int             lineNumber = 1;
   1616     TEST_ASSERT_SUCCESS(status);
   1617     if (U_FAILURE(status)) {
   1618         return;
   1619     }
   1620 
   1621     //
   1622     //  Scan through each test case, building up the string to be broken in testString,
   1623     //   and the positions that should be boundaries in the breakPositions vector.
   1624     //
   1625     int spin = 0;
   1626     while (tokenMatcher.find()) {
   1627       	if(tokenMatcher.hitEnd()) {
   1628           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1629              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1630              and caused an infinite loop here on EBCDIC systems!
   1631           */
   1632           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1633           //	   return;
   1634       	}
   1635         if (tokenMatcher.start(1, status) >= 0) {
   1636             // Scanned a divide sign, indicating a break position in the test data.
   1637             if (testString.length()>0) {
   1638                 breakPositions.addElement(testString.length(), status);
   1639             }
   1640         }
   1641         else if (tokenMatcher.start(2, status) >= 0) {
   1642             // Scanned an 'x', meaning no break at this position in the test data
   1643             //   Nothing to be done here.
   1644             }
   1645         else if (tokenMatcher.start(3, status) >= 0) {
   1646             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1647             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1648             int length = hexNumber.length();
   1649             if (length<=8) {
   1650                 char buf[10];
   1651                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1652                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1653                 if (c<=0x10ffff) {
   1654                     testString.append(c);
   1655                 } else {
   1656                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1657                        fileName, lineNumber);
   1658                 }
   1659             } else {
   1660                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1661                        fileName, lineNumber);
   1662              }
   1663         }
   1664         else if (tokenMatcher.start(4, status) >= 0) {
   1665             // Scanned to end of a line, possibly skipping over a comment in the process.
   1666             //   If the line from the file contained test data, run the test now.
   1667             //
   1668             if (testString.length() > 0) {
   1669 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
   1670 //             Rule 8
   1671 //                ZW SP* <break>
   1672 //             is not yet implemented.
   1673 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
   1674                                             5202 == lineNumber ||
   1675                                             5214 == lineNumber ||
   1676                                             5246 == lineNumber ||
   1677                                             5298 == lineNumber ||
   1678                                             5302 == lineNumber ))) {
   1679                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1680 }
   1681             }
   1682 
   1683             // Clear out this test case.
   1684             //    The string and breakPositions vector will be refilled as the next
   1685             //       test case is parsed.
   1686             testString.remove();
   1687             breakPositions.removeAllElements();
   1688             lineNumber++;
   1689         } else {
   1690             // Scanner catchall.  Something unrecognized appeared on the line.
   1691             char token[16];
   1692             UnicodeString uToken = tokenMatcher.group(0, status);
   1693             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1694             token[sizeof(token)-1] = 0;
   1695             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1696 
   1697             // Clean up, in preparation for continuing with the next line.
   1698             testString.remove();
   1699             breakPositions.removeAllElements();
   1700             lineNumber++;
   1701         }
   1702         TEST_ASSERT_SUCCESS(status);
   1703         if (U_FAILURE(status)) {
   1704             break;
   1705         }
   1706     }
   1707 
   1708     delete [] testFile;
   1709  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1710 }
   1711 
   1712 //--------------------------------------------------------------------------------------------
   1713 //
   1714 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1715 //                            test data files.  Do only a simple, forward-only check -
   1716 //                            this test is mostly to check that ICU and the Unicode
   1717 //                            data agree with each other.
   1718 //
   1719 //--------------------------------------------------------------------------------------------
   1720 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1721                          const UnicodeString &testString,   // Text data to be broken
   1722                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1723                          RuleBasedBreakIterator *bi) {
   1724     int32_t pos;                 // Break Position in the test string
   1725     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1726     int32_t expectedPos;         // Expected break position (index into test string)
   1727 
   1728     bi->setText(testString);
   1729     pos = bi->first();
   1730     pos = bi->next();
   1731 
   1732     while (pos != BreakIterator::DONE) {
   1733         if (expectedI >= breakPositions->size()) {
   1734             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1735                 testFileName, lineNumber, pos);
   1736             break;
   1737         }
   1738         expectedPos = breakPositions->elementAti(expectedI);
   1739         if (pos < expectedPos) {
   1740             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1741                 testFileName, lineNumber, pos);
   1742             break;
   1743         }
   1744         if (pos > expectedPos) {
   1745             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1746                 testFileName, lineNumber, expectedPos);
   1747             break;
   1748         }
   1749         pos = bi->next();
   1750         expectedI++;
   1751     }
   1752 
   1753     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1754         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1755             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1756     }
   1757 }
   1758 
   1759 
   1760 
   1761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1762 //---------------------------------------------------------------------------------------
   1763 //
   1764 //   classs RBBIMonkeyKind
   1765 //
   1766 //      Monkey Test for Break Iteration
   1767 //      Abstract interface class.   Concrete derived classes independently
   1768 //      implement the break rules for different iterator types.
   1769 //
   1770 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   1771 //      testing, but works purely in terms of the interface defined here.
   1772 //
   1773 //---------------------------------------------------------------------------------------
   1774 class RBBIMonkeyKind {
   1775 public:
   1776     // Return a UVector of UnicodeSets, representing the character classes used
   1777     //   for this type of iterator.
   1778     virtual  UVector  *charClasses() = 0;
   1779 
   1780     // Set the test text on which subsequent calls to next() will operate
   1781     virtual  void      setText(const UnicodeString &s) = 0;
   1782 
   1783     // Find the next break postion, starting from the prev break position, or from zero.
   1784     // Return -1 after reaching end of string.
   1785     virtual  int32_t   next(int32_t i) = 0;
   1786 
   1787     virtual ~RBBIMonkeyKind();
   1788     UErrorCode       deferredStatus;
   1789 
   1790 
   1791 protected:
   1792     RBBIMonkeyKind();
   1793 
   1794 private:
   1795 };
   1796 
   1797 RBBIMonkeyKind::RBBIMonkeyKind() {
   1798     deferredStatus = U_ZERO_ERROR;
   1799 }
   1800 
   1801 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1802 }
   1803 
   1804 
   1805 //----------------------------------------------------------------------------------------
   1806 //
   1807 //   Random Numbers.  Similar to standard lib rand() and srand()
   1808 //                    Not using library to
   1809 //                      1.  Get same results on all platforms.
   1810 //                      2.  Get access to current seed, to more easily reproduce failures.
   1811 //
   1812 //---------------------------------------------------------------------------------------
   1813 static uint32_t m_seed = 1;
   1814 
   1815 static uint32_t m_rand()
   1816 {
   1817     m_seed = m_seed * 1103515245 + 12345;
   1818     return (uint32_t)(m_seed/65536) % 32768;
   1819 }
   1820 
   1821 
   1822 //------------------------------------------------------------------------------------------
   1823 //
   1824 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1825 //                             of RBBIMonkeyKind.
   1826 //
   1827 //------------------------------------------------------------------------------------------
   1828 class RBBICharMonkey: public RBBIMonkeyKind {
   1829 public:
   1830     RBBICharMonkey();
   1831     virtual          ~RBBICharMonkey();
   1832     virtual  UVector *charClasses();
   1833     virtual  void     setText(const UnicodeString &s);
   1834     virtual  int32_t  next(int32_t i);
   1835 private:
   1836     UVector   *fSets;
   1837 
   1838     UnicodeSet  *fCRLFSet;
   1839     UnicodeSet  *fControlSet;
   1840     UnicodeSet  *fExtendSet;
   1841     UnicodeSet  *fRegionalIndicatorSet;
   1842     UnicodeSet  *fPrependSet;
   1843     UnicodeSet  *fSpacingSet;
   1844     UnicodeSet  *fLSet;
   1845     UnicodeSet  *fVSet;
   1846     UnicodeSet  *fTSet;
   1847     UnicodeSet  *fLVSet;
   1848     UnicodeSet  *fLVTSet;
   1849     UnicodeSet  *fHangulSet;
   1850     UnicodeSet  *fAnySet;
   1851 
   1852     const UnicodeString *fText;
   1853 };
   1854 
   1855 
   1856 RBBICharMonkey::RBBICharMonkey() {
   1857     UErrorCode  status = U_ZERO_ERROR;
   1858 
   1859     fText = NULL;
   1860 
   1861     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   1862     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   1863     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   1864     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   1865     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   1866     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   1867     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   1868     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   1869     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   1870     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   1871     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   1872     fHangulSet  = new UnicodeSet();
   1873     fHangulSet->addAll(*fLSet);
   1874     fHangulSet->addAll(*fVSet);
   1875     fHangulSet->addAll(*fTSet);
   1876     fHangulSet->addAll(*fLVSet);
   1877     fHangulSet->addAll(*fLVTSet);
   1878     fAnySet     = new UnicodeSet(0, 0x10ffff);
   1879 
   1880     fSets       = new UVector(status);
   1881     fSets->addElement(fCRLFSet,    status);
   1882     fSets->addElement(fControlSet, status);
   1883     fSets->addElement(fExtendSet,  status);
   1884     fSets->addElement(fRegionalIndicatorSet, status);
   1885     if (!fPrependSet->isEmpty()) {
   1886         fSets->addElement(fPrependSet, status);
   1887     }
   1888     fSets->addElement(fSpacingSet, status);
   1889     fSets->addElement(fHangulSet,  status);
   1890     fSets->addElement(fAnySet,     status);
   1891     if (U_FAILURE(status)) {
   1892         deferredStatus = status;
   1893     }
   1894 }
   1895 
   1896 
   1897 void RBBICharMonkey::setText(const UnicodeString &s) {
   1898     fText = &s;
   1899 }
   1900 
   1901 
   1902 
   1903 int32_t RBBICharMonkey::next(int32_t prevPos) {
   1904     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   1905                               //   break position being tested.  The candidate break
   1906                               //   location is before p2.
   1907 
   1908     int     breakPos = -1;
   1909 
   1910     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   1911 
   1912     if (U_FAILURE(deferredStatus)) {
   1913         return -1;
   1914     }
   1915 
   1916     // Previous break at end of string.  return DONE.
   1917     if (prevPos >= fText->length()) {
   1918         return -1;
   1919     }
   1920     p0 = p1 = p2 = p3 = prevPos;
   1921     c3 =  fText->char32At(prevPos);
   1922     c0 = c1 = c2 = 0;
   1923 
   1924     // Loop runs once per "significant" character position in the input text.
   1925     for (;;) {
   1926         // Move all of the positions forward in the input string.
   1927         p0 = p1;  c0 = c1;
   1928         p1 = p2;  c1 = c2;
   1929         p2 = p3;  c2 = c3;
   1930 
   1931         // Advancd p3 by one codepoint
   1932         p3 = fText->moveIndex32(p3, 1);
   1933         c3 = fText->char32At(p3);
   1934 
   1935         if (p1 == p2) {
   1936             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   1937             continue;
   1938         }
   1939         if (p2 == fText->length()) {
   1940             // Reached end of string.  Always a break position.
   1941             break;
   1942         }
   1943 
   1944         // Rule  GB3   CR x LF
   1945         //     No Extend or Format characters may appear between the CR and LF,
   1946         //     which requires the additional check for p2 immediately following p1.
   1947         //
   1948         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   1949             continue;
   1950         }
   1951 
   1952         // Rule (GB4).   ( Control | CR | LF ) <break>
   1953         if (fControlSet->contains(c1) ||
   1954             c1 == 0x0D ||
   1955             c1 == 0x0A)  {
   1956             break;
   1957         }
   1958 
   1959         // Rule (GB5)    <break>  ( Control | CR | LF )
   1960         //
   1961         if (fControlSet->contains(c2) ||
   1962             c2 == 0x0D ||
   1963             c2 == 0x0A)  {
   1964             break;
   1965         }
   1966 
   1967 
   1968         // Rule (GB6)  L x ( L | V | LV | LVT )
   1969         if (fLSet->contains(c1) &&
   1970                (fLSet->contains(c2)  ||
   1971                 fVSet->contains(c2)  ||
   1972                 fLVSet->contains(c2) ||
   1973                 fLVTSet->contains(c2))) {
   1974             continue;
   1975         }
   1976 
   1977         // Rule (GB7)    ( LV | V )  x  ( V | T )
   1978         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   1979             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   1980             continue;
   1981         }
   1982 
   1983         // Rule (GB8)    ( LVT | T)  x T
   1984         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   1985             fTSet->contains(c2))  {
   1986             continue;
   1987         }
   1988 
   1989         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
   1990         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   1991             continue;
   1992         }
   1993 
   1994         // Rule (GB9)    Numeric x ALetter
   1995         if (fExtendSet->contains(c2))  {
   1996             continue;
   1997         }
   1998 
   1999         // Rule (GB9a)   x  SpacingMark
   2000         if (fSpacingSet->contains(c2)) {
   2001             continue;
   2002         }
   2003 
   2004         // Rule (GB9b)   Prepend x
   2005         if (fPrependSet->contains(c1)) {
   2006             continue;
   2007         }
   2008 
   2009         // Rule (GB10)  Any  <break>  Any
   2010         break;
   2011     }
   2012 
   2013     breakPos = p2;
   2014     return breakPos;
   2015 }
   2016 
   2017 
   2018 
   2019 UVector  *RBBICharMonkey::charClasses() {
   2020     return fSets;
   2021 }
   2022 
   2023 
   2024 RBBICharMonkey::~RBBICharMonkey() {
   2025     delete fSets;
   2026     delete fCRLFSet;
   2027     delete fControlSet;
   2028     delete fExtendSet;
   2029     delete fRegionalIndicatorSet;
   2030     delete fPrependSet;
   2031     delete fSpacingSet;
   2032     delete fLSet;
   2033     delete fVSet;
   2034     delete fTSet;
   2035     delete fLVSet;
   2036     delete fLVTSet;
   2037     delete fHangulSet;
   2038     delete fAnySet;
   2039 }
   2040 
   2041 //------------------------------------------------------------------------------------------
   2042 //
   2043 //   class RBBIWordMonkey      Word Break specific implementation
   2044 //                             of RBBIMonkeyKind.
   2045 //
   2046 //------------------------------------------------------------------------------------------
   2047 class RBBIWordMonkey: public RBBIMonkeyKind {
   2048 public:
   2049     RBBIWordMonkey();
   2050     virtual          ~RBBIWordMonkey();
   2051     virtual  UVector *charClasses();
   2052     virtual  void     setText(const UnicodeString &s);
   2053     virtual int32_t   next(int32_t i);
   2054 private:
   2055     UVector      *fSets;
   2056 
   2057     UnicodeSet  *fCRSet;
   2058     UnicodeSet  *fLFSet;
   2059     UnicodeSet  *fNewlineSet;
   2060     UnicodeSet  *fKatakanaSet;
   2061     UnicodeSet  *fALetterSet;
   2062     // TODO(jungshik): Do we still need this change?
   2063     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   2064     UnicodeSet  *fMidNumLetSet;
   2065     UnicodeSet  *fMidLetterSet;
   2066     UnicodeSet  *fMidNumSet;
   2067     UnicodeSet  *fNumericSet;
   2068     UnicodeSet  *fFormatSet;
   2069     UnicodeSet  *fOtherSet;
   2070     UnicodeSet  *fExtendSet;
   2071     UnicodeSet  *fExtendNumLetSet;
   2072     UnicodeSet  *fRegionalIndicatorSet;
   2073     UnicodeSet  *fDictionaryCjkSet;
   2074 
   2075     RegexMatcher  *fMatcher;
   2076 
   2077     const UnicodeString  *fText;
   2078 };
   2079 
   2080 
   2081 RBBIWordMonkey::RBBIWordMonkey()
   2082 {
   2083     UErrorCode  status = U_ZERO_ERROR;
   2084 
   2085     fSets            = new UVector(status);
   2086 
   2087     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2088     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2089     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2090     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
   2091     // Exclude Hangul syllables from ALetterSet during testing.
   2092     // Leave CJK dictionary characters out from the monkey tests!
   2093 #if 0
   2094     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2095                                       "[\\p{Line_Break = Complex_Context}"
   2096                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2097                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2098                                       "]]",
   2099                                       status);
   2100 #endif
   2101     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2102     fALetterSet->removeAll(*fDictionaryCjkSet);
   2103     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2104     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2105     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2106     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2107     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
   2108     // we should figure out why
   2109     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2110     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2111     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2112     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2113     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2114 
   2115     fOtherSet        = new UnicodeSet();
   2116     if(U_FAILURE(status)) {
   2117       deferredStatus = status;
   2118       return;
   2119     }
   2120 
   2121     fOtherSet->complement();
   2122     fOtherSet->removeAll(*fCRSet);
   2123     fOtherSet->removeAll(*fLFSet);
   2124     fOtherSet->removeAll(*fNewlineSet);
   2125     fOtherSet->removeAll(*fKatakanaSet);
   2126     fOtherSet->removeAll(*fALetterSet);
   2127     fOtherSet->removeAll(*fMidLetterSet);
   2128     fOtherSet->removeAll(*fMidNumSet);
   2129     fOtherSet->removeAll(*fNumericSet);
   2130     fOtherSet->removeAll(*fExtendNumLetSet);
   2131     fOtherSet->removeAll(*fFormatSet);
   2132     fOtherSet->removeAll(*fExtendSet);
   2133     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2134     // Inhibit dictionary characters from being tested at all.
   2135     fOtherSet->removeAll(*fDictionaryCjkSet);
   2136     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2137 
   2138     fSets->addElement(fCRSet,        status);
   2139     fSets->addElement(fLFSet,        status);
   2140     fSets->addElement(fNewlineSet,   status);
   2141     fSets->addElement(fALetterSet,   status);
   2142     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
   2143     fSets->addElement(fMidLetterSet, status);
   2144     fSets->addElement(fMidNumLetSet, status);
   2145     fSets->addElement(fMidNumSet,    status);
   2146     fSets->addElement(fNumericSet,   status);
   2147     fSets->addElement(fFormatSet,    status);
   2148     fSets->addElement(fExtendSet,    status);
   2149     fSets->addElement(fOtherSet,     status);
   2150     fSets->addElement(fExtendNumLetSet, status);
   2151     fSets->addElement(fRegionalIndicatorSet, status);
   2152 
   2153     if (U_FAILURE(status)) {
   2154         deferredStatus = status;
   2155     }
   2156 }
   2157 // setText()  Record (by address, without copying) the text that next() will examine.
   2158 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2159     this->fText = &s;    // s is not owned here and must outlive any later next() call
   2160 }
   2161 // next()   Hand-written model of the word break rules.  Scans fText forward from prevPos and
   2162 //          returns the index of the first boundary found, or -1 if there is none to report.
   2163 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2164     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2165                               //   break position being tested.  The candidate break
   2166                               //   location is before p2.  (p0 is tracked but never read;
   2167                               //   only its code point c0 takes part in the rules.)
   2168     int     breakPos = -1;
   2169 
   2170     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2171 
   2172     if (U_FAILURE(deferredStatus)) {
   2173         return -1;            // A failure was recorded earlier; do not attempt to scan.
   2174     }
   2175 
   2176     // Previous break was at (or past) the end of the string.  Return -1, meaning "done".
   2177     if (prevPos >= fText->length()) {
   2178         return -1;
   2179     }
   2180     p0 = p1 = p2 = p3 = prevPos;
   2181     c3 =  fText->char32At(prevPos);
   2182     c0 = c1 = c2 = 0;
   2183 
   2184     // Loop runs once per "significant" character position in the input text.
   2185     for (;;) {
   2186         // Move all of the positions forward in the input string.
   2187         p0 = p1;  c0 = c1;
   2188         p1 = p2;  c1 = c2;
   2189         p2 = p3;  c2 = c3;
   2190 
   2191         // Advance p3 by    X(Extend | Format)*   Rule 4
   2192         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2193         do {
   2194             p3 = fText->moveIndex32(p3, 1);
   2195             c3 = fText->char32At(p3);
   2196             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2197                break;
   2198             };
   2199         }
   2200         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2201 
   2202 
   2203         if (p1 == p2) {
   2204             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2205             continue;
   2206         }
   2207         if (p2 == fText->length()) {
   2208             // Reached end of string.  Always a break position.
   2209             break;
   2210         }
   2211 
   2212         // Rule  (3)   CR x LF
   2213         //     No Extend or Format characters may appear between the CR and LF.  There is no
   2214         //     explicit p2==p1+1 test here; the advance loop above stops after one code point
   2215         //     whenever c2 is in fCRSet, so nothing is skipped past such a character.
   2216         if (c1==0x0D && c2==0x0A) {
   2217             continue;
   2218         }
   2219 
   2220         // Rule (3a)  Break before and after newlines (including CR and LF)
   2221         //     First test: the character before the candidate.  Second test: the one after it.
   2222         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2223             break;
   2224         };
   2225         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2226             break;
   2227         };
   2228 
   2229         // Rule (5).   ALetter x ALetter
   2230         if (fALetterSet->contains(c1) &&
   2231             fALetterSet->contains(c2))  {
   2232             continue;
   2233         }
   2234 
   2235         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2236         //     Needs one character of look-ahead (c3) beyond the candidate position.
   2237         if ( fALetterSet->contains(c1)   &&
   2238              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2239              fALetterSet->contains(c3)) {
   2240             continue;
   2241         }
   2242 
   2243 
   2244         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2245         if (fALetterSet->contains(c0) &&
   2246             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2247             fALetterSet->contains(c2)) {
   2248             continue;
   2249         }
   2250 
   2251         // Rule (8)    Numeric x Numeric
   2252         if (fNumericSet->contains(c1) &&
   2253             fNumericSet->contains(c2))  {
   2254             continue;
   2255         }
   2256 
   2257         // Rule (9)    ALetter x Numeric
   2258         if (fALetterSet->contains(c1) &&
   2259             fNumericSet->contains(c2))  {
   2260             continue;
   2261         }
   2262 
   2263         // Rule (10)    Numeric x ALetter
   2264         if (fNumericSet->contains(c1) &&
   2265             fALetterSet->contains(c2))  {
   2266             continue;
   2267         }
   2268 
   2269         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2270         if (fNumericSet->contains(c0) &&
   2271             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2272             fNumericSet->contains(c2)) {
   2273             continue;
   2274         }
   2275 
   2276         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2277         if (fNumericSet->contains(c1) &&
   2278             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2279             fNumericSet->contains(c3)) {
   2280             continue;
   2281         }
   2282 
   2283         // Rule (13)  Katakana x Katakana
   2284         if (fKatakanaSet->contains(c1) &&
   2285             fKatakanaSet->contains(c2))  {
   2286             continue;
   2287         }
   2288 
   2289         // Rule 13a   (ALetter | Numeric | Katakana | ExtendNumLet)  x  ExtendNumLet
   2290         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2291              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2292              fExtendNumLetSet->contains(c2)) {
   2293                 continue;
   2294         }
   2295 
   2296         // Rule 13b   ExtendNumLet  x  (ALetter | Numeric | Katakana)
   2297         if (fExtendNumLetSet->contains(c1) &&
   2298                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2299                 fKatakanaSet->contains(c2)))  {
   2300                 continue;
   2301         }
   2302 
   2303         // Rule 13c   Regional_Indicator  x  Regional_Indicator
   2304         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2305             continue;
   2306         }
   2307 
   2308         // Rule 14.  Break found here.
   2309         break;
   2310     }
   2311 
   2312     breakPos = p2;            // Every exit from the loop reports the position of p2.
   2313     return breakPos;
   2314 }
   2315 
   2316 // charClasses()  Hand out the list of character-class sets; the list stays owned by this object.
   2317 UVector  *RBBIWordMonkey::charClasses() {
   2318     return this->fSets;
   2319 }
   2320 
   2321 // Destructor.  Frees the list of character classes and every character-class set member.
   2322 RBBIWordMonkey::~RBBIWordMonkey() {
   2323     // The list returned by charClasses().
   2324     delete fSets;
   2325     // CR, LF and Newline.
   2326     delete fCRSet;            delete fLFSet;            delete fNewlineSet;
   2327     // Katakana and ALetter.
   2328     delete fKatakanaSet;      delete fALetterSet;
   2329     // MidNumLet, MidLetter and MidNum.
   2330     delete fMidNumLetSet;     delete fMidLetterSet;     delete fMidNumSet;
   2331     // Numeric.
   2332     delete fNumericSet;
   2333     // Format and Extend, the classes skipped while advancing in next().
   2334     delete fFormatSet;        delete fExtendSet;
   2335     // ExtendNumLet and Regional_Indicator.
   2336     delete fExtendNumLetSet;  delete fRegionalIndicatorSet;
   2337     // Dictionary CJK characters and the complement-based "other" class.
   2338     delete fDictionaryCjkSet; delete fOtherSet;
   2339 }
   2340 
   2341 
   2342 
   2343 
   2344 //------------------------------------------------------------------------------------------
   2345 //
   2346 //   class RBBISentMonkey      Sentence Break specific implementation
   2347 //                             of RBBIMonkeyKind.
   2348 //
   2349 //------------------------------------------------------------------------------------------
   2350 class RBBISentMonkey: public RBBIMonkeyKind {
   2351 public:
   2352     RBBISentMonkey();                                   // Builds all of the sets below.
   2353     virtual          ~RBBISentMonkey();                 // Deletes fSets and every set below.
   2354     virtual  UVector *charClasses();                    // Returns fSets.
   2355     virtual  void     setText(const UnicodeString &s);  // Stores the address of s in fText.
   2356     virtual int32_t   next(int32_t i);                  // Next sentence boundary after i, or -1.
   2357 private:
   2358     int               moveBack(int posFrom);     // Previous significant position (skips Extend/Format).
   2359     int               moveForward(int posFrom);  // Next significant position (skips Extend/Format).
   2360     UChar32           cAt(int pos);              // Code point at pos, or -1 when pos is out of range.
   2361 
   2362     UVector      *fSets;         // The character-class sets, in the order the constructor adds them.
   2363 
   2364     UnicodeSet  *fSepSet;        // Sentence_Break = Sep, plus U+000A and U+000D.
   2365     UnicodeSet  *fFormatSet;     // Sentence_Break = Format
   2366     UnicodeSet  *fSpSet;         // Sentence_Break = Sp
   2367     UnicodeSet  *fLowerSet;      // Sentence_Break = Lower
   2368     UnicodeSet  *fUpperSet;      // Sentence_Break = Upper
   2369     UnicodeSet  *fOLetterSet;    // Sentence_Break = OLetter
   2370     UnicodeSet  *fNumericSet;    // Sentence_Break = Numeric
   2371     UnicodeSet  *fATermSet;      // Sentence_Break = ATerm
   2372     UnicodeSet  *fSContinueSet;  // Sentence_Break = SContinue
   2373     UnicodeSet  *fSTermSet;      // Sentence_Break = STerm
   2374     UnicodeSet  *fCloseSet;      // Sentence_Break = Close
   2375     UnicodeSet  *fOtherSet;      // Everything that is in none of the other sets listed here.
   2376     UnicodeSet  *fExtendSet;     // Sentence_Break = Extend
   2377 
   2378     const UnicodeString  *fText; // Text under test; set by setText() and not deleted by this class.
   2379 
   2380 };
   2381 // Constructor.  Builds the sentence-break character classes; a failure is kept in deferredStatus.
   2382 RBBISentMonkey::RBBISentMonkey()
   2383 {
   2384     UErrorCode  ec = U_ZERO_ERROR;
   2385 
   2386     fSets = new UVector(ec);
   2387 
   2388     //  Separator set:  starting with Unicode 5.1, CR and LF are character classes of their own
   2389     //                  rather than part of Sep.  This model keeps them inside fSepSet, because
   2390     //                  Sep always appears together with CR and LF in the rules.
   2391     fSepSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), ec);
   2392     fFormatSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), ec);
   2393     fSpSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), ec);
   2394     fLowerSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), ec);
   2395     fUpperSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), ec);
   2396     fOLetterSet   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), ec);
   2397     fNumericSet   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), ec);
   2398     fATermSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), ec);
   2399     fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), ec);
   2400     fSTermSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), ec);
   2401     fCloseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), ec);
   2402     fExtendSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), ec);
   2403     fOtherSet     = new UnicodeSet();
   2404 
   2405     if (U_FAILURE(ec)) {
   2406         deferredStatus = ec;     // next() checks this and refuses to run
   2407         return;
   2408     }
   2409 
   2410     // "Other" starts as every code point and loses the members of each class built above.
   2411     fOtherSet->complement()
   2412         .removeAll(*fSepSet)
   2413         .removeAll(*fFormatSet)
   2414         .removeAll(*fSpSet)
   2415         .removeAll(*fLowerSet)
   2416         .removeAll(*fUpperSet)
   2417         .removeAll(*fOLetterSet)
   2418         .removeAll(*fNumericSet)
   2419         .removeAll(*fATermSet)
   2420         .removeAll(*fSContinueSet)
   2421         .removeAll(*fSTermSet)
   2422         .removeAll(*fCloseSet)
   2423         .removeAll(*fExtendSet);
   2424 
   2425     // Publish the classes through fSets; entries are appended in exactly this order.
   2426     UnicodeSet * const classList[] = {
   2427         fSepSet,    fFormatSet,    fSpSet,      fLowerSet,
   2428         fUpperSet,  fOLetterSet,   fNumericSet,
   2429         fATermSet,  fSContinueSet, fSTermSet,
   2430         fCloseSet,  fOtherSet,     fExtendSet
   2431     };
   2432     const int32_t classCount = (int32_t)(sizeof(classList) / sizeof(classList[0]));
   2433     for (int32_t n = 0; n < classCount; ++n) {
   2434         fSets->addElement(classList[n], ec);
   2435     }
   2436 
   2437     // A failure while filling the list is recorded the same way as a construction failure.
   2438     if (U_FAILURE(ec)) {
   2439         deferredStatus = ec;
   2440     }
   2441 }
   2442 
   2443 
   2444 // setText()  Record (by address, without copying) the text that next() will examine.
   2445 void RBBISentMonkey::setText(const UnicodeString &s) {
   2446     this->fText = &s;    // s is not owned here and must outlive any later next() call
   2447 }
   2448 // charClasses()  Hand out the list of character-class sets; the list stays owned by this object.
   2449 UVector  *RBBISentMonkey::charClasses() {
   2450     return this->fSets;
   2451 }
   2452 
   2453 
   2454 //  moveBack()   Return the index of the "significant" code point that precedes index i,
   2455 //               stepping backwards over ($Extend | $Format)* .  Returns -1 when i <= 0
   2456 //               and stops stepping once index 0 has been reached.
   2457 int RBBISentMonkey::moveBack(int i) {
   2458     if (i <= 0) {
   2459         return -1;
   2460     }
   2461 
   2462     // Take one step back unconditionally, then keep going while the character is ignorable.
   2463     int32_t pos = fText->moveIndex32(i, -1);
   2464     UChar32 cp  = fText->char32At(pos);
   2465     while (pos > 0 && (fFormatSet->contains(cp) || fExtendSet->contains(cp))) {
   2466         pos = fText->moveIndex32(pos, -1);
   2467         cp  = fText->char32At(pos);
   2468     }
   2469     return pos;
   2470 }
   2471 
   2472 //  moveForward()  Index of the next "significant" code point after i, or the text length at the end.
   2473 int RBBISentMonkey::moveForward(int i) {
   2474     if (i >= fText->length()) {
   2475         return fText->length();
   2476     }
   2477     // Take one step forward unconditionally, then skip any Format or Extend characters.
   2478     int32_t pos = fText->moveIndex32(i, 1);
   2479     UChar32 cp  = cAt(pos);
   2480     while (fFormatSet->contains(cp) || fExtendSet->contains(cp)) {
   2481         pos = fText->moveIndex32(pos, 1);
   2482         cp  = cAt(pos);
   2483     }
   2484     return pos;
   2485 }
   2486 //  cAt()   Bounds-checked access: the code point at pos, or -1 when pos lies outside fText.
   2487 UChar32 RBBISentMonkey::cAt(int pos) {
   2488     const UBool inRange = (pos >= 0 && pos < fText->length());
   2489     if (inRange) {
   2490         return fText->char32At(pos);
   2491     }
   2492     return -1;
   2493 }
   2494 // next()  Hand-written model of the sentence break rules: first boundary after prevPos, or -1.
   2495 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2496     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2497                               //   break position being tested.  The candidate break
   2498                               //   location is before p2.
   2499                               //   (p0 is tracked but never read; only c0 is consulted.)
   2500     int     breakPos = -1;
   2501 
   2502     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2503     UChar32 c;                // Scratch code point used by the look-behind / look-ahead rules.
   2504 
   2505     if (U_FAILURE(deferredStatus)) {
   2506         return -1;            // A failure was recorded earlier; do not attempt to scan.
   2507     }
   2508 
   2509     // Previous break was at (or past) the end of the string.  Return -1, meaning "done".
   2510     if (prevPos >= fText->length()) {
   2511         return -1;
   2512     }
   2513     p0 = p1 = p2 = p3 = prevPos;
   2514     c3 =  fText->char32At(prevPos);
   2515     c0 = c1 = c2 = 0;
   2516 
   2517     // Loop runs once per "significant" character position in the input text.
   2518     for (;;) {
   2519         // Move all of the positions forward in the input string.
   2520         p0 = p1;  c0 = c1;
   2521         p1 = p2;  c1 = c2;
   2522         p2 = p3;  c2 = c3;
   2523 
   2524         // Advance p3 by    X(Extend | Format)*   Rule 4
   2525         p3 = moveForward(p3);
   2526         c3 = cAt(p3);
   2527 
   2528         // Rule (3)  CR x LF    (only when the LF immediately follows the CR)
   2529         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2530             continue;
   2531         }
   2532 
   2533         // Rule (4).   Sep  <break>
   2534         if (fSepSet->contains(c1)) {
   2535             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2536             break;
   2537         }
   2538 
   2539         if (p2 >= fText->length()) {
   2540             // Reached end of string.  Always a break position.
   2541             break;
   2542         }
   2543 
   2544         if (p2 == prevPos) {
   2545             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2546             continue;
   2547         }
   2548 
   2549         // Rule (6).   ATerm x Numeric
   2550         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2551             continue;
   2552         }
   2553 
   2554         // Rule (7).  Upper ATerm  x  Upper
   2555         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2556             continue;
   2557         }
   2558 
   2559         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2560         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2561         //                  note to the Unicode 5.0 documents.
   2562         int p8 = p1;          // Look behind from p1: skip Sp*, then Close*, then expect an ATerm.
   2563         while (fSpSet->contains(cAt(p8))) {
   2564             p8 = moveBack(p8);
   2565         }
   2566         while (fCloseSet->contains(cAt(p8))) {
   2567             p8 = moveBack(p8);
   2568         }
   2569         if (fATermSet->contains(cAt(p8))) {
   2570             p8=p2;            // Now look ahead from p2 for the first character that ends the run.
   2571             for (;;) {
   2572                 c = cAt(p8);
   2573                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2574                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2575                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2576                     break;
   2577                 }
   2578                 p8 = moveForward(p8);
   2579             }
   2580             if (fLowerSet->contains(cAt(p8))) {
   2581                 continue;
   2582             }
   2583         }
   2584 
   2585         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2586         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2587             p8 = p1;
   2588             while (fSpSet->contains(cAt(p8))) {
   2589                 p8 = moveBack(p8);
   2590             }
   2591             while (fCloseSet->contains(cAt(p8))) {
   2592                 p8 = moveBack(p8);
   2593             }
   2594             c = cAt(p8);
   2595             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2596                 continue;
   2597             }
   2598         }
   2599 
   2600         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2601         int p9 = p1;          // Look behind from p1 over Close* only.
   2602         while (fCloseSet->contains(cAt(p9))) {
   2603             p9 = moveBack(p9);
   2604         }
   2605         c = cAt(p9);
   2606         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2607             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2608                 continue;
   2609             }
   2610         }
   2611 
   2612         // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2613         int p10 = p1;         // Look behind from p1 over Sp*, then Close*.
   2614         while (fSpSet->contains(cAt(p10))) {
   2615             p10 = moveBack(p10);
   2616         }
   2617         while (fCloseSet->contains(cAt(p10))) {
   2618             p10 = moveBack(p10);
   2619         }
   2620         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2621             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2622                 continue;
   2623             }
   2624         }
   2625 
   2626         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2627         int p11 = p1;         // Look behind from p1 over one optional Sep, then Sp*, then Close*.
   2628         if (fSepSet->contains(cAt(p11))) {
   2629             p11 = moveBack(p11);
   2630         }
   2631         while (fSpSet->contains(cAt(p11))) {
   2632             p11 = moveBack(p11);
   2633         }
   2634         while (fCloseSet->contains(cAt(p11))) {
   2635             p11 = moveBack(p11);
   2636         }
   2637         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2638             break;
   2639         }
   2640 
   2641         //  Rule (12)  Any x Any
   2642         continue;
   2643     }
   2644     breakPos = p2;            // Every exit from the loop reports the position of p2.
   2645     return breakPos;
   2646 }
   2647 // Destructor.  Frees the list of character classes and every UnicodeSet member.
   2648 RBBISentMonkey::~RBBISentMonkey() {
   2649     delete fSets;
   2650 
   2651     // Each UnicodeSet member is deleted exactly once.  The table lists them in the
   2652     // order in which they are released.
   2653     UnicodeSet * const ownedSets[] = {
   2654         fSepSet,     fFormatSet,     fSpSet,       fLowerSet,
   2655         fUpperSet,   fOLetterSet,    fNumericSet,
   2656         fATermSet,   fSContinueSet,  fSTermSet,
   2657         fCloseSet,   fOtherSet,      fExtendSet
   2658     };
   2659     const int32_t ownedCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
   2660     for (int32_t k = 0; k < ownedCount; ++k) {
   2661         delete ownedSets[k];
   2662     }
   2663 }
   2664 
   2665 
   2666 
   2667 //-------------------------------------------------------------------------------------------
   2668 //
   2669 //  RBBILineMonkey
   2670 //
   2671 //-------------------------------------------------------------------------------------------
   2672 
   2673 class RBBILineMonkey: public RBBIMonkeyKind {
   2674 public:
   2675     RBBILineMonkey();
   2676     virtual          ~RBBILineMonkey();
   2677     virtual  UVector *charClasses();
   2678     virtual  void     setText(const UnicodeString &s);
   2679     virtual  int32_t  next(int32_t i);
   2680     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2681 private:
   2682     UVector      *fSets;   // The character-class sets, in the order the constructor adds them.
   2683                            // Each set below is built from the Line_Break value in its name, except where noted.
   2684     UnicodeSet  *fBK;
   2685     UnicodeSet  *fCR;
   2686     UnicodeSet  *fLF;
   2687     UnicodeSet  *fCM;
   2688     UnicodeSet  *fNL;
   2689     UnicodeSet  *fSG;      // Built from the code point range U+D800..U+DFFF, not from a property.
   2690     UnicodeSet  *fWJ;
   2691     UnicodeSet  *fZW;
   2692     UnicodeSet  *fGL;
   2693     UnicodeSet  *fCB;
   2694     UnicodeSet  *fSP;
   2695     UnicodeSet  *fB2;
   2696     UnicodeSet  *fBA;
   2697     UnicodeSet  *fBB;
   2698     UnicodeSet  *fHY;
   2699     UnicodeSet  *fH2;
   2700     UnicodeSet  *fH3;
   2701     UnicodeSet  *fCL;
   2702     UnicodeSet  *fCP;
   2703     UnicodeSet  *fEX;
   2704     UnicodeSet  *fIN;
   2705     UnicodeSet  *fJL;
   2706     UnicodeSet  *fJV;
   2707     UnicodeSet  *fJT;
   2708     UnicodeSet  *fNS;      // The constructor also adds every member of fCJ to this set.
   2709     UnicodeSet  *fOP;
   2710     UnicodeSet  *fQU;
   2711     UnicodeSet  *fIS;
   2712     UnicodeSet  *fNU;
   2713     UnicodeSet  *fPO;
   2714     UnicodeSet  *fPR;
   2715     UnicodeSet  *fSY;
   2716     UnicodeSet  *fAI;
   2717     UnicodeSet  *fAL;      // The constructor also adds the members of fXX, fAI, fSA and fSG to this set.
   2718     UnicodeSet  *fCJ;
   2719     UnicodeSet  *fHL;
   2720     UnicodeSet  *fID;
   2721     UnicodeSet  *fRI;
   2722     UnicodeSet  *fSA;
   2723     UnicodeSet  *fXX;
   2724 
   2725     BreakIterator  *fCharBI;              // Character break iterator; setText() passes the text to it.
   2726 
   2727     const UnicodeString  *fText;          // Text under test; set by setText().
   2728     int32_t              *fOrigPositions; // NOTE(review): never pointed at a real object in the constructor or setText(); confirm it is still needed.
   2729 
   2730     RegexMatcher         *fNumberMatcher; // Matcher for the number pattern built in the constructor; reset by setText().
   2731     RegexMatcher         *fLB11Matcher;   // NOTE(review): never pointed at a real object in the constructor or setText(); confirm it is still needed.
   2732 };
   2733 
   2734 // Constructor.  Builds the Line_Break class sets, the number matcher and the character iterator.
   2735 RBBILineMonkey::RBBILineMonkey()
   2736 {
   2737     UErrorCode  status = U_ZERO_ERROR;
   2738     fOrigPositions = NULL;   fLB11Matcher = NULL;   // nothing below sets these; never leave them indeterminate
   2739     fSets  = new UVector(status);
   2740 
   2741     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   2742     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   2743     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   2744     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   2745     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   2746     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   2747     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   2748     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   2749     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   2750     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   2751     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   2752     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   2753     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   2754     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   2755     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   2756     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   2757     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   2758     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   2759     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   2760     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   2761     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   2762     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   2763     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   2764     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   2765     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   2766     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   2767     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   2768     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   2769     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   2770     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   2771     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   2772     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   2773     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   2774     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   2775     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   2776     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   2777     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   2778     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   2779     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   2780     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   2781 
   2782     if (U_FAILURE(status)) {
   2783         deferredStatus = status;   // next() checks this and returns -1 without scanning
   2784         fCharBI = NULL;
   2785         fNumberMatcher = NULL;
   2786         return;
   2787     }
   2788 
   2789     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   2790     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   2791     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   2792     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   2793 
   2794     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   2795     // Publish the classes.  fXX and fCJ are not listed; their members were merged into fAL / fNS above.
   2796     fSets->addElement(fBK, status);
   2797     fSets->addElement(fCR, status);
   2798     fSets->addElement(fLF, status);
   2799     fSets->addElement(fCM, status);
   2800     fSets->addElement(fNL, status);
   2801     fSets->addElement(fWJ, status);
   2802     fSets->addElement(fZW, status);
   2803     fSets->addElement(fGL, status);
   2804     fSets->addElement(fCB, status);
   2805     fSets->addElement(fSP, status);
   2806     fSets->addElement(fB2, status);
   2807     fSets->addElement(fBA, status);
   2808     fSets->addElement(fBB, status);
   2809     fSets->addElement(fHY, status);
   2810     fSets->addElement(fH2, status);
   2811     fSets->addElement(fH3, status);
   2812     fSets->addElement(fCL, status);
   2813     fSets->addElement(fCP, status);
   2814     fSets->addElement(fEX, status);
   2815     fSets->addElement(fIN, status);
   2816     fSets->addElement(fJL, status);
   2817     fSets->addElement(fJT, status);
   2818     fSets->addElement(fJV, status);
   2819     fSets->addElement(fNS, status);
   2820     fSets->addElement(fOP, status);
   2821     fSets->addElement(fQU, status);
   2822     fSets->addElement(fIS, status);
   2823     fSets->addElement(fNU, status);
   2824     fSets->addElement(fPO, status);
   2825     fSets->addElement(fPR, status);
   2826     fSets->addElement(fSY, status);
   2827     fSets->addElement(fAI, status);
   2828     fSets->addElement(fAL, status);
   2829     fSets->addElement(fHL, status);
   2830     fSets->addElement(fID, status);
   2831     fSets->addElement(fWJ, status);   // NOTE(review): second insertion of fWJ (also added above); confirm the duplicate is intended.
   2832     fSets->addElement(fRI, status);
   2833     fSets->addElement(fSA, status);
   2834     fSets->addElement(fSG, status);
   2835     // Pattern for a number: optional PR|PO, optional OP|HY, NU, then NU|IS|SY*, optional CL|CP, optional PR|PO (CM* after each).
   2836     const char *rules =
   2837             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   2838             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   2839             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   2840             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   2841             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   2842             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   2843 
   2844     fNumberMatcher = new RegexMatcher(
   2845         UnicodeString(rules, -1, US_INV), 0, status);
   2846 
   2847     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2848     // A failure in any step after the first check is recorded here.
   2849     if (U_FAILURE(status)) {
   2850         deferredStatus = status;
   2851     }
   2852 }
   2853 // setText()  Point this model, its character break iterator and its number matcher at s.
   2854 //            Helpers that the constructor left NULL after a failure are skipped, not dereferenced.
   2855 void RBBILineMonkey::setText(const UnicodeString &s) {
   2856     fText       = &s;
   2857     if (fCharBI != NULL)        { fCharBI->setText(s); }
   2858     if (fNumberMatcher != NULL) { fNumberMatcher->reset(s); }
   2859 }
   2860 
   2861 //
   2862 //  rule9Adjust
   2863 //     Line Break rules LB 9 and LB 10: combining marks are absorbed into the character they
   2864 //     follow, and a combining mark with nothing to attach to is treated as AL.
   2865 //
   2866 //       pos       only compared with -1, which means "nothing to adjust"
   2867 //       posChar   in/out  the character being examined; a combining mark is replaced by 'A'
   2868 //       nextPos   in/out  index where the following text starts; moved past absorbed CM chars
   2869 //       nextChar  out     the character found at the final *nextPos
   2870 //
   2871 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   2872     if (pos == -1) {
   2873         // Invalid initial position, as seen during the warmup iteration of the
   2874         //   main loop in next().  Leave every argument untouched.
   2875         return;
   2876     }
   2877 
   2878     // LB 9  Keep combining sequences together by advancing over CM class chars.
   2879     //       (Line Break CM is not the same thing as the Grapheme Extend property.)
   2880     //       SP, BK, CR, LF, NL and ZW characters do not absorb following marks.
   2881     const UChar32 lead = *posChar;
   2882     const UBool   takesNoMarks = fSP->contains(lead) || fBK->contains(lead) || lead==0x0d ||
   2883                                  lead==0x0a || fNL->contains(lead) || fZW->contains(lead);
   2884 
   2885     int32_t  scanPos = *nextPos;
   2886     if (!takesNoMarks) {
   2887         *nextChar = fText->char32At(scanPos);
   2888         while (fCM->contains(*nextChar)) {
   2889             scanPos   = fText->moveIndex32(scanPos, 1);
   2890             *nextChar = fText->char32At(scanPos);
   2891         }
   2892     }
   2893 
   2894 
   2895     // LB 9  "Treat X CM* as if it were X" needs no code of its own:
   2896     //       the marks were simply stepped over above.
   2897 
   2898     // LB 10  Treat any remaining combining mark as AL
   2899     if (fCM->contains(*posChar)) {
   2900         *posChar = 0x41;   // 'A'
   2901     }
   2902 
   2903     // Hand the (possibly advanced) position and the character found there back to
   2904     // the caller.  They differ from the incoming values only when combining marks
   2905     // were stepped over.
   2906     *nextPos  = scanPos;
   2907     *nextChar = fText->char32At(scanPos);
   2908 }
   2909 
   2910 
   2911 
   2912 int32_t RBBILineMonkey::next(int32_t startPos) {
   2913     UErrorCode status = U_ZERO_ERROR;
   2914     int32_t    pos;       //  Index of the char following a potential break position
   2915     UChar32    thisChar;  //  Character at above position "pos"
   2916 
   2917     int32_t    prevPos;   //  Index of the char preceding a potential break position
   2918     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   2919                           //   and thisChar may not be adjacent because combining
   2920                           //   characters between them will be ignored.
   2921 
   2922     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   2923     UChar32    prevCharX2;
   2924 
   2925     int32_t    nextPos;   //  Index of the next character following pos.
   2926                           //     Usually skips over combining marks.
   2927     int32_t    nextCPPos; //  Index of the code point following "pos."
   2928                           //     May point to a combining mark.
   2929     int32_t    tPos;      //  temp value.
   2930     UChar32    c;
   2931 
   2932     if (U_FAILURE(deferredStatus)) {
   2933         return -1;
   2934     }
   2935 
   2936     if (startPos >= fText->length()) {
   2937         return -1;
   2938     }
   2939 
   2940 
   2941     // Initial values for loop.  Loop will run the first time without finding breaks,
   2942     //                           while the invalid values shift out and the "this" and
   2943     //                           "prev" positions are filled in with good values.
   2944     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   2945     thisChar = prevChar  = prevCharX2 = 0;
   2946     nextPos  = nextCPPos = startPos;
   2947 
   2948 
   2949     // Loop runs once per position in the test text, until a break position
   2950     //  is found.
   2951     for (;;) {
   2952         prevPosX2 = prevPos;
   2953         prevCharX2 = prevChar;
   2954 
   2955         prevPos   = pos;
   2956         prevChar  = thisChar;
   2957 
   2958         pos       = nextPos;
   2959         thisChar  = fText->char32At(pos);
   2960 
   2961         nextCPPos = fText->moveIndex32(pos, 1);
   2962         nextPos   = nextCPPos;
   2963 
   2964         // Rule LB2 - Break at end of text.
   2965         if (pos >= fText->length()) {
   2966             break;
   2967         }
   2968 
   2969         // Rule LB 9 - adjust for combining sequences.
   2970         //             We do this one out-of-order because the adjustment does not change anything
   2971         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   2972         //             be applied.
   2973         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   2974         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   2975         c = fText->char32At(nextPos);
   2976         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   2977 
   2978         // If the loop is still warming up - if we haven't shifted the initial
   2979         //   -1 positions out of prevPos yet - loop back to advance the
   2980         //    position in the input without any further looking for breaks.
   2981         if (prevPos == -1) {
   2982             continue;
   2983         }
   2984 
   2985         // LB 4  Always break after hard line breaks,
   2986         if (fBK->contains(prevChar)) {
   2987             break;
   2988         }
   2989 
   2990         // LB 5  Break after CR, LF, NL, but not inside CR LF
   2991         if (prevChar == 0x0d && thisChar == 0x0a) {
   2992             continue;
   2993         }
   2994         if (prevChar == 0x0d ||
   2995             prevChar == 0x0a ||
   2996             prevChar == 0x85)  {
   2997             break;
   2998         }
   2999 
   3000         // LB 6  Don't break before hard line breaks
   3001         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3002             fBK->contains(thisChar)) {
   3003                 continue;
   3004         }
   3005 
   3006 
   3007         // LB 7  Don't break before spaces or zero-width space.
   3008         if (fSP->contains(thisChar)) {
   3009             continue;
   3010         }
   3011 
   3012         if (fZW->contains(thisChar)) {
   3013             continue;
   3014         }
   3015 
   3016         // LB 8  Break after zero width space
   3017         if (fZW->contains(prevChar)) {
   3018             break;
   3019         }
   3020 
   3021         // LB 9, 10  Already done, at top of loop.
   3022         //
   3023 
   3024 
   3025         // LB 11  Do not break before or after WORD JOINER and related characters.
   3026         //    x  WJ
   3027         //    WJ  x
   3028         //
   3029         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3030             continue;
   3031         }
   3032 
   3033         // LB 12
   3034         //    GL  x
   3035         if (fGL->contains(prevChar)) {
   3036             continue;
   3037         }
   3038 
   3039         // LB 12a
   3040         //    [^SP BA HY] x GL
   3041         if (!(fSP->contains(prevChar) ||
   3042               fBA->contains(prevChar) ||
   3043               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3044             continue;
   3045         }
   3046 
   3047 
   3048 
   3049         // LB 13  Don't break before closings.
   3050         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3051         //        fall into LB 17 and the more general number regular expression.
   3052         //
   3053         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3054             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3055                                          fEX->contains(thisChar)  ||
   3056             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3057             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3058             continue;
   3059         }
   3060 
   3061         // LB 14 Don't break after OP SP*
   3062         //       Scan backwards, checking for this sequence.
   3063         //       The OP char could include combining marks, so we actually check for
   3064         //           OP CM* SP*
   3065         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3066         //       sequence into a ID char, so before scanning back through spaces,
   3067         //       verify that prevChar is indeed a space.  The prevChar variable
   3068         //       may differ from fText[prevPos]
   3069         tPos = prevPos;
   3070         if (fSP->contains(prevChar)) {
   3071             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3072                 tPos=fText->moveIndex32(tPos, -1);
   3073             }
   3074         }
   3075         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3076             tPos=fText->moveIndex32(tPos, -1);
   3077         }
   3078         if (fOP->contains(fText->char32At(tPos))) {
   3079             continue;
   3080         }
   3081 
   3082 
   3083         // LB 15    QU SP* x OP
   3084         if (fOP->contains(thisChar)) {
   3085             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3086             int tPos = prevPos;
   3087             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3088                 tPos = fText->moveIndex32(tPos, -1);
   3089             }
   3090             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3091                 tPos = fText->moveIndex32(tPos, -1);
   3092             }
   3093             if (fQU->contains(fText->char32At(tPos))) {
   3094                 continue;
   3095             }
   3096         }
   3097 
   3098 
   3099 
   3100         // LB 16   (CL | CP) SP* x NS
   3101         //    Scan backwards for SP* CM* (CL | CP)
   3102         if (fNS->contains(thisChar)) {
   3103             int tPos = prevPos;
   3104             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3105                 tPos = fText->moveIndex32(tPos, -1);
   3106             }
   3107             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3108                 tPos = fText->moveIndex32(tPos, -1);
   3109             }
   3110             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3111                 continue;
   3112             }
   3113         }
   3114 
   3115 
   3116         // LB 17        B2 SP* x B2
   3117         if (fB2->contains(thisChar)) {
   3118             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3119             tPos = prevPos;
   3120             if (fSP->contains(prevChar)) {
   3121                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3122                     tPos=fText->moveIndex32(tPos, -1);
   3123                 }
   3124             }
   3125             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3126                 tPos=fText->moveIndex32(tPos, -1);
   3127             }
   3128             if (fB2->contains(fText->char32At(tPos))) {
   3129                 continue;
   3130             }
   3131         }
   3132 
   3133 
   3134         // LB 18    break after space
   3135         if (fSP->contains(prevChar)) {
   3136             break;
   3137         }
   3138 
   3139         // LB 19
   3140         //    x   QU
   3141         //    QU  x
   3142         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3143             continue;
   3144         }
   3145 
   3146         // LB 20  Break around a CB
   3147         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3148             break;
   3149         }
   3150 
   3151         // LB 21
   3152         if (fBA->contains(thisChar) ||
   3153             fHY->contains(thisChar) ||
   3154             fNS->contains(thisChar) ||
   3155             fBB->contains(prevChar) )   {
   3156             continue;
   3157         }
   3158 
   3159         // LB 21a
   3160         //   HL (HY | BA) x
   3161         if (fHL->contains(prevCharX2) &&
   3162                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3163             continue;
   3164         }
   3165 
   3166         // LB 22
   3167         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3168             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3169             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3170             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3171             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3172             continue;
   3173         }
   3174 
   3175 
   3176         // LB 23    ID x PO
   3177         //          AL x NU
   3178         //          HL x NU
   3179         //          NU x AL
   3180         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3181             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3182             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
   3183             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
   3184             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
   3185             continue;
   3186         }
   3187 
   3188         // LB 24  Do not break between prefix and letters or ideographs.
   3189         //        PR x ID
   3190         //        PR x (AL | HL)
   3191         //        PO x (AL | HL)
   3192         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3193             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
   3194             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
   3195             continue;
   3196         }
   3197 
   3198 
   3199 
   3200         // LB 25    Numbers
   3201         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3202             if (U_FAILURE(status)) {
   3203                 break;
   3204             }
   3205             // Matched a number.  But could have been just a single digit, which would
   3206             //    not represent a "no break here" between prevChar and thisChar
   3207             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3208             if (numEndIdx > pos) {
   3209                 // Number match includes at least our two chars being checked
   3210                 if (numEndIdx > nextPos) {
   3211                     // Number match includes additional chars.  Update pos and nextPos
   3212                     //   so that next loop iteration will continue at the end of the number,
   3213                     //   checking for breaks between last char in number & whatever follows.
   3214                     pos = nextPos = numEndIdx;
   3215                     do {
   3216                         pos = fText->moveIndex32(pos, -1);
   3217                         thisChar = fText->char32At(pos);
   3218                     } while (fCM->contains(thisChar));
   3219                 }
   3220                 continue;
   3221             }
   3222         }
   3223 
   3224 
   3225         // LB 26 Do not break a Korean syllable.
   3226         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3227                                         fJV->contains(thisChar) ||
   3228                                         fH2->contains(thisChar) ||
   3229                                         fH3->contains(thisChar))) {
   3230                                             continue;
   3231                                         }
   3232 
   3233         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3234             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3235                 continue;
   3236         }
   3237 
   3238         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3239             fJT->contains(thisChar)) {
   3240                 continue;
   3241         }
   3242 
   3243         // LB 27 Treat a Korean Syllable Block the same as ID.
   3244         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3245             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3246             fIN->contains(thisChar)) {
   3247                 continue;
   3248             }
   3249         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3250             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3251             fPO->contains(thisChar)) {
   3252                 continue;
   3253             }
   3254         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3255             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3256                 continue;
   3257             }
   3258 
   3259 
   3260 
   3261         // LB 28  Do not break between alphabetics ("at").
   3262         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3263             continue;
   3264         }
   3265 
   3266         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3267         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3268             continue;
   3269         }
   3270 
   3271         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3272         //          (AL | NU) x OP
   3273         //          CP x (AL | NU)
   3274         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3275             continue;
   3276         }
   3277         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3278             continue;
   3279         }
   3280 
   3281         // LB30a  Do not break between regional indicators.
   3282         //        RI x RI
   3283         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3284             continue;
   3285         }
   3286 
   3287         // LB 31    Break everywhere else
   3288         break;
   3289 
   3290     }
   3291 
   3292     return pos;
   3293 }
   3294 
   3295 
   3296 UVector  *RBBILineMonkey::charClasses() {
   3297     return fSets;
   3298 }
   3299 
   3300 
   3301 RBBILineMonkey::~RBBILineMonkey() {
   3302     delete fSets;
   3303 
   3304     delete fBK;
   3305     delete fCR;
   3306     delete fLF;
   3307     delete fCM;
   3308     delete fNL;
   3309     delete fWJ;
   3310     delete fZW;
   3311     delete fGL;
   3312     delete fCB;
   3313     delete fSP;
   3314     delete fB2;
   3315     delete fBA;
   3316     delete fBB;
   3317     delete fHY;
   3318     delete fH2;
   3319     delete fH3;
   3320     delete fCL;
   3321     delete fCP;
   3322     delete fEX;
   3323     delete fIN;
   3324     delete fJL;
   3325     delete fJV;
   3326     delete fJT;
   3327     delete fNS;
   3328     delete fOP;
   3329     delete fQU;
   3330     delete fIS;
   3331     delete fNU;
   3332     delete fPO;
   3333     delete fPR;
   3334     delete fSY;
   3335     delete fAI;
   3336     delete fAL;
   3337     delete fCJ;
   3338     delete fHL;
   3339     delete fID;
   3340     delete fRI;
   3341     delete fSA;
   3342     delete fSG;
   3343     delete fXX;
   3344 
   3345     delete fCharBI;
   3346     delete fNumberMatcher;
   3347 }
   3348 
   3349 
   3350 //-------------------------------------------------------------------------------------------
   3351 //
   3352 //   TestMonkey
   3353 //
   3354 //     params
   3355 //       seed=nnnnn        Random number starting seed.
   3356 //                         Setting the seed allows errors to be reproduced.
   3357 //       loop=nnn          Looping count.  Controls running time.
   3358 //                         -1:  run forever.
   3359 //                          0 or greater:  run length.
   3360 //
   3361 //       type = char | word | line | sent | title
   3362 //
   3363 //-------------------------------------------------------------------------------------------
   3364 
   3365 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3366     int32_t val = defaultVal;
   3367     name.append(" *= *(-?\\d+)");
   3368     UErrorCode status = U_ZERO_ERROR;
   3369     RegexMatcher m(name, params, 0, status);
   3370     if (m.find()) {
   3371         // The param exists.  Convert the string to an int.
   3372         char valString[100];
   3373         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3374         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3375             paramLength = (int32_t)(sizeof(valString)-2);
   3376         }
   3377         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3378         val = strtol(valString,  NULL, 10);
   3379 
   3380         // Delete this parameter from the params string.
   3381         m.reset();
   3382         params = m.replaceFirst("", status);
   3383     }
   3384     U_ASSERT(U_SUCCESS(status));
   3385     return val;
   3386 }
   3387 #endif
   3388 
   3389 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3390 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3391                                     BreakIterator *bi,
   3392                                     int expected[],
   3393                                     int expectedcount)
   3394 {
   3395     int count = 0;
   3396     int i = 0;
   3397     int forward[50];
   3398     bi->setText(ustr);
   3399     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3400         forward[count] = i;
   3401         if (count < expectedcount && expected[count] != i) {
   3402             test->errln("break forward test failed: expected %d but got %d",
   3403                         expected[count], i);
   3404             break;
   3405         }
   3406         count ++;
   3407     }
   3408     if (count != expectedcount) {
   3409         printStringBreaks(ustr, expected, expectedcount);
   3410         test->errln("break forward test failed: missed %d match",
   3411                     expectedcount - count);
   3412         return;
   3413     }
   3414     // testing boundaries
   3415     for (i = 1; i < expectedcount; i ++) {
   3416         int j = expected[i - 1];
   3417         if (!bi->isBoundary(j)) {
   3418             printStringBreaks(ustr, expected, expectedcount);
   3419             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3420             return;
   3421         }
   3422         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3423             if (bi->isBoundary(j)) {
   3424                 printStringBreaks(ustr, expected, expectedcount);
   3425                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3426                 return;
   3427             }
   3428         }
   3429     }
   3430 
   3431     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3432         count --;
   3433         if (forward[count] != i) {
   3434             printStringBreaks(ustr, expected, expectedcount);
   3435             test->errln("happy break test previous() failed: expected %d but got %d",
   3436                         forward[count], i);
   3437             break;
   3438         }
   3439     }
   3440     if (count != 0) {
   3441         printStringBreaks(ustr, expected, expectedcount);
   3442         test->errln("break test previous() failed: missed a match");
   3443         return;
   3444     }
   3445 
   3446     // testing preceding
   3447     for (i = 0; i < expectedcount - 1; i ++) {
   3448         // int j = expected[i] + 1;
   3449         int j = ustr.moveIndex32(expected[i], 1);
   3450         for (; j <= expected[i + 1]; j ++) {
   3451             if (bi->preceding(j) != expected[i]) {
   3452                 printStringBreaks(ustr, expected, expectedcount);
   3453                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3454                 return;
   3455             }
   3456         }
   3457     }
   3458 }
   3459 #endif
   3460 
   3461 void RBBITest::TestWordBreaks(void)
   3462 {
   3463 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3464 
   3465     Locale        locale("en");
   3466     UErrorCode    status = U_ZERO_ERROR;
   3467     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3468     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3469     // Replaced any C+J characters in a row with a random sequence of characters
   3470     // of the same length to make our C+J segmentation not get in the way.
   3471     static const char *strlist[] =
   3472     {
   3473     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3474     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3475     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3476     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3477     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3478     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3479     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3480     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3481     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3482     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3483     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3484     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3485     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3486     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3487     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3488     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3489     "\\u0027\\u11af\\U000e0057\\u0602",
   3490     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3491     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3492     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3493     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3494     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3495     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3496     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3497     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3498     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3499     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3500     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3501     "\\ua183\\u102d\\u0bec\\u003a",
   3502     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3503     "\\u003a\\u0e57\\u0fad\\u002e",
   3504     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3505     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3506     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3507     "\\u003a\\u0664\\u00b7\\u1fba",
   3508     "\\u003b\\u0027\\u00b7\\u47a3",
   3509     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3510     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3511     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3512     };
   3513     int loop;
   3514     if (U_FAILURE(status)) {
   3515         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3516         return;
   3517     }
   3518     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3519         // printf("looping %d\n", loop);
   3520         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3521         // RBBICharMonkey monkey;
   3522         RBBIWordMonkey monkey;
   3523 
   3524         int expected[50];
   3525         int expectedcount = 0;
   3526 
   3527         monkey.setText(ustr);
   3528         int i;
   3529         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3530             expected[expectedcount ++] = i;
   3531         }
   3532 
   3533         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3534     }
   3535     delete bi;
   3536 #endif
   3537 }
   3538 
   3539 void RBBITest::TestWordBoundary(void)
   3540 {
   3541     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3542     Locale        locale("en");
   3543     UErrorCode    status = U_ZERO_ERROR;
   3544     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3545     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3546     UChar         str[50];
   3547     static const char *strlist[] =
   3548     {
   3549     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3550     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3551     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3552     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3553     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3554     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3555     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3556     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3557     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3558     "\\u0027\\u11af\\U000e0057\\u0602",
   3559     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3560     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3561     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3562     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3563     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3564     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3565     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3566     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3567     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3568     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3569     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3570     "\\ua183\\u102d\\u0bec\\u003a",
   3571     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3572     "\\u003a\\u0e57\\u0fad\\u002e",
   3573     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3574     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3575     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3576     "\\u003a\\u0664\\u00b7\\u1fba",
   3577     "\\u003b\\u0027\\u00b7\\u47a3",
   3578     };
   3579     int loop;
   3580     if (U_FAILURE(status)) {
   3581         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3582         return;
   3583     }
   3584     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3585         // printf("looping %d\n", loop);
   3586         u_unescape(strlist[loop], str, 20);
   3587         UnicodeString ustr(str);
   3588         int forward[50];
   3589         int count = 0;
   3590 
   3591         bi->setText(ustr);
   3592         int prev = 0;
   3593         int i;
   3594         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3595             forward[count ++] = i;
   3596             if (i > prev) {
   3597                 int j;
   3598                 for (j = prev + 1; j < i; j ++) {
   3599                     if (bi->isBoundary(j)) {
   3600                         printStringBreaks(ustr, forward, count);
   3601                         errln("happy boundary test failed: expected %d not a boundary",
   3602                                j);
   3603                         return;
   3604                     }
   3605                 }
   3606             }
   3607             if (!bi->isBoundary(i)) {
   3608                 printStringBreaks(ustr, forward, count);
   3609                 errln("happy boundary test failed: expected %d a boundary",
   3610                        i);
   3611                 return;
   3612             }
   3613             prev = i;
   3614         }
   3615     }
   3616     delete bi;
   3617 }
   3618 
   3619 void RBBITest::TestLineBreaks(void)
   3620 {
   3621 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3622     Locale        locale("en");
   3623     UErrorCode    status = U_ZERO_ERROR;
   3624     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3625     const int32_t  STRSIZE = 50;
   3626     UChar         str[STRSIZE];
   3627     static const char *strlist[] =
   3628     {
   3629      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3630      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3631              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3632      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3633              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3634      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3635      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3636      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3637      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3638      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3639      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   3640      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3641      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3642      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3643      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3644      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3645      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3646      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3647      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3648      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3649      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   3650      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   3651      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   3652      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   3653      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   3654      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   3655      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   3656      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   3657      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   3658      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   3659      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   3660      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   3661      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   3662      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   3663      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   3664      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   3665      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   3666      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   3667      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   3668      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   3669      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   3670      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   3671          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   3672          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   3673          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   3674      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   3675          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   3676     };
   3677     int loop;
   3678     TEST_ASSERT_SUCCESS(status);
   3679     if (U_FAILURE(status)) {
   3680         return;
   3681     }
   3682     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3683         // printf("looping %d\n", loop);
   3684         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   3685         if (t >= STRSIZE) {
   3686             TEST_ASSERT(FALSE);
   3687             continue;
   3688         }
   3689 
   3690 
   3691         UnicodeString ustr(str);
   3692         RBBILineMonkey monkey;
   3693         if (U_FAILURE(monkey.deferredStatus)) {
   3694             continue;
   3695         }
   3696 
   3697         const int EXPECTEDSIZE = 50;
   3698         int expected[EXPECTEDSIZE];
   3699         int expectedcount = 0;
   3700 
   3701         monkey.setText(ustr);
   3702         int i;
   3703         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3704             if (expectedcount >= EXPECTEDSIZE) {
   3705                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3706                 return;
   3707             }
   3708             expected[expectedcount ++] = i;
   3709         }
   3710 
   3711         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3712     }
   3713     delete bi;
   3714 #endif
   3715 }
   3716 
   3717 void RBBITest::TestSentBreaks(void)
   3718 {
   3719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3720     Locale        locale("en");
   3721     UErrorCode    status = U_ZERO_ERROR;
   3722     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   3723     UChar         str[200];
   3724     static const char *strlist[] =
   3725     {
   3726      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   3727      "This\n",
   3728      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   3729      "\"Sentence ending with a quote.\" Bye.",
   3730      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   3731      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   3732      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   3733      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   3734      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   3735      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   3736      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   3737              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   3738              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   3739              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   3740      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   3741              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   3742              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   3743              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   3744              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   3745              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   3746     };
   3747     int loop;
   3748     if (U_FAILURE(status)) {
   3749         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3750         return;
   3751     }
   3752     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3753         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3754         UnicodeString ustr(str);
   3755 
   3756         RBBISentMonkey monkey;
   3757         if (U_FAILURE(monkey.deferredStatus)) {
   3758             continue;
   3759         }
   3760 
   3761         const int EXPECTEDSIZE = 50;
   3762         int expected[EXPECTEDSIZE];
   3763         int expectedcount = 0;
   3764 
   3765         monkey.setText(ustr);
   3766         int i;
   3767         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3768             if (expectedcount >= EXPECTEDSIZE) {
   3769                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3770                 return;
   3771             }
   3772             expected[expectedcount ++] = i;
   3773         }
   3774 
   3775         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3776     }
   3777     delete bi;
   3778 #endif
   3779 }
   3780 
   3781 void RBBITest::TestMonkey(char *params) {
   3782 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3783 
   3784     UErrorCode     status    = U_ZERO_ERROR;
   3785     int32_t        loopCount = 500;
   3786     int32_t        seed      = 1;
   3787     UnicodeString  breakType = "all";
   3788     Locale         locale("en");
   3789     UBool          useUText  = FALSE;
   3790 
   3791     if (quick == FALSE) {
   3792         loopCount = 10000;
   3793     }
   3794 
   3795     if (params) {
   3796         UnicodeString p(params);
   3797         loopCount = getIntParam("loop", p, loopCount);
   3798         seed      = getIntParam("seed", p, seed);
   3799 
   3800         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   3801         if (m.find()) {
   3802             breakType = m.group(1, status);
   3803             m.reset();
   3804             p = m.replaceFirst("", status);
   3805         }
   3806 
   3807         RegexMatcher u(" *utext", p, 0, status);
   3808         if (u.find()) {
   3809             useUText = TRUE;
   3810             u.reset();
   3811             p = u.replaceFirst("", status);
   3812         }
   3813 
   3814 
   3815         // m.reset(p);
   3816         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   3817             // Each option is stripped out of the option string as it is processed.
   3818             // All options have been checked.  The option string should have been completely emptied..
   3819             char buf[100];
   3820             p.extract(buf, sizeof(buf), NULL, status);
   3821             buf[sizeof(buf)-1] = 0;
   3822             errln("Unrecognized or extra parameter:  %s\n", buf);
   3823             return;
   3824         }
   3825 
   3826     }
   3827 
   3828     if (breakType == "char" || breakType == "all") {
   3829         RBBICharMonkey  m;
   3830         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3831         if (U_SUCCESS(status)) {
   3832             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   3833             if (breakType == "all" && useUText==FALSE) {
   3834                 // Also run a quick test with UText when "all" is specified
   3835                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   3836             }
   3837         }
   3838         else {
   3839             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   3840         }
   3841         delete bi;
   3842     }
   3843 
   3844     if (breakType == "word" || breakType == "all") {
   3845         logln("Word Break Monkey Test");
   3846         RBBIWordMonkey  m;
   3847         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   3848         if (U_SUCCESS(status)) {
   3849             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   3850         }
   3851         else {
   3852             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   3853         }
   3854         delete bi;
   3855     }
   3856 
   3857     if (breakType == "line" || breakType == "all") {
   3858         logln("Line Break Monkey Test");
   3859         RBBILineMonkey  m;
   3860         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   3861         if (loopCount >= 10) {
   3862             loopCount = loopCount / 5;   // Line break runs slower than the others.
   3863         }
   3864         if (U_SUCCESS(status)) {
   3865             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   3866         }
   3867         else {
   3868             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3869         }
   3870         delete bi;
   3871     }
   3872 
   3873     if (breakType == "sent" || breakType == "all"  ) {
   3874         logln("Sentence Break Monkey Test");
   3875         RBBISentMonkey  m;
   3876         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   3877         if (loopCount >= 10) {
   3878             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   3879         }
   3880         if (U_SUCCESS(status)) {
   3881             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   3882         }
   3883         else {
   3884             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3885         }
   3886         delete bi;
   3887     }
   3888 
   3889 #endif
   3890 }
   3891 
   3892 //
   3893 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   3894 //    Parameters:
   3895 //       bi      - the break iterator to use
   3896 //       mk      - MonkeyKind, abstraction for obtaining expected results
   3897 //       name    - Name of test (char, word, etc.) for use in error messages
   3898 //       seed    - Seed for starting random number generator (parameter from user)
   3899 //       numIterations
   3900 //
   3901 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   3902                          int32_t numIterations, UBool useUText) {
   3903 
   3904 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3905 
   3906     const int32_t    TESTSTRINGLEN = 500;
   3907     UnicodeString    testText;
   3908     int32_t          numCharClasses;
   3909     UVector          *chClasses;
   3910     int              expected[TESTSTRINGLEN*2 + 1];
   3911     int              expectedCount = 0;
   3912     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   3913     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   3914     char             reverseBreaks[TESTSTRINGLEN*2+1];
   3915     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   3916     char             followingBreaks[TESTSTRINGLEN*2+1];
   3917     char             precedingBreaks[TESTSTRINGLEN*2+1];
   3918     int              i;
   3919     int              loopCount = 0;
   3920 
   3921     m_seed = seed;
   3922 
   3923     numCharClasses = mk.charClasses()->size();
   3924     chClasses      = mk.charClasses();
   3925 
   3926     // Check for errors that occured during the construction of the MonkeyKind object.
   3927     //  Can't report them where they occured because errln() is a method coming from intlTest,
   3928     //  and is not visible outside of RBBITest :-(
   3929     if (U_FAILURE(mk.deferredStatus)) {
   3930         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   3931         return;
   3932     }
   3933 
   3934     // Verify that the character classes all have at least one member.
   3935     for (i=0; i<numCharClasses; i++) {
   3936         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   3937         if (s == NULL || s->size() == 0) {
   3938             errln("Character Class #%d is null or of zero size.", i);
   3939             return;
   3940         }
   3941     }
   3942 
   3943     while (loopCount < numIterations || numIterations == -1) {
   3944         if (numIterations == -1 && loopCount % 10 == 0) {
   3945             // If test is running in an infinite loop, display a periodic tic so
   3946             //   we can tell that it is making progress.
   3947             fprintf(stderr, ".");
   3948         }
   3949         // Save current random number seed, so that we can recreate the random numbers
   3950         //   for this loop iteration in event of an error.
   3951         seed = m_seed;
   3952 
   3953         // Populate a test string with data.
   3954         testText.truncate(0);
   3955         for (i=0; i<TESTSTRINGLEN; i++) {
   3956             int32_t  aClassNum = m_rand() % numCharClasses;
   3957             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   3958             int32_t   charIdx = m_rand() % classSet->size();
   3959             UChar32   c = classSet->charAt(charIdx);
   3960             if (c < 0) {   // TODO:  deal with sets containing strings.
   3961                 errln("c < 0");
   3962                 break;
   3963             }
   3964             testText.append(c);
   3965         }
   3966 
   3967         // Calculate the expected results for this test string.
   3968         mk.setText(testText);
   3969         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   3970         expectedBreaks[0] = 1;
   3971         int32_t breakPos = 0;
   3972         expectedCount = 0;
   3973         for (;;) {
   3974             breakPos = mk.next(breakPos);
   3975             if (breakPos == -1) {
   3976                 break;
   3977             }
   3978             if (breakPos > testText.length()) {
   3979                 errln("breakPos > testText.length()");
   3980             }
   3981             expectedBreaks[breakPos] = 1;
   3982             U_ASSERT(expectedCount<testText.length());
   3983             expected[expectedCount ++] = breakPos;
   3984         }
   3985 
   3986         // Find the break positions using forward iteration
   3987         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   3988         if (useUText) {
   3989             UErrorCode status = U_ZERO_ERROR;
   3990             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   3991             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   3992             bi->setText(testUText, status);
   3993             TEST_ASSERT_SUCCESS(status);
   3994             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   3995                                       //  This UText can be closed immediately, so long as the
   3996                                       //  testText string continues to exist.
   3997         } else {
   3998             bi->setText(testText);
   3999         }
   4000 
   4001         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4002             if (i < 0 || i > testText.length()) {
   4003                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4004                 break;
   4005             }
   4006             forwardBreaks[i] = 1;
   4007         }
   4008 
   4009         // Find the break positions using reverse iteration
   4010         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4011         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4012             if (i < 0 || i > testText.length()) {
   4013                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4014                 break;
   4015             }
   4016             reverseBreaks[i] = 1;
   4017         }
   4018 
   4019         // Find the break positions using isBoundary() tests.
   4020         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4021         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4022         for (i=0; i<=testText.length(); i++) {
   4023             isBoundaryBreaks[i] = bi->isBoundary(i);
   4024         }
   4025 
   4026 
   4027         // Find the break positions using the following() function.
   4028         // printf(".");
   4029         memset(followingBreaks, 0, sizeof(followingBreaks));
   4030         int32_t   lastBreakPos = 0;
   4031         followingBreaks[0] = 1;
   4032         for (i=0; i<testText.length(); i++) {
   4033             breakPos = bi->following(i);
   4034             if (breakPos <= i ||
   4035                 breakPos < lastBreakPos ||
   4036                 breakPos > testText.length() ||
   4037                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4038                 errln("%s break monkey test: "
   4039                     "Out of range value returned by BreakIterator::following().\n"
   4040                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4041                          name, seed, i, breakPos, lastBreakPos);
   4042                 break;
   4043             }
   4044             followingBreaks[breakPos] = 1;
   4045             lastBreakPos = breakPos;
   4046         }
   4047 
   4048         // Find the break positions using the preceding() function.
   4049         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4050         lastBreakPos = testText.length();
   4051         precedingBreaks[testText.length()] = 1;
   4052         for (i=testText.length(); i>0; i--) {
   4053             breakPos = bi->preceding(i);
   4054             if (breakPos >= i ||
   4055                 breakPos > lastBreakPos ||
   4056                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4057                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4058                 errln("%s break monkey test: "
   4059                     "Out of range value returned by BreakIterator::preceding().\n"
   4060                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4061                     name,  i, breakPos, lastBreakPos);
   4062                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4063                     precedingBreaks[i] = 2;   // Forces an error.
   4064                 }
   4065             } else {
   4066                 if (breakPos >= 0) {
   4067                     precedingBreaks[breakPos] = 1;
   4068                 }
   4069                 lastBreakPos = breakPos;
   4070             }
   4071         }
   4072 
   4073         // Compare the expected and actual results.
   4074         for (i=0; i<=testText.length(); i++) {
   4075             const char *errorType = NULL;
   4076             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4077                 errorType = "next()";
   4078             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4079                 errorType = "previous()";
   4080             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4081                 errorType = "isBoundary()";
   4082             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4083                 errorType = "following()";
   4084             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4085                 errorType = "preceding()";
   4086             }
   4087 
   4088 
   4089             if (errorType != NULL) {
   4090                 // Format a range of the test text that includes the failure as
   4091                 //  a data item that can be included in the rbbi test data file.
   4092 
   4093                 // Start of the range is the last point where expected and actual results
   4094                 //   both agreed that there was a break position.
   4095                 int startContext = i;
   4096                 int32_t count = 0;
   4097                 for (;;) {
   4098                     if (startContext==0) { break; }
   4099                     startContext --;
   4100                     if (expectedBreaks[startContext] != 0) {
   4101                         if (count == 2) break;
   4102                         count ++;
   4103                     }
   4104                 }
   4105 
   4106                 // End of range is two expected breaks past the start position.
   4107                 int endContext = i + 1;
   4108                 int ci;
   4109                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4110                     for (;;) {
   4111                         if (endContext >= testText.length()) {break;}
   4112                         if (expectedBreaks[endContext-1] != 0) {
   4113                             if (count == 0) break;
   4114                             count --;
   4115                         }
   4116                         endContext ++;
   4117                     }
   4118                 }
   4119 
   4120                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4121                 UnicodeString errorText = "<data>";
   4122                 /***if (strcmp(errorType, "next()") == 0) {
   4123                     startContext = 0;
   4124                     endContext = testText.length();
   4125 
   4126                     printStringBreaks(testText, expected, expectedCount);
   4127                 }***/
   4128 
   4129                 for (ci=startContext; ci<endContext;) {
   4130                     UnicodeString hexChars("0123456789abcdef");
   4131                     UChar32  c;
   4132                     int      bn;
   4133                     c = testText.char32At(ci);
   4134                     if (ci == i) {
   4135                         // This is the location of the error.
   4136                         errorText.append("<?>");
   4137                     } else if (expectedBreaks[ci] != 0) {
   4138                         // This a non-error expected break position.
   4139                         errorText.append("\\");
   4140                     }
   4141                     if (c < 0x10000) {
   4142                         errorText.append("\\u");
   4143                         for (bn=12; bn>=0; bn-=4) {
   4144                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4145                         }
   4146                     } else {
   4147                         errorText.append("\\U");
   4148                         for (bn=28; bn>=0; bn-=4) {
   4149                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4150                         }
   4151                     }
   4152                     ci = testText.moveIndex32(ci, 1);
   4153                 }
   4154                 errorText.append("\\");
   4155                 errorText.append("</data>\n");
   4156 
   4157                 // Output the error
   4158                 char  charErrorTxt[500];
   4159                 UErrorCode status = U_ZERO_ERROR;
   4160                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4161                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4162                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4163 
   4164                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4165                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4166                     errorType, seed, i, charErrorTxt);
   4167                 break;
   4168             }
   4169         }
   4170 
   4171         loopCount++;
   4172     }
   4173 #endif
   4174 }
   4175 
   4176 
   4177 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4178 //             This test checks the initial patch,
   4179 //             which is to just keep it from crashing.  Correct word boundaries
   4180 //             await a proper fix to the dictionary code.
   4181 //
   4182 void RBBITest::TestBug5532(void)  {
   4183    // Text includes a mixture of Thai and Latin.
   4184    const unsigned char utf8Data[] = {
   4185            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4186            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4187            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4188            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4189            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4190            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4191            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4192            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4193            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4194            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4195            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4196 
   4197     UErrorCode status = U_ZERO_ERROR;
   4198     UText utext=UTEXT_INITIALIZER;
   4199     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4200     TEST_ASSERT_SUCCESS(status);
   4201 
   4202     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4203     TEST_ASSERT_SUCCESS(status);
   4204     if (U_SUCCESS(status)) {
   4205         bi->setText(&utext, status);
   4206         TEST_ASSERT_SUCCESS(status);
   4207 
   4208         int32_t breakCount = 0;
   4209         int32_t previousBreak = -1;
   4210         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4211             // For now, just make sure that the break iterator doesn't hang.
   4212             TEST_ASSERT(previousBreak < bi->current());
   4213             previousBreak = bi->current();
   4214         }
   4215         TEST_ASSERT(breakCount > 0);
   4216     }
   4217     delete bi;
   4218     utext_close(&utext);
   4219 }
   4220 
   4221 
   4222 void RBBITest::TestBug9983(void)  {
   4223     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4224                                        "\\uFF65"  //   Other
   4225                                        "\\u309C"  //   Katakana
   4226                                        "\\uFF9F"  //   Extend
   4227                                        "\\uFF65"  //   Other
   4228                                        "\\u0020"  //   Other
   4229                                        "\\u0000").unescape();
   4230 
   4231     UErrorCode status = U_ZERO_ERROR;
   4232     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4233         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4234     TEST_ASSERT_SUCCESS(status);
   4235     if (U_FAILURE(status)) {
   4236         return;
   4237     }
   4238     brkiter->setText(text);
   4239     int32_t offset, rstatus;
   4240     brkiter->last();
   4241     int32_t iterationCount = 0;
   4242     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4243         iterationCount++;
   4244         rstatus = brkiter->getRuleStatus();
   4245         // printf(" %d(%d)", offset, rstatus);
   4246         if (iterationCount >= 10) {
   4247            break;
   4248         }
   4249     }
   4250     TEST_ASSERT(iterationCount == 6);
   4251 }
   4252 
   4253 
   4254 //
   4255 //  TestDebug    -  A place-holder test for debugging purposes.
   4256 //                  For putting in fragments of other tests that can be invoked
   4257 //                  for tracing  without a lot of unwanted extra stuff happening.
   4258 //
   4259 void RBBITest::TestDebug(void) {
   4260 #if 0
   4261     UErrorCode   status = U_ZERO_ERROR;
   4262     int pos = 0;
   4263     int ruleStatus = 0;
   4264 
   4265     RuleBasedBreakIterator* bi =
   4266        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4267        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4268        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4269     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4270     // UnicodeString s("Aaa.  Bcd");
   4271     s = s.unescape();
   4272     bi->setText(s);
   4273     UBool r = bi->isBoundary(8);
   4274     printf("%s", r?"true":"false");
   4275     return;
   4276     pos = bi->last();
   4277     do {
   4278         // ruleStatus = bi->getRuleStatus();
   4279         printf("%d\t%d\n", pos, ruleStatus);
   4280         pos = bi->previous();
   4281     } while (pos != BreakIterator::DONE);
   4282 #endif
   4283 }
   4284 
   4285 void RBBITest::TestProperties() {
   4286     UErrorCode errorCode = U_ZERO_ERROR;
   4287     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4288     if (!prependSet.isEmpty()) {
   4289         errln(
   4290             "[:GCB=Prepend:] is not empty any more. "
   4291             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4292             "change this test to the opposite condition.");
   4293     }
   4294 }
   4295 
   4296 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4297