Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2012, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 #include "unicode/regex.h"
     28 #endif
     29 #include "unicode/ustring.h"
     30 #include "unicode/utext.h"
     31 #include "intltest.h"
     32 #include "rbbitst.h"
     33 #include <string.h>
     34 #include "uvector.h"
     35 #include "uvectr32.h"
     36 #include <string.h>
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include "unicode/numfmt.h"
     40 #include "unicode/uscript.h"
     41 
     42 #define TEST_ASSERT(x) {if (!(x)) { \
     43     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     44 
     45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     46     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     47 
     48 
     49 //---------------------------------------------
     50 // runIndexedTest
     51 //---------------------------------------------
     52 
     53 
     54 //  Note:  Before adding new tests to this file, check whether the desired test data can
     55 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
     56 //         it's much less work than writing a new test, diagnostic output in the event of failures
     57 //         is good, and the test data file is shared with ICU4J, so eventually the test
     58 //         will run there as well, without additional effort.
     59 
     60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     61 {
     62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     63 
     64     switch (index) {
     65 #if !UCONFIG_NO_FILE_IO
     66         case 0: name = "TestBug4153072";
     67             if(exec) TestBug4153072();                         break;
     68 #else
     69         case 0: name = "skip";
     70             break;
     71 #endif
     72 
     73         case 1: name = "skip";
     74             break;
     75         case 2: name = "TestStatusReturn";
     76             if(exec) TestStatusReturn();                       break;
     77 
     78 #if !UCONFIG_NO_FILE_IO
     79         case 3: name = "TestUnicodeFiles";
     80             if(exec) TestUnicodeFiles();                       break;
     81         case 4: name = "TestEmptyString";
     82             if(exec) TestEmptyString();                        break;
     83 #else
     84         case 3: case 4: name = "skip";
     85             break;
     86 #endif
     87 
     88         case 5: name = "TestGetAvailableLocales";
     89             if(exec) TestGetAvailableLocales();                break;
     90 
     91         case 6: name = "TestGetDisplayName";
     92             if(exec) TestGetDisplayName();                     break;
     93 
     94 #if !UCONFIG_NO_FILE_IO
     95         case 7: name = "TestEndBehaviour";
     96             if(exec) TestEndBehaviour();                       break;
     97         case 8: case 9: case 10: name = "skip";
     98              break;
     99         case 11: name = "TestWordBreaks";
    100              if(exec) TestWordBreaks();                        break;
    101         case 12: name = "TestWordBoundary";
    102              if(exec) TestWordBoundary();                      break;
    103         case 13: name = "TestLineBreaks";
    104              if(exec) TestLineBreaks();                        break;
    105         case 14: name = "TestSentBreaks";
    106              if(exec) TestSentBreaks();                        break;
    107         case 15: name = "TestExtended";
    108              if(exec) TestExtended();                          break;
    109 #else
    110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    111              break;
    112 #endif
    113 
    114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    115         case 16:
    116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
    117 #else
    118         case 16:
    119              name = "skip";                                    break;
    120 #endif
    121 
    122 #if !UCONFIG_NO_FILE_IO
    123         case 17: name = "TestBug3818";
    124             if(exec) TestBug3818();                            break;
    125 #else
    126         case 17: name = "skip";
    127             break;
    128 #endif
    129 
    130         case 18: name = "skip";
    131             break;
    132         case 19: name = "TestDebug";
    133             if(exec) TestDebug();                              break;
    134         case 20: name = "skip";
    135             break;
    136 
    137 #if !UCONFIG_NO_FILE_IO
    138         case 21: name = "TestBug5775";
    139             if (exec) TestBug5775();                           break;
    140 #else
    141         case 21: name = "skip";
    142             break;
    143 #endif
    144 
    145         case 22: name = "skip";
    146             break;
    147         case 23: name = "TestDictRules";
    148             if (exec) TestDictRules();                         break;
    149         case 24: name = "TestBug5532";
    150             if (exec) TestBug5532();                           break;
    151         default: name = ""; break; //needed to end loop
    152     }
    153 }
    154 
    155 
    156 //---------------------------------------------------------------------------
    157 //
    158 //   class BITestData   Holds a set of Break iterator test data and results
    159 //                      Includes
    160 //                         - the string data to be broken
    161 //                         - a vector of the expected break positions.
    162 //                         - a vector of source line numbers for the data,
    163 //                               (to help see where errors occured.)
    164 //                         - The expected break tag values.
    165 //                         - Vectors of actual break positions and tag values.
    166 //                         - Functions for comparing actual with expected and
    167 //                            reporting errors.
    168 //
    169 //----------------------------------------------------------------------------
    170 class BITestData {
    171 public:
    172     UnicodeString    fDataToBreak;
    173     UVector          fExpectedBreakPositions;
    174     UVector          fExpectedTags;
    175     UVector          fLineNum;
    176     UVector          fActualBreakPositions;   // Test Results.
    177     UVector          fActualTags;
    178 
    179     BITestData(UErrorCode &status);
    180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    181     void             checkResults(const char *heading, RBBITest *test);
    182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    183     void             clearResults();
    184 };
    185 
    186 //
    187 // Constructor.
    188 //
    189 BITestData::BITestData(UErrorCode &status)
    190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    191   fActualTags(status)
    192 {
    193 }
    194 
    195 //
    196 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    197 //                 The macro form collects the line number, which is helpful
    198 //                 when tracking down failures.
    199 //
    200 //                 A null data item is inserted at the start of each test's data
    201 //                  to put the starting zero into the data list.  The position saved for
    202 //                  each non-null item is its ending position.
    203 //
    204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    206     if (U_FAILURE(status)) {return;}
    207     if (data != NULL) {
    208         fDataToBreak.append(CharsToUnicodeString(data));
    209     }
    210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    211     fExpectedTags.addElement(tag, status);
    212     fLineNum.addElement(lineNum, status);
    213 }
    214 
    215 
    216 //
    217 //  checkResults.   Compare the actual and expected break positions, report any differences.
    218 //
    219 void BITestData::checkResults(const char *heading, RBBITest *test) {
    220     int32_t   expectedIndex = 0;
    221     int32_t   actualIndex = 0;
    222 
    223     for (;;) {
    224         // If we've run through both the expected and actual results vectors, we're done.
    225         //   break out of the loop.
    226         if (expectedIndex >= fExpectedBreakPositions.size() &&
    227             actualIndex   >= fActualBreakPositions.size()) {
    228             break;
    229         }
    230 
    231 
    232         if (expectedIndex >= fExpectedBreakPositions.size()) {
    233             err(heading, test, expectedIndex-1, actualIndex);
    234             actualIndex++;
    235             continue;
    236         }
    237 
    238         if (actualIndex >= fActualBreakPositions.size()) {
    239             err(heading, test, expectedIndex, actualIndex-1);
    240             expectedIndex++;
    241             continue;
    242         }
    243 
    244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    245             err(heading, test, expectedIndex, actualIndex);
    246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    248                 actualIndex++;
    249             } else {
    250                 expectedIndex++;
    251             }
    252             continue;
    253         }
    254 
    255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    257                 heading, fLineNum.elementAt(expectedIndex),
    258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    259         }
    260 
    261         actualIndex++;
    262         expectedIndex++;
    263     }
    264 }
    265 
    266 //
    267 //  err   -  An error was found.  Report it, along with information about where the
    268 //                                incorrectly broken test data appeared in the source file.
    269 //
    270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    271 {
    272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    274     int32_t   o        = 0;
    275     int32_t   line     = fLineNum.elementAti(expectedIdx);
    276     if (expectedIdx > 0) {
    277         // The line numbers are off by one because a premature break occurs somewhere
    278         //    within the previous item, rather than at the start of the current (expected) item.
    279         //    We want to report the offset of the unexpected break from the start of
    280         //      this previous item.
    281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    282     }
    283     if (actual < expected) {
    284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    285     } else {
    286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    287     }
    288 }
    289 
    290 
    291 void BITestData::clearResults() {
    292     fActualBreakPositions.removeAllElements();
    293     fActualTags.removeAllElements();
    294 }
    295 
    296 
    297 //--------------------------------------------------------------------------------------
    298 //
    299 //    RBBITest    constructor and destructor
    300 //
    301 //--------------------------------------------------------------------------------------
    302 
    303 RBBITest::RBBITest() {
    304 }
    305 
    306 
    307 RBBITest::~RBBITest() {
    308 }
    309 
    310 //-----------------------------------------------------------------------------------
    311 //
    312 //   Test for status {tag} return value from break rules.
    313 //        TODO:  a more thorough test.
    314 //
    315 //-----------------------------------------------------------------------------------
    316 void RBBITest::TestStatusReturn() {
    317      UnicodeString rulesString1("$Letters = [:L:];\n"
    318                                   "$Numbers = [:N:];\n"
    319                                   "$Letters+{1};\n"
    320                                   "$Numbers+{2};\n"
    321                                   "Help\\ {4}/me\\!;\n"
    322                                   "[^$Letters $Numbers];\n"
    323                                   "!.*;\n", -1, US_INV);
    324      UnicodeString testString1  = "abc123..abc Help me Help me!";
    325                                 // 01234567890123456789012345678
    326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    328 
    329      UErrorCode status=U_ZERO_ERROR;
    330      UParseError    parseError;
    331 
    332      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    333      if(U_FAILURE(status)) {
    334          dataerrln("FAIL : in construction - %s", u_errorName(status));
    335      } else {
    336          int32_t  pos;
    337          int32_t  i = 0;
    338          bi->setText(testString1);
    339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    340              if (pos != bounds1[i]) {
    341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    342                  break;
    343              }
    344 
    345              int tag = bi->getRuleStatus();
    346              if (tag != brkStatus[i]) {
    347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    348                  break;
    349              }
    350              i++;
    351          }
    352      }
    353      delete bi;
    354 }
    355 
    356 
    357 static void printStringBreaks(UnicodeString ustr, int expected[],
    358                               int expectedcount)
    359 {
    360     UErrorCode status = U_ZERO_ERROR;
    361     char name[100];
    362     printf("code    alpha extend alphanum type word sent line name\n");
    363     int j;
    364     for (j = 0; j < ustr.length(); j ++) {
    365         if (expectedcount > 0) {
    366             int k;
    367             for (k = 0; k < expectedcount; k ++) {
    368                 if (j == expected[k]) {
    369                     printf("------------------------------------------------ %d\n",
    370                            j);
    371                 }
    372             }
    373         }
    374         UChar32 c = ustr.char32At(j);
    375         if (c > 0xffff) {
    376             j ++;
    377         }
    378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    380                            u_isUAlphabetic(c),
    381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    382                            u_isalnum(c),
    383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    384                                                   u_charType(c),
    385                                                   U_SHORT_PROPERTY_NAME),
    386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    387                                                   u_getIntPropertyValue(c,
    388                                                           UCHAR_WORD_BREAK),
    389                                                   U_SHORT_PROPERTY_NAME),
    390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    391                                    u_getIntPropertyValue(c,
    392                                            UCHAR_SENTENCE_BREAK),
    393                                    U_SHORT_PROPERTY_NAME),
    394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    395                                    u_getIntPropertyValue(c,
    396                                            UCHAR_LINE_BREAK),
    397                                    U_SHORT_PROPERTY_NAME),
    398                            name);
    399     }
    400 }
    401 
    402 
    403 void RBBITest::TestBug3818() {
    404     UErrorCode  status = U_ZERO_ERROR;
    405 
    406     // Four Thai words...
    407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    409     UnicodeString  thaiStr(thaiWordData);
    410 
    411     RuleBasedBreakIterator* bi =
    412         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    413     if (U_FAILURE(status) || bi == NULL) {
    414         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    415         return;
    416     }
    417     bi->setText(thaiStr);
    418 
    419     int32_t  startOfSecondWord = bi->following(1);
    420     if (startOfSecondWord != 4) {
    421         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    422             __FILE__, __LINE__, startOfSecondWord);
    423     }
    424     startOfSecondWord = bi->following(0);
    425     if (startOfSecondWord != 4) {
    426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    427             __FILE__, __LINE__, startOfSecondWord);
    428     }
    429     delete bi;
    430 }
    431 
    432 //----------------------------------------------------------------------------
    433 //
    434 // generalIteratorTest      Given a break iterator and a set of test data,
    435 //                          Run the tests and report the results.
    436 //
    437 //----------------------------------------------------------------------------
    438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    439 {
    440 
    441     bi.setText(td.fDataToBreak);
    442 
    443     testFirstAndNext(bi, td);
    444 
    445     testLastAndPrevious(bi, td);
    446 
    447     testFollowing(bi, td);
    448     testPreceding(bi, td);
    449     testIsBoundary(bi, td);
    450     doMultipleSelectionTest(bi, td);
    451 }
    452 
    453 
    454 //
    455 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    456 //                       kind of loop.
    457 //
    458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    459 {
    460     UErrorCode  status = U_ZERO_ERROR;
    461     int32_t     p;
    462     int32_t     lastP = -1;
    463     int32_t     tag;
    464 
    465     logln("Test first and next");
    466     bi.setText(td.fDataToBreak);
    467     td.clearResults();
    468 
    469     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    470         td.fActualBreakPositions.addElement(p, status);  // Save result.
    471         tag = bi.getRuleStatus();
    472         td.fActualTags.addElement(tag, status);
    473         if (p <= lastP) {
    474             // If the iterator is not making forward progress, stop.
    475             //  No need to raise an error here, it'll be detected in the normal check of results.
    476             break;
    477         }
    478         lastP = p;
    479     }
    480     td.checkResults("testFirstAndNext", this);
    481 }
    482 
    483 
    484 //
    485 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    486 //
    487 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    488 {
    489     UErrorCode  status = U_ZERO_ERROR;
    490     int32_t     p;
    491     int32_t     lastP  = 0x7ffffffe;
    492     int32_t     tag;
    493 
    494     logln("Test last and previous");
    495     bi.setText(td.fDataToBreak);
    496     td.clearResults();
    497 
    498     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    499         // Save break position.  Insert it at start of vector of results, shoving
    500         //    already-saved results further towards the end.
    501         td.fActualBreakPositions.insertElementAt(p, 0, status);
    502         // bi.previous();   // TODO:  Why does this fix things up????
    503         // bi.next();
    504         tag = bi.getRuleStatus();
    505         td.fActualTags.insertElementAt(tag, 0, status);
    506         if (p >= lastP) {
    507             // If the iterator is not making progress, stop.
    508             //  No need to raise an error here, it'll be detected in the normal check of results.
    509             break;
    510         }
    511         lastP = p;
    512     }
    513     td.checkResults("testLastAndPrevious", this);
    514 }
    515 
    516 
    517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    518 {
    519     UErrorCode  status = U_ZERO_ERROR;
    520     int32_t     p;
    521     int32_t     tag;
    522     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    523                                  //   cannot be -1; that is returned for DONE.
    524     int         i;
    525 
    526     logln("testFollowing():");
    527     bi.setText(td.fDataToBreak);
    528     td.clearResults();
    529 
    530     // Save the starting point, since we won't get that out of following.
    531     p = bi.first();
    532     td.fActualBreakPositions.addElement(p, status);  // Save result.
    533     tag = bi.getRuleStatus();
    534     td.fActualTags.addElement(tag, status);
    535 
    536     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    537         p = bi.following(i);
    538         if (p != lastP) {
    539             if (p == RuleBasedBreakIterator::DONE) {
    540                 break;
    541             }
    542             // We've reached a new break position.  Save it.
    543             td.fActualBreakPositions.addElement(p, status);  // Save result.
    544             tag = bi.getRuleStatus();
    545             td.fActualTags.addElement(tag, status);
    546             lastP = p;
    547         }
    548     }
    549     // The loop normally exits by means of the break in the middle.
    550     // Make sure that the index was at the correct position for the break iterator to have
    551     //   returned DONE.
    552     if (i != td.fDataToBreak.length()) {
    553         errln("testFollowing():  iterator returned DONE prematurely.");
    554     }
    555 
    556     // Full check of all results.
    557     td.checkResults("testFollowing", this);
    558 }
    559 
    560 
    561 
    562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    563     UErrorCode  status = U_ZERO_ERROR;
    564     int32_t     p;
    565     int32_t     tag;
    566     int32_t     lastP  = 0x7ffffffe;
    567     int         i;
    568 
    569     logln("testPreceding():");
    570     bi.setText(td.fDataToBreak);
    571     td.clearResults();
    572 
    573     p = bi.last();
    574     td.fActualBreakPositions.addElement(p, status);
    575     tag = bi.getRuleStatus();
    576     td.fActualTags.addElement(tag, status);
    577 
    578     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    579         p = bi.preceding(i);
    580         if (p != lastP) {
    581             if (p == RuleBasedBreakIterator::DONE) {
    582                 break;
    583             }
    584             // We've reached a new break position.  Save it.
    585             td.fActualBreakPositions.insertElementAt(p, 0, status);
    586             lastP = p;
    587             tag = bi.getRuleStatus();
    588             td.fActualTags.insertElementAt(tag, 0, status);
    589         }
    590     }
    591     // The loop normally exits by means of the break in the middle.
    592     // Make sure that the index was at the correct position for the break iterator to have
    593     //   returned DONE.
    594     if (i != 0) {
    595         errln("testPreceding():  iterator returned DONE prematurely.");
    596     }
    597 
    598     // Full check of all results.
    599     td.checkResults("testPreceding", this);
    600 }
    601 
    602 
    603 
    604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    605     UErrorCode  status = U_ZERO_ERROR;
    606     int         i;
    607     int32_t     tag;
    608 
    609     logln("testIsBoundary():");
    610     bi.setText(td.fDataToBreak);
    611     td.clearResults();
    612 
    613     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    614         if (bi.isBoundary(i)) {
    615             td.fActualBreakPositions.addElement(i, status);  // Save result.
    616             tag = bi.getRuleStatus();
    617             td.fActualTags.addElement(tag, status);
    618         }
    619     }
    620     td.checkResults("testIsBoundary: ", this);
    621 }
    622 
    623 
    624 
    625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    626 {
    627     iterator.setText(td.fDataToBreak);
    628 
    629     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    630     int32_t offset = iterator.first();
    631     int32_t testOffset;
    632     int32_t count = 0;
    633 
    634     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    635 
    636     if (*testIterator != iterator)
    637         errln("clone() or operator!= failed: two clones compared unequal");
    638 
    639     do {
    640         testOffset = testIterator->first();
    641         testOffset = testIterator->next(count);
    642         if (offset != testOffset)
    643             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    644 
    645         if (offset != RuleBasedBreakIterator::DONE) {
    646             count++;
    647             offset = iterator.next();
    648 
    649             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    650                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    651                 if (count > 10000 || offset == -1) {
    652                     errln("operator== failed too many times. Stopping test.");
    653                     if (offset == -1) {
    654                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    655                     }
    656                     return;
    657                 }
    658             }
    659         }
    660     } while (offset != RuleBasedBreakIterator::DONE);
    661 
    662     // now do it backwards...
    663     offset = iterator.last();
    664     count = 0;
    665 
    666     do {
    667         testOffset = testIterator->last();
    668         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    669         if (offset != testOffset)
    670             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    671 
    672         if (offset != RuleBasedBreakIterator::DONE) {
    673             count--;
    674             offset = iterator.previous();
    675         }
    676     } while (offset != RuleBasedBreakIterator::DONE);
    677 
    678     delete testIterator;
    679 }
    680 
    681 
    682 //---------------------------------------------
    683 //
    684 //     other tests
    685 //
    686 //---------------------------------------------
    687 void RBBITest::TestEmptyString()
    688 {
    689     UnicodeString text = "";
    690     UErrorCode status = U_ZERO_ERROR;
    691 
    692     BITestData x(status);
    693     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    694     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    695     if (U_FAILURE(status))
    696     {
    697         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    698         return;
    699     }
    700     generalIteratorTest(*bi, x);
    701     delete bi;
    702 }
    703 
    704 void RBBITest::TestGetAvailableLocales()
    705 {
    706     int32_t locCount = 0;
    707     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    708 
    709     if (locCount == 0)
    710         dataerrln("getAvailableLocales() returned an empty list!");
    711     // Just make sure that it's returning good memory.
    712     int32_t i;
    713     for (i = 0; i < locCount; ++i) {
    714         logln(locList[i].getName());
    715     }
    716 }
    717 
    718 //Testing the BreakIterator::getDisplayName() function
    719 void RBBITest::TestGetDisplayName()
    720 {
    721     UnicodeString   result;
    722 
    723     BreakIterator::getDisplayName(Locale::getUS(), result);
    724     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    725         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    726                 + result);
    727 
    728     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    729     if (result != "French (France)")
    730         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    731                 + result);
    732 }
    733 /**
    734  * Test End Behaviour
    735  * @bug 4068137
    736  */
    737 void RBBITest::TestEndBehaviour()
    738 {
    739     UErrorCode status = U_ZERO_ERROR;
    740     UnicodeString testString("boo.");
    741     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    742     if (U_FAILURE(status))
    743     {
    744         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    745         return;
    746     }
    747     wb->setText(testString);
    748 
    749     if (wb->first() != 0)
    750         errln("Didn't get break at beginning of string.");
    751     if (wb->next() != 3)
    752         errln("Didn't get break before period in \"boo.\"");
    753     if (wb->current() != 4 && wb->next() != 4)
    754         errln("Didn't get break at end of string.");
    755     delete wb;
    756 }
    757 /*
    758  * @bug 4153072
    759  */
    760 void RBBITest::TestBug4153072() {
    761     UErrorCode status = U_ZERO_ERROR;
    762     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    763     if (U_FAILURE(status))
    764     {
    765         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    766         return;
    767     }
    768     UnicodeString str("...Hello, World!...");
    769     int32_t begin = 3;
    770     int32_t end = str.length() - 3;
    771     UBool onBoundary;
    772 
    773     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    774     iter->adoptText(textIterator);
    775     int index;
    776     // Note: with the switch to UText, there is no way to restrict the
    777     //       iteration range to begin at an index other than zero.
    778     //       String character iterators created with a non-zero bound are
    779     //         treated by RBBI as being empty.
    780     for (index = -1; index < begin + 1; ++index) {
    781         onBoundary = iter->isBoundary(index);
    782         if (index == 0?  !onBoundary : onBoundary) {
    783             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    784                             " and begin index = " + begin);
    785         }
    786     }
    787     delete iter;
    788 }
    789 
    790 
    791 //
    792 // Test for problem reported by Ashok Matoria on 9 July 2007
    793 //    One.<kSoftHyphen><kSpace>Two.
    794 //
    795 //    Sentence break at start (0) and then on calling next() it breaks at
    796 //   'T' of "Two". Now, at this point if I do next() and
    797 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    798 //
    799 void RBBITest::TestBug5775() {
    800     UErrorCode status = U_ZERO_ERROR;
    801     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    802     TEST_ASSERT_SUCCESS(status);
    803     if (U_FAILURE(status)) {
    804         return;
    805     }
    806 // Check for status first for better handling of no data errors.
    807     TEST_ASSERT(bi != NULL);
    808     if (bi == NULL) {
    809         return;
    810     }
    811 
    812     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    813     //               01234      56789
    814     s = s.unescape();
    815     bi->setText(s);
    816     int pos = bi->next();
    817     TEST_ASSERT(pos == 6);
    818     pos = bi->next();
    819     TEST_ASSERT(pos == 10);
    820     pos = bi->previous();
    821     TEST_ASSERT(pos == 6);
    822     delete bi;
    823 }
    824 
    825 
    826 
    827 //------------------------------------------------------------------------------
    828 //
    829 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    830 //
    831 //------------------------------------------------------------------------------
    832 
    833 struct TestParams {
    834     BreakIterator   *bi;
    835     UnicodeString    dataToBreak;
    836     UVector32       *expectedBreaks;
    837     UVector32       *srcLine;
    838     UVector32       *srcCol;
    839 };
    840 
    841 void RBBITest::executeTest(TestParams *t) {
    842     int32_t    bp;
    843     int32_t    prevBP;
    844     int32_t    i;
    845 
    846     if (t->bi == NULL) {
    847         return;
    848     }
    849 
    850     t->bi->setText(t->dataToBreak);
    851     //
    852     //  Run the iterator forward
    853     //
    854     prevBP = -1;
    855     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
    856         if (prevBP ==  bp) {
    857             // Fail for lack of forward progress.
    858             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
    859                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    860             break;
    861         }
    862 
    863         // Check that there were we didn't miss an expected break between the last one
    864         //  and this one.
    865         for (i=prevBP+1; i<bp; i++) {
    866             if (t->expectedBreaks->elementAti(i) != 0) {
    867                 int expected[] = {0, i};
    868                 printStringBreaks(t->dataToBreak, expected, 2);
    869                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    870                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    871             }
    872         }
    873 
    874         // Check that the break we did find was expected
    875         if (t->expectedBreaks->elementAti(bp) == 0) {
    876             int expected[] = {0, bp};
    877             printStringBreaks(t->dataToBreak, expected, 2);
    878             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    879                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    880         } else {
    881             // The break was expected.
    882             //   Check that the {nnn} tag value is correct.
    883             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    884             if (expectedTagVal == -1) {
    885                 expectedTagVal = 0;
    886             }
    887             int32_t line = t->srcLine->elementAti(bp);
    888             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    889             if (rs != expectedTagVal) {
    890                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
    891                       "          Actual, Expected status = %4d, %4d",
    892                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    893             }
    894         }
    895 
    896 
    897         prevBP = bp;
    898     }
    899 
    900     // Verify that there were no missed expected breaks after the last one found
    901     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
    902         if (t->expectedBreaks->elementAti(i) != 0) {
    903             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    904                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    905         }
    906     }
    907 
    908     //
    909     //  Run the iterator backwards, verify that the same breaks are found.
    910     //
    911     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    912     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
    913         if (prevBP ==  bp) {
    914             // Fail for lack of progress.
    915             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
    916                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    917             break;
    918         }
    919 
    920         // Check that there were we didn't miss an expected break between the last one
    921         //  and this one.  (UVector returns zeros for index out of bounds.)
    922         for (i=prevBP-1; i>bp; i--) {
    923             if (t->expectedBreaks->elementAti(i) != 0) {
    924                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    925                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    926             }
    927         }
    928 
    929         // Check that the break we did find was expected
    930         if (t->expectedBreaks->elementAti(bp) == 0) {
    931             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    932                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    933         } else {
    934             // The break was expected.
    935             //   Check that the {nnn} tag value is correct.
    936             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    937             if (expectedTagVal == -1) {
    938                 expectedTagVal = 0;
    939             }
    940             int line = t->srcLine->elementAti(bp);
    941             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    942             if (rs != expectedTagVal) {
    943                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
    944                       "          Actual, Expected status = %4d, %4d",
    945                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    946             }
    947         }
    948 
    949         prevBP = bp;
    950     }
    951 
    952     // Verify that there were no missed breaks prior to the last one found
    953     for (i=prevBP-1; i>=0; i--) {
    954         if (t->expectedBreaks->elementAti(i) != 0) {
    955             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    956                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    957         }
    958     }
    959 }
    960 
    961 
    962 void RBBITest::TestExtended() {
    963 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    964     UErrorCode      status  = U_ZERO_ERROR;
    965     Locale          locale("");
    966 
    967     UnicodeString       rules;
    968     TestParams          tp;
    969     tp.bi             = NULL;
    970     tp.expectedBreaks = new UVector32(status);
    971     tp.srcLine        = new UVector32(status);
    972     tp.srcCol         = new UVector32(status);
    973 
    974     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
    975     if (U_FAILURE(status)) {
    976         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
    977     }
    978 
    979 
    980     //
    981     //  Open and read the test data file.
    982     //
    983     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    984     char testFileName[1000];
    985     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
    986         errln("Can't open test data.  Path too long.");
    987         return;
    988     }
    989     strcpy(testFileName, testDataDirectory);
    990     strcat(testFileName, "rbbitst.txt");
    991 
    992     int    len;
    993     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
    994     if (U_FAILURE(status)) {
    995         return; /* something went wrong, error already output */
    996     }
    997 
    998 
    999 
   1000 
   1001     //
   1002     //  Put the test data into a UnicodeString
   1003     //
   1004     UnicodeString testString(FALSE, testFile, len);
   1005 
   1006     enum EParseState{
   1007         PARSE_COMMENT,
   1008         PARSE_TAG,
   1009         PARSE_DATA,
   1010         PARSE_NUM
   1011     }
   1012     parseState = PARSE_TAG;
   1013 
   1014     EParseState savedState = PARSE_TAG;
   1015 
   1016     static const UChar CH_LF        = 0x0a;
   1017     static const UChar CH_CR        = 0x0d;
   1018     static const UChar CH_HASH      = 0x23;
   1019     /*static const UChar CH_PERIOD    = 0x2e;*/
   1020     static const UChar CH_LT        = 0x3c;
   1021     static const UChar CH_GT        = 0x3e;
   1022     static const UChar CH_BACKSLASH = 0x5c;
   1023     static const UChar CH_BULLET    = 0x2022;
   1024 
   1025     int32_t    lineNum  = 1;
   1026     int32_t    colStart = 0;
   1027     int32_t    column   = 0;
   1028     int32_t    charIdx  = 0;
   1029 
   1030     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1031 
   1032     for (charIdx = 0; charIdx < len; ) {
   1033         status = U_ZERO_ERROR;
   1034         UChar  c = testString.charAt(charIdx);
   1035         charIdx++;
   1036         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1037             // treat CRLF as a unit
   1038             c = CH_LF;
   1039             charIdx++;
   1040         }
   1041         if (c == CH_LF || c == CH_CR) {
   1042             lineNum++;
   1043             colStart = charIdx;
   1044         }
   1045         column = charIdx - colStart + 1;
   1046 
   1047         switch (parseState) {
   1048         case PARSE_COMMENT:
   1049             if (c == 0x0a || c == 0x0d) {
   1050                 parseState = savedState;
   1051             }
   1052             break;
   1053 
   1054         case PARSE_TAG:
   1055             {
   1056             if (c == CH_HASH) {
   1057                 parseState = PARSE_COMMENT;
   1058                 savedState = PARSE_TAG;
   1059                 break;
   1060             }
   1061             if (u_isUWhiteSpace(c)) {
   1062                 break;
   1063             }
   1064             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1065                 delete tp.bi;
   1066                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1067                 charIdx += 5;
   1068                 break;
   1069             }
   1070             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1071                 delete tp.bi;
   1072                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1073                 charIdx += 5;
   1074                 break;
   1075             }
   1076             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1077                 delete tp.bi;
   1078                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1079                 charIdx += 5;
   1080                 break;
   1081             }
   1082             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1083                 delete tp.bi;
   1084                 tp.bi = NULL;
   1085                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1086                 charIdx += 5;
   1087                 break;
   1088             }
   1089             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1090                 delete tp.bi;
   1091                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1092                 charIdx += 6;
   1093                 break;
   1094             }
   1095 
   1096             // <locale  loc_name>
   1097             localeMatcher.reset(testString);
   1098             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1099                 UnicodeString localeName = localeMatcher.group(1, status);
   1100                 char localeName8[100];
   1101                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1102                 locale = Locale::createFromName(localeName8);
   1103                 charIdx += localeMatcher.group(0, status).length();
   1104                 TEST_ASSERT_SUCCESS(status);
   1105                 break;
   1106             }
   1107             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1108                 parseState = PARSE_DATA;
   1109                 charIdx += 5;
   1110                 tp.dataToBreak = "";
   1111                 tp.expectedBreaks->removeAllElements();
   1112                 tp.srcCol ->removeAllElements();
   1113                 tp.srcLine->removeAllElements();
   1114                 break;
   1115             }
   1116 
   1117             errln("line %d: Tag expected in test file.", lineNum);
   1118             parseState = PARSE_COMMENT;
   1119             savedState = PARSE_DATA;
   1120             goto end_test; // Stop the test.
   1121             }
   1122             break;
   1123 
   1124         case PARSE_DATA:
   1125             if (c == CH_BULLET) {
   1126                 int32_t  breakIdx = tp.dataToBreak.length();
   1127                 tp.expectedBreaks->setSize(breakIdx+1);
   1128                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1129                 tp.srcLine->setSize(breakIdx+1);
   1130                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1131                 tp.srcCol ->setSize(breakIdx+1);
   1132                 tp.srcCol ->setElementAt(column, breakIdx);
   1133                 break;
   1134             }
   1135 
   1136             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1137                 // Add final entry to mappings from break location to source file position.
   1138                 //  Need one extra because last break position returned is after the
   1139                 //    last char in the data, not at the last char.
   1140                 tp.srcLine->addElement(lineNum, status);
   1141                 tp.srcCol ->addElement(column, status);
   1142 
   1143                 parseState = PARSE_TAG;
   1144                 charIdx += 6;
   1145 
   1146                 // RUN THE TEST!
   1147                 executeTest(&tp);
   1148                 break;
   1149             }
   1150 
   1151             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1152                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1153                 // Get the code point from the name and insert it into the test data.
   1154                 //   (Damn, no API takes names in Unicode  !!!
   1155                 //    we've got to take it back to char *)
   1156                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1157                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1158                 char charNameBuf[200];
   1159                 UChar32 theChar = -1;
   1160                 if (nameEndIdx != -1) {
   1161                     UErrorCode status = U_ZERO_ERROR;
   1162                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1163                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1164                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1165                     if (U_FAILURE(status)) {
   1166                         theChar = -1;
   1167                     }
   1168                 }
   1169                 if (theChar == -1) {
   1170                     errln("Error in named character in test file at line %d, col %d",
   1171                         lineNum, column);
   1172                 } else {
   1173                     // Named code point was recognized.  Insert it
   1174                     //   into the test data.
   1175                     tp.dataToBreak.append(theChar);
   1176                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1177                         tp.srcLine->addElement(lineNum, status);
   1178                         tp.srcCol ->addElement(column, status);
   1179                     }
   1180                 }
   1181                 if (nameEndIdx > charIdx) {
   1182                     charIdx = nameEndIdx+1;
   1183 
   1184                 }
   1185                 break;
   1186             }
   1187 
   1188 
   1189 
   1190 
   1191             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1192                 charIdx++;
   1193                 int32_t  breakIdx = tp.dataToBreak.length();
   1194                 tp.expectedBreaks->setSize(breakIdx+1);
   1195                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1196                 tp.srcLine->setSize(breakIdx+1);
   1197                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1198                 tp.srcCol ->setSize(breakIdx+1);
   1199                 tp.srcCol ->setElementAt(column, breakIdx);
   1200                 break;
   1201             }
   1202 
   1203             if (c == CH_LT) {
   1204                 tagValue   = 0;
   1205                 parseState = PARSE_NUM;
   1206                 break;
   1207             }
   1208 
   1209             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1210                 parseState = PARSE_COMMENT;
   1211                 savedState = PARSE_DATA;
   1212                 break;
   1213             }
   1214 
   1215             if (c == CH_BACKSLASH) {
   1216                 // Check for \ at end of line, a line continuation.
   1217                 //     Advance over (discard) the newline
   1218                 UChar32 cp = testString.char32At(charIdx);
   1219                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1220                     // We have a CR LF
   1221                     //  Need an extra increment of the input ptr to move over both of them
   1222                     charIdx++;
   1223                 }
   1224                 if (cp == CH_LF || cp == CH_CR) {
   1225                     lineNum++;
   1226                     colStart = charIdx;
   1227                     charIdx++;
   1228                     break;
   1229                 }
   1230 
   1231                 // Let unescape handle the back slash.
   1232                 cp = testString.unescapeAt(charIdx);
   1233                 if (cp != -1) {
   1234                     // Escape sequence was recognized.  Insert the char
   1235                     //   into the test data.
   1236                     tp.dataToBreak.append(cp);
   1237                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1238                         tp.srcLine->addElement(lineNum, status);
   1239                         tp.srcCol ->addElement(column, status);
   1240                     }
   1241                     break;
   1242                 }
   1243 
   1244 
   1245                 // Not a recognized backslash escape sequence.
   1246                 // Take the next char as a literal.
   1247                 //  TODO:  Should this be an error?
   1248                 c = testString.charAt(charIdx);
   1249                 charIdx = testString.moveIndex32(charIdx, 1);
   1250             }
   1251 
   1252             // Normal, non-escaped data char.
   1253             tp.dataToBreak.append(c);
   1254 
   1255             // Save the mapping from offset in the data to line/column numbers in
   1256             //   the original input file.  Will be used for better error messages only.
   1257             //   If there's an expected break before this char, the slot in the mapping
   1258             //     vector will already be set for this char; don't overwrite it.
   1259             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1260                 tp.srcLine->addElement(lineNum, status);
   1261                 tp.srcCol ->addElement(column, status);
   1262             }
   1263             break;
   1264 
   1265 
   1266         case PARSE_NUM:
   1267             // We are parsing an expected numeric tag value, like <1234>,
   1268             //   within a chunk of data.
   1269             if (u_isUWhiteSpace(c)) {
   1270                 break;
   1271             }
   1272 
   1273             if (c == CH_GT) {
   1274                 // Finished the number.  Add the info to the expected break data,
   1275                 //   and switch parse state back to doing plain data.
   1276                 parseState = PARSE_DATA;
   1277                 if (tagValue == 0) {
   1278                     tagValue = -1;
   1279                 }
   1280                 int32_t  breakIdx = tp.dataToBreak.length();
   1281                 tp.expectedBreaks->setSize(breakIdx+1);
   1282                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1283                 tp.srcLine->setSize(breakIdx+1);
   1284                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1285                 tp.srcCol ->setSize(breakIdx+1);
   1286                 tp.srcCol ->setElementAt(column, breakIdx);
   1287                 break;
   1288             }
   1289 
   1290             if (u_isdigit(c)) {
   1291                 tagValue = tagValue*10 + u_charDigitValue(c);
   1292                 break;
   1293             }
   1294 
   1295             errln("Syntax Error in test file at line %d, col %d",
   1296                 lineNum, column);
   1297             parseState = PARSE_COMMENT;
   1298             goto end_test; // Stop the test
   1299             break;
   1300         }
   1301 
   1302 
   1303         if (U_FAILURE(status)) {
   1304             dataerrln("ICU Error %s while parsing test file at line %d.",
   1305                 u_errorName(status), lineNum);
   1306             status = U_ZERO_ERROR;
   1307             goto end_test; // Stop the test
   1308         }
   1309 
   1310     }
   1311 
   1312 end_test:
   1313     delete tp.bi;
   1314     delete tp.expectedBreaks;
   1315     delete tp.srcLine;
   1316     delete tp.srcCol;
   1317     delete [] testFile;
   1318 #endif
   1319 }
   1320 
   1321 
   1322 //-------------------------------------------------------------------------------
   1323 //
   1324 //  TestDictRules   create a break iterator from source rules that includes a
   1325 //                  dictionary range.   Regression for bug #7130.  Source rules
   1326 //                  do not declare a break iterator type (word, line, sentence, etc.
   1327 //                  but the dictionary code, without a type, would loop.
   1328 //
   1329 //-------------------------------------------------------------------------------
   1330 void RBBITest::TestDictRules() {
   1331     const char *rules =  "$dictionary = [a-z]; \n"
   1332                          "!!forward; \n"
   1333                          "$dictionary $dictionary; \n"
   1334                          "!!reverse; \n"
   1335                          "$dictionary $dictionary; \n";
   1336     const char *text = "aa";
   1337     UErrorCode status = U_ZERO_ERROR;
   1338     UParseError parseError;
   1339 
   1340     RuleBasedBreakIterator bi(rules, parseError, status);
   1341     if (U_SUCCESS(status)) {
   1342         UnicodeString utext = text;
   1343         bi.setText(utext);
   1344         int32_t position;
   1345         int32_t loops;
   1346         for (loops = 0; loops<10; loops++) {
   1347             position = bi.next();
   1348             if (position == RuleBasedBreakIterator::DONE) {
   1349                 break;
   1350             }
   1351         }
   1352         TEST_ASSERT(loops == 1);
   1353     } else {
   1354         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1355     }
   1356 }
   1357 
   1358 
   1359 
   1360 //-------------------------------------------------------------------------------
   1361 //
   1362 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1363 //    return the datain one big UChar * buffer, which the caller must delete.
   1364 //
   1365 //    parameters:
   1366 //          fileName:   the name of the file, with no directory part.  The test data directory
   1367 //                      is assumed.
   1368 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1369 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1370 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1371 //                      Pass NULL for the system default encoding.
   1372 //          status
   1373 //    returns:
   1374 //                      The file data, converted to UChar.
   1375 //                      The caller must delete this when done with
   1376 //                           delete [] theBuffer;
   1377 //
   1378 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1379 //           Move this function to some common place.
   1380 //
   1381 //--------------------------------------------------------------------------------
   1382 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1383     UChar       *retPtr  = NULL;
   1384     char        *fileBuf = NULL;
   1385     UConverter* conv     = NULL;
   1386     FILE        *f       = NULL;
   1387 
   1388     ulen = 0;
   1389     if (U_FAILURE(status)) {
   1390         return retPtr;
   1391     }
   1392 
   1393     //
   1394     //  Open the file.
   1395     //
   1396     f = fopen(fileName, "rb");
   1397     if (f == 0) {
   1398         dataerrln("Error opening test data file %s\n", fileName);
   1399         status = U_FILE_ACCESS_ERROR;
   1400         return NULL;
   1401     }
   1402     //
   1403     //  Read it in
   1404     //
   1405     int   fileSize;
   1406     int   amt_read;
   1407 
   1408     fseek( f, 0, SEEK_END);
   1409     fileSize = ftell(f);
   1410     fileBuf = new char[fileSize];
   1411     fseek(f, 0, SEEK_SET);
   1412     amt_read = fread(fileBuf, 1, fileSize, f);
   1413     if (amt_read != fileSize || fileSize <= 0) {
   1414         errln("Error reading test data file.");
   1415         goto cleanUpAndReturn;
   1416     }
   1417 
   1418     //
   1419     // Look for a Unicode Signature (BOM) on the data just read
   1420     //
   1421     int32_t        signatureLength;
   1422     const char *   fileBufC;
   1423     const char*    bomEncoding;
   1424 
   1425     fileBufC = fileBuf;
   1426     bomEncoding = ucnv_detectUnicodeSignature(
   1427         fileBuf, fileSize, &signatureLength, &status);
   1428     if(bomEncoding!=NULL ){
   1429         fileBufC  += signatureLength;
   1430         fileSize  -= signatureLength;
   1431         encoding = bomEncoding;
   1432     }
   1433 
   1434     //
   1435     // Open a converter to take the rule file to UTF-16
   1436     //
   1437     conv = ucnv_open(encoding, &status);
   1438     if (U_FAILURE(status)) {
   1439         goto cleanUpAndReturn;
   1440     }
   1441 
   1442     //
   1443     // Convert the rules to UChar.
   1444     //  Preflight first to determine required buffer size.
   1445     //
   1446     ulen = ucnv_toUChars(conv,
   1447         NULL,           //  dest,
   1448         0,              //  destCapacity,
   1449         fileBufC,
   1450         fileSize,
   1451         &status);
   1452     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1453         // Buffer Overflow is expected from the preflight operation.
   1454         status = U_ZERO_ERROR;
   1455 
   1456         retPtr = new UChar[ulen+1];
   1457         ucnv_toUChars(conv,
   1458             retPtr,       //  dest,
   1459             ulen+1,
   1460             fileBufC,
   1461             fileSize,
   1462             &status);
   1463     }
   1464 
   1465 cleanUpAndReturn:
   1466     fclose(f);
   1467     delete []fileBuf;
   1468     ucnv_close(conv);
   1469     if (U_FAILURE(status)) {
   1470         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1471         delete []retPtr;
   1472         retPtr = 0;
   1473         ulen   = 0;
   1474     };
   1475     return retPtr;
   1476 }
   1477 
   1478 
   1479 
   1480 //--------------------------------------------------------------------------------------------
   1481 //
   1482 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1483 //
   1484 //-------------------------------------------------------------------------------------------
   1485 void RBBITest::TestUnicodeFiles() {
   1486     RuleBasedBreakIterator  *bi;
   1487     UErrorCode               status = U_ZERO_ERROR;
   1488 
   1489     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1490     TEST_ASSERT_SUCCESS(status);
   1491     if (U_SUCCESS(status)) {
   1492         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1493     }
   1494     delete bi;
   1495 
   1496     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1497     TEST_ASSERT_SUCCESS(status);
   1498     if (U_SUCCESS(status)) {
   1499         runUnicodeTestData("WordBreakTest.txt", bi);
   1500     }
   1501     delete bi;
   1502 
   1503     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1504     TEST_ASSERT_SUCCESS(status);
   1505     if (U_SUCCESS(status)) {
   1506         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1507     }
   1508     delete bi;
   1509 
   1510     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1511     TEST_ASSERT_SUCCESS(status);
   1512     if (U_SUCCESS(status)) {
   1513         runUnicodeTestData("LineBreakTest.txt", bi);
   1514     }
   1515     delete bi;
   1516 }
   1517 
   1518 
   1519 //--------------------------------------------------------------------------------------------
   1520 //
   1521 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1522 //
   1523 //-------------------------------------------------------------------------------------------
   1524 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1525 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1526     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
   1527     UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
   1528     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   1529     UErrorCode  status = U_ZERO_ERROR;
   1530 
   1531     //
   1532     //  Open and read the test data file, put it into a UnicodeString.
   1533     //
   1534     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1535     char testFileName[1000];
   1536     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1537         dataerrln("Can't open test data.  Path too long.");
   1538         return;
   1539     }
   1540     strcpy(testFileName, testDataDirectory);
   1541     strcat(testFileName, fileName);
   1542 
   1543     logln("Opening data file %s\n", fileName);
   1544 
   1545     int    len;
   1546     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1547     if (status != U_FILE_ACCESS_ERROR) {
   1548         TEST_ASSERT_SUCCESS(status);
   1549         TEST_ASSERT(testFile != NULL);
   1550     }
   1551     if (U_FAILURE(status) || testFile == NULL) {
   1552         return; /* something went wrong, error already output */
   1553     }
   1554     UnicodeString testFileAsString(TRUE, testFile, len);
   1555 
   1556     //
   1557     //  Parse the test data file using a regular expression.
   1558     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1559     //     is identified by which group had a match.
   1560     //
   1561     //    Caputure Group #                  1          2            3            4           5
   1562     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1563     //
   1564     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1565     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1566     UnicodeString   testString;
   1567     UVector32       breakPositions(status);
   1568     int             lineNumber = 1;
   1569     TEST_ASSERT_SUCCESS(status);
   1570     if (U_FAILURE(status)) {
   1571         return;
   1572     }
   1573 
   1574     //
   1575     //  Scan through each test case, building up the string to be broken in testString,
   1576     //   and the positions that should be boundaries in the breakPositions vector.
   1577     //
   1578     int spin = 0;
   1579     while (tokenMatcher.find()) {
   1580       	if(tokenMatcher.hitEnd()) {
   1581           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1582              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1583              and caused an infinite loop here on EBCDIC systems!
   1584           */
   1585           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1586           //	   return;
   1587       	}
   1588         if (tokenMatcher.start(1, status) >= 0) {
   1589             // Scanned a divide sign, indicating a break position in the test data.
   1590             if (testString.length()>0) {
   1591                 breakPositions.addElement(testString.length(), status);
   1592             }
   1593         }
   1594         else if (tokenMatcher.start(2, status) >= 0) {
   1595             // Scanned an 'x', meaning no break at this position in the test data
   1596             //   Nothing to be done here.
   1597             }
   1598         else if (tokenMatcher.start(3, status) >= 0) {
   1599             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1600             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1601             int length = hexNumber.length();
   1602             if (length<=8) {
   1603                 char buf[10];
   1604                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1605                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1606                 if (c<=0x10ffff) {
   1607                     testString.append(c);
   1608                 } else {
   1609                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1610                        fileName, lineNumber);
   1611                 }
   1612             } else {
   1613                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1614                        fileName, lineNumber);
   1615              }
   1616         }
   1617         else if (tokenMatcher.start(4, status) >= 0) {
   1618             // Scanned to end of a line, possibly skipping over a comment in the process.
   1619             //   If the line from the file contained test data, run the test now.
   1620             //
   1621             if (testString.length() > 0) {
   1622 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
   1623 //             Rule 8
   1624 //                ZW SP* <break>
   1625 //             is not yet implemented.
   1626 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
   1627                                             5202 == lineNumber ||
   1628                                             5214 == lineNumber ||
   1629                                             5246 == lineNumber ||
   1630                                             5298 == lineNumber ||
   1631                                             5302 == lineNumber ))) {
   1632                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1633 }
   1634             }
   1635 
   1636             // Clear out this test case.
   1637             //    The string and breakPositions vector will be refilled as the next
   1638             //       test case is parsed.
   1639             testString.remove();
   1640             breakPositions.removeAllElements();
   1641             lineNumber++;
   1642         } else {
   1643             // Scanner catchall.  Something unrecognized appeared on the line.
   1644             char token[16];
   1645             UnicodeString uToken = tokenMatcher.group(0, status);
   1646             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1647             token[sizeof(token)-1] = 0;
   1648             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1649 
   1650             // Clean up, in preparation for continuing with the next line.
   1651             testString.remove();
   1652             breakPositions.removeAllElements();
   1653             lineNumber++;
   1654         }
   1655         TEST_ASSERT_SUCCESS(status);
   1656         if (U_FAILURE(status)) {
   1657             break;
   1658         }
   1659     }
   1660 
   1661     delete [] testFile;
   1662  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1663 }
   1664 
   1665 //--------------------------------------------------------------------------------------------
   1666 //
   1667 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1668 //                            test data files.  Do only a simple, forward-only check -
   1669 //                            this test is mostly to check that ICU and the Unicode
   1670 //                            data agree with each other.
   1671 //
   1672 //--------------------------------------------------------------------------------------------
   1673 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1674                          const UnicodeString &testString,   // Text data to be broken
   1675                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1676                          RuleBasedBreakIterator *bi) {
   1677     int32_t pos;                 // Break Position in the test string
   1678     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1679     int32_t expectedPos;         // Expected break position (index into test string)
   1680 
   1681     bi->setText(testString);
   1682     pos = bi->first();
   1683     pos = bi->next();
   1684 
   1685     while (pos != BreakIterator::DONE) {
   1686         if (expectedI >= breakPositions->size()) {
   1687             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1688                 testFileName, lineNumber, pos);
   1689             break;
   1690         }
   1691         expectedPos = breakPositions->elementAti(expectedI);
   1692         if (pos < expectedPos) {
   1693             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1694                 testFileName, lineNumber, pos);
   1695             break;
   1696         }
   1697         if (pos > expectedPos) {
   1698             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1699                 testFileName, lineNumber, expectedPos);
   1700             break;
   1701         }
   1702         pos = bi->next();
   1703         expectedI++;
   1704     }
   1705 
   1706     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1707         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1708             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1709     }
   1710 }
   1711 
   1712 
   1713 
   1714 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1715 //---------------------------------------------------------------------------------------
   1716 //
   1717 //   class RBBIMonkeyKind
   1718 //
   1719 //      Monkey Test for Break Iteration
   1720 //      Abstract interface class.   Concrete derived classes independently
   1721 //      implement the break rules for different iterator types.
   1722 //
   1723 //      The Monkey Test itself doesn't know which type of break iterator it is
   1724 //      testing, but works purely in terms of the interface defined here.
   1725 //
   1726 //---------------------------------------------------------------------------------------
   1727 class RBBIMonkeyKind {
   1728 public:
   1729     // Return a UVector of UnicodeSets, representing the character classes used
   1730     //   for this type of iterator.
   1731     virtual  UVector  *charClasses() = 0;
   1732 
   1733     // Set the test text on which subsequent calls to next() will operate
   1734     virtual  void      setText(const UnicodeString &s) = 0;
   1735 
   1736     // Find the next break postion, starting from the prev break position, or from zero.
   1737     // Return -1 after reaching end of string.
   1738     virtual  int32_t   next(int32_t i) = 0;
   1739 
   1740     virtual ~RBBIMonkeyKind();
   1741     UErrorCode       deferredStatus;
   1742 
   1743 
   1744 protected:
   1745     RBBIMonkeyKind();
   1746 
   1747 private:
   1748 };
   1749 
   1750 RBBIMonkeyKind::RBBIMonkeyKind() {
   1751     deferredStatus = U_ZERO_ERROR;
   1752 }
   1753 
   1754 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1755 }
   1756 
   1757 
   1758 //----------------------------------------------------------------------------------------
   1759 //
   1760 //   Random Numbers.  Similar to standard lib rand() and srand()
   1761 //                    Not using library to
   1762 //                      1.  Get same results on all platforms.
   1763 //                      2.  Get access to current seed, to more easily reproduce failures.
   1764 //
   1765 //---------------------------------------------------------------------------------------
   1766 static uint32_t m_seed = 1;
   1767 
   1768 static uint32_t m_rand()
   1769 {
   1770     m_seed = m_seed * 1103515245 + 12345;
   1771     return (uint32_t)(m_seed/65536) % 32768;
   1772 }
   1773 
   1774 
   1775 //------------------------------------------------------------------------------------------
   1776 //
   1777 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1778 //                             of RBBIMonkeyKind.
   1779 //
   1780 //------------------------------------------------------------------------------------------
   1781 class RBBICharMonkey: public RBBIMonkeyKind {
   1782 public:
   1783     RBBICharMonkey();
   1784     virtual          ~RBBICharMonkey();
   1785     virtual  UVector *charClasses();
   1786     virtual  void     setText(const UnicodeString &s);
   1787     virtual  int32_t  next(int32_t i);
   1788 private:
   1789     UVector   *fSets;
   1790 
   1791     UnicodeSet  *fCRLFSet;
   1792     UnicodeSet  *fControlSet;
   1793     UnicodeSet  *fExtendSet;
   1794     UnicodeSet  *fRegionalIndicatorSet;
   1795     UnicodeSet  *fPrependSet;
   1796     UnicodeSet  *fSpacingSet;
   1797     UnicodeSet  *fLSet;
   1798     UnicodeSet  *fVSet;
   1799     UnicodeSet  *fTSet;
   1800     UnicodeSet  *fLVSet;
   1801     UnicodeSet  *fLVTSet;
   1802     UnicodeSet  *fHangulSet;
   1803     UnicodeSet  *fAnySet;
   1804 
   1805     const UnicodeString *fText;
   1806 };
   1807 
   1808 
   1809 RBBICharMonkey::RBBICharMonkey() {
   1810     UErrorCode  status = U_ZERO_ERROR;
   1811 
   1812     fText = NULL;
   1813 
   1814     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   1815     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   1816     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   1817     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   1818     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   1819     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   1820     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   1821     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   1822     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   1823     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   1824     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   1825     fHangulSet  = new UnicodeSet();
   1826     fHangulSet->addAll(*fLSet);
   1827     fHangulSet->addAll(*fVSet);
   1828     fHangulSet->addAll(*fTSet);
   1829     fHangulSet->addAll(*fLVSet);
   1830     fHangulSet->addAll(*fLVTSet);
   1831     fAnySet     = new UnicodeSet(0, 0x10ffff);
   1832 
   1833     fSets       = new UVector(status);
   1834     fSets->addElement(fCRLFSet,    status);
   1835     fSets->addElement(fControlSet, status);
   1836     fSets->addElement(fExtendSet,  status);
   1837     fSets->addElement(fRegionalIndicatorSet, status);
   1838     if (!fPrependSet->isEmpty()) {
   1839         fSets->addElement(fPrependSet, status);
   1840     }
   1841     fSets->addElement(fSpacingSet, status);
   1842     fSets->addElement(fHangulSet,  status);
   1843     fSets->addElement(fAnySet,     status);
   1844     if (U_FAILURE(status)) {
   1845         deferredStatus = status;
   1846     }
   1847 }
   1848 
   1849 
   1850 void RBBICharMonkey::setText(const UnicodeString &s) {
   1851     fText = &s;
   1852 }
   1853 
   1854 
   1855 
   1856 int32_t RBBICharMonkey::next(int32_t prevPos) {
   1857     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   1858                               //   break position being tested.  The candidate break
   1859                               //   location is before p2.
   1860 
   1861     int     breakPos = -1;
   1862 
   1863     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   1864 
   1865     if (U_FAILURE(deferredStatus)) {
   1866         return -1;
   1867     }
   1868 
   1869     // Previous break at end of string.  return DONE.
   1870     if (prevPos >= fText->length()) {
   1871         return -1;
   1872     }
   1873     p0 = p1 = p2 = p3 = prevPos;
   1874     c3 =  fText->char32At(prevPos);
   1875     c0 = c1 = c2 = 0;
   1876 
   1877     // Loop runs once per "significant" character position in the input text.
   1878     for (;;) {
   1879         // Move all of the positions forward in the input string.
   1880         p0 = p1;  c0 = c1;
   1881         p1 = p2;  c1 = c2;
   1882         p2 = p3;  c2 = c3;
   1883 
   1884         // Advancd p3 by one codepoint
   1885         p3 = fText->moveIndex32(p3, 1);
   1886         c3 = fText->char32At(p3);
   1887 
   1888         if (p1 == p2) {
   1889             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   1890             continue;
   1891         }
   1892         if (p2 == fText->length()) {
   1893             // Reached end of string.  Always a break position.
   1894             break;
   1895         }
   1896 
   1897         // Rule  GB3   CR x LF
   1898         //     No Extend or Format characters may appear between the CR and LF,
   1899         //     which requires the additional check for p2 immediately following p1.
   1900         //
   1901         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   1902             continue;
   1903         }
   1904 
   1905         // Rule (GB4).   ( Control | CR | LF ) <break>
   1906         if (fControlSet->contains(c1) ||
   1907             c1 == 0x0D ||
   1908             c1 == 0x0A)  {
   1909             break;
   1910         }
   1911 
   1912         // Rule (GB5)    <break>  ( Control | CR | LF )
   1913         //
   1914         if (fControlSet->contains(c2) ||
   1915             c2 == 0x0D ||
   1916             c2 == 0x0A)  {
   1917             break;
   1918         }
   1919 
   1920 
   1921         // Rule (GB6)  L x ( L | V | LV | LVT )
   1922         if (fLSet->contains(c1) &&
   1923                (fLSet->contains(c2)  ||
   1924                 fVSet->contains(c2)  ||
   1925                 fLVSet->contains(c2) ||
   1926                 fLVTSet->contains(c2))) {
   1927             continue;
   1928         }
   1929 
   1930         // Rule (GB7)    ( LV | V )  x  ( V | T )
   1931         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   1932             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   1933             continue;
   1934         }
   1935 
   1936         // Rule (GB8)    ( LVT | T)  x T
   1937         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   1938             fTSet->contains(c2))  {
   1939             continue;
   1940         }
   1941 
   1942         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
   1943         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   1944             continue;
   1945         }
   1946 
   1947         // Rule (GB9)    Numeric x ALetter
   1948         if (fExtendSet->contains(c2))  {
   1949             continue;
   1950         }
   1951 
   1952         // Rule (GB9a)   x  SpacingMark
   1953         if (fSpacingSet->contains(c2)) {
   1954             continue;
   1955         }
   1956 
   1957         // Rule (GB9b)   Prepend x
   1958         if (fPrependSet->contains(c1)) {
   1959             continue;
   1960         }
   1961 
   1962         // Rule (GB10)  Any  <break>  Any
   1963         break;
   1964     }
   1965 
   1966     breakPos = p2;
   1967     return breakPos;
   1968 }
   1969 
   1970 
   1971 
   1972 UVector  *RBBICharMonkey::charClasses() {
   1973     return fSets;
   1974 }
   1975 
   1976 
   1977 RBBICharMonkey::~RBBICharMonkey() {
   1978     delete fSets;
   1979     delete fCRLFSet;
   1980     delete fControlSet;
   1981     delete fExtendSet;
   1982     delete fRegionalIndicatorSet;
   1983     delete fPrependSet;
   1984     delete fSpacingSet;
   1985     delete fLSet;
   1986     delete fVSet;
   1987     delete fTSet;
   1988     delete fLVSet;
   1989     delete fLVTSet;
   1990     delete fHangulSet;
   1991     delete fAnySet;
   1992 }
   1993 
   1994 //------------------------------------------------------------------------------------------
   1995 //
   1996 //   class RBBIWordMonkey      Word Break specific implementation
   1997 //                             of RBBIMonkeyKind.
   1998 //
   1999 //------------------------------------------------------------------------------------------
   2000 class RBBIWordMonkey: public RBBIMonkeyKind {
   2001 public:
   2002     RBBIWordMonkey();
   2003     virtual          ~RBBIWordMonkey();
   2004     virtual  UVector *charClasses();
   2005     virtual  void     setText(const UnicodeString &s);
   2006     virtual int32_t   next(int32_t i);
   2007 private:
   2008     UVector      *fSets;
   2009 
   2010     UnicodeSet  *fCRSet;
   2011     UnicodeSet  *fLFSet;
   2012     UnicodeSet  *fNewlineSet;
   2013     UnicodeSet  *fKatakanaSet;
   2014     UnicodeSet  *fALetterSet;
   2015     // TODO(jungshik): Do we still need this change?
   2016     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   2017     UnicodeSet  *fMidNumLetSet;
   2018     UnicodeSet  *fMidLetterSet;
   2019     UnicodeSet  *fMidNumSet;
   2020     UnicodeSet  *fNumericSet;
   2021     UnicodeSet  *fFormatSet;
   2022     UnicodeSet  *fOtherSet;
   2023     UnicodeSet  *fExtendSet;
   2024     UnicodeSet  *fExtendNumLetSet;
   2025     UnicodeSet  *fRegionalIndicatorSet;
   2026     UnicodeSet  *fDictionaryCjkSet;
   2027 
   2028     RegexMatcher  *fMatcher;
   2029 
   2030     const UnicodeString  *fText;
   2031 };
   2032 
   2033 
   2034 RBBIWordMonkey::RBBIWordMonkey()
   2035 {
   2036     UErrorCode  status = U_ZERO_ERROR;
   2037 
   2038     fSets            = new UVector(status);
   2039 
   2040     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2041     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2042     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2043     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
   2044     // Exclude Hangul syllables from ALetterSet during testing.
   2045     // Leave CJK dictionary characters out from the monkey tests!
   2046 #if 0
   2047     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2048                                       "[\\p{Line_Break = Complex_Context}"
   2049                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2050                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2051                                       "]]",
   2052                                       status);
   2053 #endif
   2054     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2055     fALetterSet->removeAll(*fDictionaryCjkSet);
   2056     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2057     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2058     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2059     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2060     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
   2061     // we should figure out why
   2062     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2063     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2064     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2065     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2066     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2067 
   2068     fOtherSet        = new UnicodeSet();
   2069     if(U_FAILURE(status)) {
   2070       deferredStatus = status;
   2071       return;
   2072     }
   2073 
   2074     fOtherSet->complement();
   2075     fOtherSet->removeAll(*fCRSet);
   2076     fOtherSet->removeAll(*fLFSet);
   2077     fOtherSet->removeAll(*fNewlineSet);
   2078     fOtherSet->removeAll(*fKatakanaSet);
   2079     fOtherSet->removeAll(*fALetterSet);
   2080     fOtherSet->removeAll(*fMidLetterSet);
   2081     fOtherSet->removeAll(*fMidNumSet);
   2082     fOtherSet->removeAll(*fNumericSet);
   2083     fOtherSet->removeAll(*fExtendNumLetSet);
   2084     fOtherSet->removeAll(*fFormatSet);
   2085     fOtherSet->removeAll(*fExtendSet);
   2086     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2087     // Inhibit dictionary characters from being tested at all.
   2088     fOtherSet->removeAll(*fDictionaryCjkSet);
   2089     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2090 
   2091     fSets->addElement(fCRSet,        status);
   2092     fSets->addElement(fLFSet,        status);
   2093     fSets->addElement(fNewlineSet,   status);
   2094     fSets->addElement(fALetterSet,   status);
   2095     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
   2096     fSets->addElement(fMidLetterSet, status);
   2097     fSets->addElement(fMidNumLetSet, status);
   2098     fSets->addElement(fMidNumSet,    status);
   2099     fSets->addElement(fNumericSet,   status);
   2100     fSets->addElement(fFormatSet,    status);
   2101     fSets->addElement(fExtendSet,    status);
   2102     fSets->addElement(fOtherSet,     status);
   2103     fSets->addElement(fExtendNumLetSet, status);
   2104     fSets->addElement(fRegionalIndicatorSet, status);
   2105 
   2106     if (U_FAILURE(status)) {
   2107         deferredStatus = status;
   2108     }
   2109 }
   2110 
   2111 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2112     fText       = &s;
   2113 }
   2114 
   2115 
   2116 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2117     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2118                               //   break position being tested.  The candidate break
   2119                               //   location is before p2.
   2120 
   2121     int     breakPos = -1;
   2122 
   2123     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2124 
   2125     if (U_FAILURE(deferredStatus)) {
   2126         return -1;
   2127     }
   2128 
   2129     // Prev break at end of string.  return DONE.
   2130     if (prevPos >= fText->length()) {
   2131         return -1;
   2132     }
   2133     p0 = p1 = p2 = p3 = prevPos;
   2134     c3 =  fText->char32At(prevPos);
   2135     c0 = c1 = c2 = 0;
   2136 
   2137     // Loop runs once per "significant" character position in the input text.
   2138     for (;;) {
   2139         // Move all of the positions forward in the input string.
   2140         p0 = p1;  c0 = c1;
   2141         p1 = p2;  c1 = c2;
   2142         p2 = p3;  c2 = c3;
   2143 
   2144         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2145         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2146         do {
   2147             p3 = fText->moveIndex32(p3, 1);
   2148             c3 = fText->char32At(p3);
   2149             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2150                break;
   2151             };
   2152         }
   2153         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2154 
   2155 
   2156         if (p1 == p2) {
   2157             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2158             continue;
   2159         }
   2160         if (p2 == fText->length()) {
   2161             // Reached end of string.  Always a break position.
   2162             break;
   2163         }
   2164 
   2165         // Rule  (3)   CR x LF
   2166         //     No Extend or Format characters may appear between the CR and LF,
   2167         //     which requires the additional check for p2 immediately following p1.
   2168         //
   2169         if (c1==0x0D && c2==0x0A) {
   2170             continue;
   2171         }
   2172 
   2173         // Rule (3a)  Break before and after newlines (including CR and LF)
   2174         //
   2175         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2176             break;
   2177         };
   2178         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2179             break;
   2180         };
   2181 
   2182         // Rule (5).   ALetter x ALetter
   2183         if (fALetterSet->contains(c1) &&
   2184             fALetterSet->contains(c2))  {
   2185             continue;
   2186         }
   2187 
   2188         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2189         //
   2190         if ( fALetterSet->contains(c1)   &&
   2191              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2192              fALetterSet->contains(c3)) {
   2193             continue;
   2194         }
   2195 
   2196 
   2197         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2198         if (fALetterSet->contains(c0) &&
   2199             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2200             fALetterSet->contains(c2)) {
   2201             continue;
   2202         }
   2203 
   2204         // Rule (8)    Numeric x Numeric
   2205         if (fNumericSet->contains(c1) &&
   2206             fNumericSet->contains(c2))  {
   2207             continue;
   2208         }
   2209 
   2210         // Rule (9)    ALetter x Numeric
   2211         if (fALetterSet->contains(c1) &&
   2212             fNumericSet->contains(c2))  {
   2213             continue;
   2214         }
   2215 
   2216         // Rule (10)    Numeric x ALetter
   2217         if (fNumericSet->contains(c1) &&
   2218             fALetterSet->contains(c2))  {
   2219             continue;
   2220         }
   2221 
   2222         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2223         if (fNumericSet->contains(c0) &&
   2224             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2225             fNumericSet->contains(c2)) {
   2226             continue;
   2227         }
   2228 
   2229         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2230         if (fNumericSet->contains(c1) &&
   2231             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2232             fNumericSet->contains(c3)) {
   2233             continue;
   2234         }
   2235 
   2236         // Rule (13)  Katakana x Katakana
   2237         if (fKatakanaSet->contains(c1) &&
   2238             fKatakanaSet->contains(c2))  {
   2239             continue;
   2240         }
   2241 
   2242         // Rule 13a
   2243         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2244              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2245              fExtendNumLetSet->contains(c2)) {
   2246                 continue;
   2247         }
   2248 
   2249         // Rule 13b
   2250         if (fExtendNumLetSet->contains(c1) &&
   2251                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2252                 fKatakanaSet->contains(c2)))  {
   2253                 continue;
   2254         }
   2255 
   2256         // Rule 13c
   2257         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2258             continue;
   2259         }
   2260 
   2261         // Rule 14.  Break found here.
   2262         break;
   2263     }
   2264 
   2265     breakPos = p2;
   2266     return breakPos;
   2267 }
   2268 
   2269 
   2270 UVector  *RBBIWordMonkey::charClasses() {
   2271     return fSets;
   2272 }
   2273 
   2274 
   2275 RBBIWordMonkey::~RBBIWordMonkey() {
   2276     delete fSets;
   2277     delete fCRSet;
   2278     delete fLFSet;
   2279     delete fNewlineSet;
   2280     delete fKatakanaSet;
   2281     delete fALetterSet;
   2282     delete fMidNumLetSet;
   2283     delete fMidLetterSet;
   2284     delete fMidNumSet;
   2285     delete fNumericSet;
   2286     delete fFormatSet;
   2287     delete fExtendSet;
   2288     delete fExtendNumLetSet;
   2289     delete fRegionalIndicatorSet;
   2290     delete fDictionaryCjkSet;
   2291     delete fOtherSet;
   2292 }
   2293 
   2294 
   2295 
   2296 
   2297 //------------------------------------------------------------------------------------------
   2298 //
   2299 //   class RBBISentMonkey      Sentence Break specific implementation
   2300 //                             of RBBIMonkeyKind.
   2301 //
   2302 //------------------------------------------------------------------------------------------
   2303 class RBBISentMonkey: public RBBIMonkeyKind {
   2304 public:
   2305     RBBISentMonkey();
   2306     virtual          ~RBBISentMonkey();
   2307     virtual  UVector *charClasses();
   2308     virtual  void     setText(const UnicodeString &s);
   2309     virtual int32_t   next(int32_t i);
   2310 private:
   2311     int               moveBack(int posFrom);
   2312     int               moveForward(int posFrom);
   2313     UChar32           cAt(int pos);
   2314 
   2315     UVector      *fSets;
   2316 
   2317     UnicodeSet  *fSepSet;
   2318     UnicodeSet  *fFormatSet;
   2319     UnicodeSet  *fSpSet;
   2320     UnicodeSet  *fLowerSet;
   2321     UnicodeSet  *fUpperSet;
   2322     UnicodeSet  *fOLetterSet;
   2323     UnicodeSet  *fNumericSet;
   2324     UnicodeSet  *fATermSet;
   2325     UnicodeSet  *fSContinueSet;
   2326     UnicodeSet  *fSTermSet;
   2327     UnicodeSet  *fCloseSet;
   2328     UnicodeSet  *fOtherSet;
   2329     UnicodeSet  *fExtendSet;
   2330 
   2331     const UnicodeString  *fText;
   2332 
   2333 };
   2334 
   2335 RBBISentMonkey::RBBISentMonkey()
   2336 {
   2337     UErrorCode  status = U_ZERO_ERROR;
   2338 
   2339     fSets            = new UVector(status);
   2340 
   2341     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2342     //                       set and made into character classes of their own.  For the monkey impl,
   2343     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2344     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2345     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2346     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2347     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2348     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2349     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2350     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2351     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2352     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2353     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2354     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2355     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2356     fOtherSet        = new UnicodeSet();
   2357 
   2358     if(U_FAILURE(status)) {
   2359       deferredStatus = status;
   2360       return;
   2361     }
   2362 
   2363     fOtherSet->complement();
   2364     fOtherSet->removeAll(*fSepSet);
   2365     fOtherSet->removeAll(*fFormatSet);
   2366     fOtherSet->removeAll(*fSpSet);
   2367     fOtherSet->removeAll(*fLowerSet);
   2368     fOtherSet->removeAll(*fUpperSet);
   2369     fOtherSet->removeAll(*fOLetterSet);
   2370     fOtherSet->removeAll(*fNumericSet);
   2371     fOtherSet->removeAll(*fATermSet);
   2372     fOtherSet->removeAll(*fSContinueSet);
   2373     fOtherSet->removeAll(*fSTermSet);
   2374     fOtherSet->removeAll(*fCloseSet);
   2375     fOtherSet->removeAll(*fExtendSet);
   2376 
   2377     fSets->addElement(fSepSet,       status);
   2378     fSets->addElement(fFormatSet,    status);
   2379     fSets->addElement(fSpSet,        status);
   2380     fSets->addElement(fLowerSet,     status);
   2381     fSets->addElement(fUpperSet,     status);
   2382     fSets->addElement(fOLetterSet,   status);
   2383     fSets->addElement(fNumericSet,   status);
   2384     fSets->addElement(fATermSet,     status);
   2385     fSets->addElement(fSContinueSet, status);
   2386     fSets->addElement(fSTermSet,     status);
   2387     fSets->addElement(fCloseSet,     status);
   2388     fSets->addElement(fOtherSet,     status);
   2389     fSets->addElement(fExtendSet,    status);
   2390 
   2391     if (U_FAILURE(status)) {
   2392         deferredStatus = status;
   2393     }
   2394 }
   2395 
   2396 
   2397 
   2398 void RBBISentMonkey::setText(const UnicodeString &s) {
   2399     fText       = &s;
   2400 }
   2401 
   2402 UVector  *RBBISentMonkey::charClasses() {
   2403     return fSets;
   2404 }
   2405 
   2406 
   2407 //  moveBack()   Find the "significant" code point preceding the index i.
   2408 //               Skips over ($Extend | $Format)* .
   2409 //
   2410 int RBBISentMonkey::moveBack(int i) {
   2411     if (i <= 0) {
   2412         return -1;
   2413     }
   2414     UChar32   c;
   2415     int32_t   j = i;
   2416     do {
   2417         j = fText->moveIndex32(j, -1);
   2418         c = fText->char32At(j);
   2419     }
   2420     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2421     return j;
   2422 
   2423  }
   2424 
   2425 
   2426 int RBBISentMonkey::moveForward(int i) {
   2427     if (i>=fText->length()) {
   2428         return fText->length();
   2429     }
   2430     UChar32   c;
   2431     int32_t   j = i;
   2432     do {
   2433         j = fText->moveIndex32(j, 1);
   2434         c = cAt(j);
   2435     }
   2436     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2437     return j;
   2438 }
   2439 
   2440 UChar32 RBBISentMonkey::cAt(int pos) {
   2441     if (pos<0 || pos>=fText->length()) {
   2442         return -1;
   2443     } else {
   2444         return fText->char32At(pos);
   2445     }
   2446 }
   2447 
   2448 int32_t RBBISentMonkey::next(int32_t prevPos) {
            // Finds the sentence boundary that follows prevPos by applying the
            // sentence break rules directly to fText.
            //   prevPos  index of the previous boundary (where scanning starts).
            //   returns  index of the next boundary, or -1 if the constructor
            //            recorded a failure in deferredStatus or prevPos is
            //            already at or past the end of the text.
            // The rules are tested in order; the first one that matches decides
            // whether the candidate position is a break ("break") or not ("continue").
   2449     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2450                               //   break position being tested.  The candidate break
   2451                               //   location is before p2.
   2452 
   2453     int     breakPos = -1;
   2454 
   2455     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2456     UChar32 c;
   2457 
   2458     if (U_FAILURE(deferredStatus)) {
   2459         return -1;
   2460     }
   2461 
   2462     // Prev break at end of string.  return DONE.
   2463     if (prevPos >= fText->length()) {
   2464         return -1;
   2465     }
            // All four positions start at prevPos; only c3 holds a real character.
            // The first pass through the loop shifts it into p2/c2.
   2466     p0 = p1 = p2 = p3 = prevPos;
   2467     c3 =  fText->char32At(prevPos);
   2468     c0 = c1 = c2 = 0;
   2469 
   2470     // Loop runs once per "significant" character position in the input text.
   2471     for (;;) {
   2472         // Move all of the positions forward in the input string.
   2473         p0 = p1;  c0 = c1;
   2474         p1 = p2;  c1 = c2;
   2475         p2 = p3;  c2 = c3;
   2476 
   2477         // Advance p3 by    X(Extend | Format)*   Rule 4
                //   cAt() returns -1 once p3 runs off the end of the text.
   2478         p3 = moveForward(p3);
   2479         c3 = cAt(p3);
   2480 
   2481         // Rule (3)  CR x LF
                //   Only when the LF immediately follows the CR (nothing skipped between them).
   2482         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2483             continue;
   2484         }
   2485 
   2486         // Rule (4).   Sep  <break>
   2487         if (fSepSet->contains(c1)) {
   2488             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2489             break;
   2490         }
   2491 
   2492         if (p2 >= fText->length()) {
   2493             // Reached end of string.  Always a break position.
   2494             break;
   2495         }
   2496 
   2497         if (p2 == prevPos) {
   2498             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2499             continue;
   2500         }
   2501 
   2502         // Rule (6).   ATerm x Numeric
   2503         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2504             continue;
   2505         }
   2506 
   2507         // Rule (7).  Upper ATerm  x  Upper
   2508         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2509             continue;
   2510         }
   2511 
   2512         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2513         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2514         //                  note to the Unicode 5.0 documents.
                //   Left side: walk back from p1 over Sp*, then Close*, looking for an ATerm.
                //   moveBack() returns -1 at the start of the text and cAt(-1) is -1,
                //   which is in none of the sets, so the backward scans terminate.
   2515         int p8 = p1;
   2516         while (fSpSet->contains(cAt(p8))) {
   2517             p8 = moveBack(p8);
   2518         }
   2519         while (fCloseSet->contains(cAt(p8))) {
   2520             p8 = moveBack(p8);
   2521         }
   2522         if (fATermSet->contains(cAt(p8))) {
                    // Right side: scan forward from p2 to the first character that is in
                    // one of the listed classes (or the end of text), then test for Lower.
   2523             p8=p2;
   2524             for (;;) {
   2525                 c = cAt(p8);
   2526                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2527                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2528                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2529                     break;
   2530                 }
   2531                 p8 = moveForward(p8);
   2532             }
   2533             if (fLowerSet->contains(cAt(p8))) {
   2534                 continue;
   2535             }
   2536         }
   2537 
   2538         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2539         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2540             p8 = p1;
   2541             while (fSpSet->contains(cAt(p8))) {
   2542                 p8 = moveBack(p8);
   2543             }
   2544             while (fCloseSet->contains(cAt(p8))) {
   2545                 p8 = moveBack(p8);
   2546             }
   2547             c = cAt(p8);
   2548             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2549                 continue;
   2550             }
   2551         }
   2552 
   2553         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
                //   CR and LF are members of fSepSet in this implementation.
   2554         int p9 = p1;
   2555         while (fCloseSet->contains(cAt(p9))) {
   2556             p9 = moveBack(p9);
   2557         }
   2558         c = cAt(p9);
   2559         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2560             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2561                 continue;
   2562             }
   2563         }
   2564 
   2565         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2566         int p10 = p1;
   2567         while (fSpSet->contains(cAt(p10))) {
   2568             p10 = moveBack(p10);
   2569         }
   2570         while (fCloseSet->contains(cAt(p10))) {
   2571             p10 = moveBack(p10);
   2572         }
   2573         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2574             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2575                 continue;
   2576             }
   2577         }
   2578 
   2579         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
                //   At most one trailing separator is stepped over (an "if", not a loop).
   2580         int p11 = p1;
   2581         if (fSepSet->contains(cAt(p11))) {
   2582             p11 = moveBack(p11);
   2583         }
   2584         while (fSpSet->contains(cAt(p11))) {
   2585             p11 = moveBack(p11);
   2586         }
   2587         while (fCloseSet->contains(cAt(p11))) {
   2588             p11 = moveBack(p11);
   2589         }
   2590         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2591             break;
   2592         }
   2593 
   2594         //  Rule (12)  Any x Any
   2595         continue;
   2596     }
            // The boundary is immediately before p2 (Rule 4 may have adjusted p2 to p1+1).
   2597     breakPos = p2;
   2598     return breakPos;
   2599 }
   2600 
   2601 RBBISentMonkey::~RBBISentMonkey() {
   2602     delete fSets;
   2603     delete fSepSet;
   2604     delete fFormatSet;
   2605     delete fSpSet;
   2606     delete fLowerSet;
   2607     delete fUpperSet;
   2608     delete fOLetterSet;
   2609     delete fNumericSet;
   2610     delete fATermSet;
   2611     delete fSContinueSet;
   2612     delete fSTermSet;
   2613     delete fCloseSet;
   2614     delete fOtherSet;
   2615     delete fExtendSet;
   2616 }
   2617 
   2618 
   2619 
   2620 //-------------------------------------------------------------------------------------------
   2621 //
   2622 //  RBBILineMonkey
   2623 //
   2624 //-------------------------------------------------------------------------------------------
   2625 
   2626 class RBBILineMonkey: public RBBIMonkeyKind {
   2627 public:
   2628     RBBILineMonkey();
   2629     virtual          ~RBBILineMonkey();
   2630     virtual  UVector *charClasses();
   2631     virtual  void     setText(const UnicodeString &s);
   2632     virtual  int32_t  next(int32_t i);
   2633     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2634 private:
   2635     UVector      *fSets;
   2636 
   2637     UnicodeSet  *fBK;
   2638     UnicodeSet  *fCR;
   2639     UnicodeSet  *fLF;
   2640     UnicodeSet  *fCM;
   2641     UnicodeSet  *fNL;
   2642     UnicodeSet  *fSG;
   2643     UnicodeSet  *fWJ;
   2644     UnicodeSet  *fZW;
   2645     UnicodeSet  *fGL;
   2646     UnicodeSet  *fCB;
   2647     UnicodeSet  *fSP;
   2648     UnicodeSet  *fB2;
   2649     UnicodeSet  *fBA;
   2650     UnicodeSet  *fBB;
   2651     UnicodeSet  *fHY;
   2652     UnicodeSet  *fH2;
   2653     UnicodeSet  *fH3;
   2654     UnicodeSet  *fCL;
   2655     UnicodeSet  *fCP;
   2656     UnicodeSet  *fEX;
   2657     UnicodeSet  *fIN;
   2658     UnicodeSet  *fJL;
   2659     UnicodeSet  *fJV;
   2660     UnicodeSet  *fJT;
   2661     UnicodeSet  *fNS;
   2662     UnicodeSet  *fOP;
   2663     UnicodeSet  *fQU;
   2664     UnicodeSet  *fIS;
   2665     UnicodeSet  *fNU;
   2666     UnicodeSet  *fPO;
   2667     UnicodeSet  *fPR;
   2668     UnicodeSet  *fSY;
   2669     UnicodeSet  *fAI;
   2670     UnicodeSet  *fAL;
   2671     UnicodeSet  *fCJ;
   2672     UnicodeSet  *fHL;
   2673     UnicodeSet  *fID;
   2674     UnicodeSet  *fRI;
   2675     UnicodeSet  *fSA;
   2676     UnicodeSet  *fXX;
   2677 
   2678     BreakIterator  *fCharBI;
   2679 
   2680     const UnicodeString  *fText;
   2681     int32_t              *fOrigPositions;
   2682 
   2683     RegexMatcher         *fNumberMatcher;
   2684     RegexMatcher         *fLB11Matcher;
   2685 };
   2686 
   2687 
   2688 RBBILineMonkey::RBBILineMonkey()
   2689 {
   2690     UErrorCode  status = U_ZERO_ERROR;
   2691 
   2692     fSets  = new UVector(status);
   2693 
   2694     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   2695     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   2696     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   2697     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   2698     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   2699     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   2700     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   2701     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   2702     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   2703     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   2704     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   2705     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   2706     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   2707     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   2708     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   2709     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   2710     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   2711     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   2712     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   2713     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   2714     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   2715     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   2716     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   2717     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   2718     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   2719     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   2720     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   2721     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   2722     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   2723     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   2724     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   2725     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   2726     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   2727     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   2728     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   2729     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   2730     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   2731     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   2732     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   2733     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   2734 
   2735     if (U_FAILURE(status)) {
   2736         deferredStatus = status;
   2737         fCharBI = NULL;
   2738         fNumberMatcher = NULL;
   2739         return;
   2740     }
   2741 
   2742     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   2743     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   2744     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   2745     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   2746 
   2747     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   2748 
   2749     fSets->addElement(fBK, status);
   2750     fSets->addElement(fCR, status);
   2751     fSets->addElement(fLF, status);
   2752     fSets->addElement(fCM, status);
   2753     fSets->addElement(fNL, status);
   2754     fSets->addElement(fWJ, status);
   2755     fSets->addElement(fZW, status);
   2756     fSets->addElement(fGL, status);
   2757     fSets->addElement(fCB, status);
   2758     fSets->addElement(fSP, status);
   2759     fSets->addElement(fB2, status);
   2760     fSets->addElement(fBA, status);
   2761     fSets->addElement(fBB, status);
   2762     fSets->addElement(fHY, status);
   2763     fSets->addElement(fH2, status);
   2764     fSets->addElement(fH3, status);
   2765     fSets->addElement(fCL, status);
   2766     fSets->addElement(fCP, status);
   2767     fSets->addElement(fEX, status);
   2768     fSets->addElement(fIN, status);
   2769     fSets->addElement(fJL, status);
   2770     fSets->addElement(fJT, status);
   2771     fSets->addElement(fJV, status);
   2772     fSets->addElement(fNS, status);
   2773     fSets->addElement(fOP, status);
   2774     fSets->addElement(fQU, status);
   2775     fSets->addElement(fIS, status);
   2776     fSets->addElement(fNU, status);
   2777     fSets->addElement(fPO, status);
   2778     fSets->addElement(fPR, status);
   2779     fSets->addElement(fSY, status);
   2780     fSets->addElement(fAI, status);
   2781     fSets->addElement(fAL, status);
   2782     fSets->addElement(fHL, status);
   2783     fSets->addElement(fID, status);
   2784     fSets->addElement(fWJ, status);
   2785     fSets->addElement(fRI, status);
   2786     fSets->addElement(fSA, status);
   2787     fSets->addElement(fSG, status);
   2788 
   2789     const char *rules =
   2790             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   2791             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   2792             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   2793             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   2794             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   2795             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   2796 
   2797     fNumberMatcher = new RegexMatcher(
   2798         UnicodeString(rules, -1, US_INV), 0, status);
   2799 
   2800     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2801 
   2802     if (U_FAILURE(status)) {
   2803         deferredStatus = status;
   2804     }
   2805 }
   2806 
   2807 
   2808 void RBBILineMonkey::setText(const UnicodeString &s) {
   2809     fText       = &s;
   2810     fCharBI->setText(s);
   2811     fNumberMatcher->reset(s);
   2812 }
   2813 
   2814 //
   2815 //  rule9Adjust
   2816 //     Line Break TR rules 9 and 10 implementation.
   2817 //     This deals with combining marks and other sequences
   2818 //     that must be treated as if they were something other than what they actually are.
   2819 //
   2820 //     This is factored out into a separate function because it must be applied twice for
   2821 //     each potential break, once to the chars before the position being checked, then
   2822 //     again to the text following the possible break.
   2823 //
   2824 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   2825     if (pos == -1) {
   2826         // Invalid initial position.  Happens during the warmup iteration of the
   2827         //   main loop in next().
   2828         return;
   2829     }
   2830 
   2831     int32_t  nPos = *nextPos;
   2832 
   2833     // LB 9  Keep combining sequences together.
   2834     //  advance over any CM class chars.  Note that Line Break CM is different
   2835     //  from the normal Grapheme Extend property.
   2836     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   2837           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   2838         for (;;) {
   2839             *nextChar = fText->char32At(nPos);
   2840             if (!fCM->contains(*nextChar)) {
   2841                 break;
   2842             }
   2843             nPos = fText->moveIndex32(nPos, 1);
   2844         }
   2845     }
   2846 
   2847 
   2848     // LB 9 Treat X CM* as if it were x.
   2849     //       No explicit action required.
   2850 
   2851     // LB 10  Treat any remaining combining mark as AL
   2852     if (fCM->contains(*posChar)) {
   2853         *posChar = 0x41;   // thisChar = 'A';
   2854     }
   2855 
   2856     // Push the updated nextPos and nextChar back to our caller.
   2857     // This only makes a difference if posChar got bigger by consuming a
   2858     // combining sequence.
   2859     *nextPos  = nPos;
   2860     *nextChar = fText->char32At(nPos);
   2861 }
   2862 
   2863 
   2864 
   2865 int32_t RBBILineMonkey::next(int32_t startPos) {
   2866     UErrorCode status = U_ZERO_ERROR;
   2867     int32_t    pos;       //  Index of the char following a potential break position
   2868     UChar32    thisChar;  //  Character at above position "pos"
   2869 
   2870     int32_t    prevPos;   //  Index of the char preceding a potential break position
   2871     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   2872                           //   and thisChar may not be adjacent because combining
   2873                           //   characters between them will be ignored.
   2874 
   2875     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   2876     UChar32    prevCharX2;
   2877 
   2878     int32_t    nextPos;   //  Index of the next character following pos.
   2879                           //     Usually skips over combining marks.
   2880     int32_t    nextCPPos; //  Index of the code point following "pos."
   2881                           //     May point to a combining mark.
   2882     int32_t    tPos;      //  temp value.
   2883     UChar32    c;
   2884 
   2885     if (U_FAILURE(deferredStatus)) {
   2886         return -1;
   2887     }
   2888 
   2889     if (startPos >= fText->length()) {
   2890         return -1;
   2891     }
   2892 
   2893 
   2894     // Initial values for loop.  Loop will run the first time without finding breaks,
   2895     //                           while the invalid values shift out and the "this" and
   2896     //                           "prev" positions are filled in with good values.
   2897     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   2898     thisChar = prevChar  = prevCharX2 = 0;
   2899     nextPos  = nextCPPos = startPos;
   2900 
   2901 
   2902     // Loop runs once per position in the test text, until a break position
   2903     //  is found.
   2904     for (;;) {
   2905         prevPosX2 = prevPos;
   2906         prevCharX2 = prevChar;
   2907 
   2908         prevPos   = pos;
   2909         prevChar  = thisChar;
   2910 
   2911         pos       = nextPos;
   2912         thisChar  = fText->char32At(pos);
   2913 
   2914         nextCPPos = fText->moveIndex32(pos, 1);
   2915         nextPos   = nextCPPos;
   2916 
   2917         // Rule LB2 - Break at end of text.
   2918         if (pos >= fText->length()) {
   2919             break;
   2920         }
   2921 
   2922         // Rule LB 9 - adjust for combining sequences.
   2923         //             We do this one out-of-order because the adjustment does not change anything
   2924         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   2925         //             be applied.
   2926         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   2927         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   2928         c = fText->char32At(nextPos);
   2929         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   2930 
   2931         // If the loop is still warming up - if we haven't shifted the initial
   2932         //   -1 positions out of prevPos yet - loop back to advance the
   2933         //    position in the input without any further looking for breaks.
   2934         if (prevPos == -1) {
   2935             continue;
   2936         }
   2937 
   2938         // LB 4  Always break after hard line breaks,
   2939         if (fBK->contains(prevChar)) {
   2940             break;
   2941         }
   2942 
   2943         // LB 5  Break after CR, LF, NL, but not inside CR LF
   2944         if (prevChar == 0x0d && thisChar == 0x0a) {
   2945             continue;
   2946         }
   2947         if (prevChar == 0x0d ||
   2948             prevChar == 0x0a ||
   2949             prevChar == 0x85)  {
   2950             break;
   2951         }
   2952 
   2953         // LB 6  Don't break before hard line breaks
   2954         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   2955             fBK->contains(thisChar)) {
   2956                 continue;
   2957         }
   2958 
   2959 
   2960         // LB 7  Don't break before spaces or zero-width space.
   2961         if (fSP->contains(thisChar)) {
   2962             continue;
   2963         }
   2964 
   2965         if (fZW->contains(thisChar)) {
   2966             continue;
   2967         }
   2968 
   2969         // LB 8  Break after zero width space
   2970         if (fZW->contains(prevChar)) {
   2971             break;
   2972         }
   2973 
   2974         // LB 9, 10  Already done, at top of loop.
   2975         //
   2976 
   2977 
   2978         // LB 11  Do not break before or after WORD JOINER and related characters.
   2979         //    x  WJ
   2980         //    WJ  x
   2981         //
   2982         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   2983             continue;
   2984         }
   2985 
   2986         // LB 12
   2987         //    GL  x
   2988         if (fGL->contains(prevChar)) {
   2989             continue;
   2990         }
   2991 
   2992         // LB 12a
   2993         //    [^SP BA HY] x GL
   2994         if (!(fSP->contains(prevChar) ||
   2995               fBA->contains(prevChar) ||
   2996               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   2997             continue;
   2998         }
   2999 
   3000 
   3001 
   3002         // LB 13  Don't break before closings.
   3003         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3004         //        fall into LB 17 and the more general number regular expression.
   3005         //
   3006         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3007             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3008                                          fEX->contains(thisChar)  ||
   3009             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3010             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3011             continue;
   3012         }
   3013 
   3014         // LB 14 Don't break after OP SP*
   3015         //       Scan backwards, checking for this sequence.
   3016         //       The OP char could include combining marks, so we actually check for
   3017         //           OP CM* SP*
   3018         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3019         //       sequence into a ID char, so before scanning back through spaces,
   3020         //       verify that prevChar is indeed a space.  The prevChar variable
   3021         //       may differ from fText[prevPos]
   3022         tPos = prevPos;
   3023         if (fSP->contains(prevChar)) {
   3024             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3025                 tPos=fText->moveIndex32(tPos, -1);
   3026             }
   3027         }
   3028         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3029             tPos=fText->moveIndex32(tPos, -1);
   3030         }
   3031         if (fOP->contains(fText->char32At(tPos))) {
   3032             continue;
   3033         }
   3034 
   3035 
   3036         // LB 15    QU SP* x OP
   3037         if (fOP->contains(thisChar)) {
   3038             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3039             int tPos = prevPos;
   3040             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3041                 tPos = fText->moveIndex32(tPos, -1);
   3042             }
   3043             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3044                 tPos = fText->moveIndex32(tPos, -1);
   3045             }
   3046             if (fQU->contains(fText->char32At(tPos))) {
   3047                 continue;
   3048             }
   3049         }
   3050 
   3051 
   3052 
   3053         // LB 16   (CL | CP) SP* x NS
   3054         //    Scan backwards for SP* CM* (CL | CP)
   3055         if (fNS->contains(thisChar)) {
   3056             int tPos = prevPos;
   3057             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3058                 tPos = fText->moveIndex32(tPos, -1);
   3059             }
   3060             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3061                 tPos = fText->moveIndex32(tPos, -1);
   3062             }
   3063             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3064                 continue;
   3065             }
   3066         }
   3067 
   3068 
   3069         // LB 17        B2 SP* x B2
   3070         if (fB2->contains(thisChar)) {
   3071             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3072             tPos = prevPos;
   3073             if (fSP->contains(prevChar)) {
   3074                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3075                     tPos=fText->moveIndex32(tPos, -1);
   3076                 }
   3077             }
   3078             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3079                 tPos=fText->moveIndex32(tPos, -1);
   3080             }
   3081             if (fB2->contains(fText->char32At(tPos))) {
   3082                 continue;
   3083             }
   3084         }
   3085 
   3086 
   3087         // LB 18    break after space
   3088         if (fSP->contains(prevChar)) {
   3089             break;
   3090         }
   3091 
   3092         // LB 19
   3093         //    x   QU
   3094         //    QU  x
   3095         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3096             continue;
   3097         }
   3098 
   3099         // LB 20  Break around a CB
   3100         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3101             break;
   3102         }
   3103 
   3104         // LB 21
   3105         if (fBA->contains(thisChar) ||
   3106             fHY->contains(thisChar) ||
   3107             fNS->contains(thisChar) ||
   3108             fBB->contains(prevChar) )   {
   3109             continue;
   3110         }
   3111 
   3112         // LB 21a
   3113         //   HL (HY | BA) x
   3114         if (fHL->contains(prevCharX2) &&
   3115                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3116             continue;
   3117         }
   3118 
   3119         // LB 22
   3120         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3121             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3122             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3123             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3124             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3125             continue;
   3126         }
   3127 
   3128 
   3129         // LB 23    ID x PO
   3130         //          AL x NU
   3131         //          HL x NU
   3132         //          NU x AL
   3133         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3134             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3135             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
   3136             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
   3137             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
   3138             continue;
   3139         }
   3140 
   3141         // LB 24  Do not break between prefix and letters or ideographs.
   3142         //        PR x ID
   3143         //        PR x (AL | HL)
   3144         //        PO x (AL | HL)
   3145         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3146             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
   3147             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
   3148             continue;
   3149         }
   3150 
   3151 
   3152 
   3153         // LB 25    Numbers
   3154         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3155             if (U_FAILURE(status)) {
   3156                 break;
   3157             }
   3158             // Matched a number.  But could have been just a single digit, which would
   3159             //    not represent a "no break here" between prevChar and thisChar
   3160             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3161             if (numEndIdx > pos) {
   3162                 // Number match includes at least our two chars being checked
   3163                 if (numEndIdx > nextPos) {
   3164                     // Number match includes additional chars.  Update pos and nextPos
   3165                     //   so that next loop iteration will continue at the end of the number,
   3166                     //   checking for breaks between last char in number & whatever follows.
   3167                     pos = nextPos = numEndIdx;
   3168                     do {
   3169                         pos = fText->moveIndex32(pos, -1);
   3170                         thisChar = fText->char32At(pos);
   3171                     } while (fCM->contains(thisChar));
   3172                 }
   3173                 continue;
   3174             }
   3175         }
   3176 
   3177 
   3178         // LB 26 Do not break a Korean syllable.
   3179         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3180                                         fJV->contains(thisChar) ||
   3181                                         fH2->contains(thisChar) ||
   3182                                         fH3->contains(thisChar))) {
   3183                                             continue;
   3184                                         }
   3185 
   3186         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3187             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3188                 continue;
   3189         }
   3190 
   3191         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3192             fJT->contains(thisChar)) {
   3193                 continue;
   3194         }
   3195 
   3196         // LB 27 Treat a Korean Syllable Block the same as ID.
   3197         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3198             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3199             fIN->contains(thisChar)) {
   3200                 continue;
   3201             }
   3202         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3203             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3204             fPO->contains(thisChar)) {
   3205                 continue;
   3206             }
   3207         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3208             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3209                 continue;
   3210             }
   3211 
   3212 
   3213 
   3214         // LB 28  Do not break between alphabetics ("at").
   3215         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3216             continue;
   3217         }
   3218 
   3219         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3220         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3221             continue;
   3222         }
   3223 
   3224         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3225         //          (AL | NU) x OP
   3226         //          CP x (AL | NU)
   3227         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3228             continue;
   3229         }
   3230         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3231             continue;
   3232         }
   3233 
   3234         // LB30a  Do not break between regional indicators.
   3235         //        RI x RI
   3236         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3237             continue;
   3238         }
   3239 
   3240         // LB 31    Break everywhere else
   3241         break;
   3242 
   3243     }
   3244 
   3245     return pos;
   3246 }
   3247 
   3248 
   3249 UVector  *RBBILineMonkey::charClasses() {
   3250     return fSets;
   3251 }
   3252 
   3253 
   3254 RBBILineMonkey::~RBBILineMonkey() {
   3255     delete fSets;
   3256 
   3257     delete fBK;
   3258     delete fCR;
   3259     delete fLF;
   3260     delete fCM;
   3261     delete fNL;
   3262     delete fWJ;
   3263     delete fZW;
   3264     delete fGL;
   3265     delete fCB;
   3266     delete fSP;
   3267     delete fB2;
   3268     delete fBA;
   3269     delete fBB;
   3270     delete fHY;
   3271     delete fH2;
   3272     delete fH3;
   3273     delete fCL;
   3274     delete fCP;
   3275     delete fEX;
   3276     delete fIN;
   3277     delete fJL;
   3278     delete fJV;
   3279     delete fJT;
   3280     delete fNS;
   3281     delete fOP;
   3282     delete fQU;
   3283     delete fIS;
   3284     delete fNU;
   3285     delete fPO;
   3286     delete fPR;
   3287     delete fSY;
   3288     delete fAI;
   3289     delete fAL;
   3290     delete fCJ;
   3291     delete fHL;
   3292     delete fID;
   3293     delete fRI;
   3294     delete fSA;
   3295     delete fSG;
   3296     delete fXX;
   3297 
   3298     delete fCharBI;
   3299     delete fNumberMatcher;
   3300 }
   3301 
   3302 
   3303 //-------------------------------------------------------------------------------------------
   3304 //
   3305 //   TestMonkey
   3306 //
   3307 //     params
   3308 //       seed=nnnnn        Random number starting seed.
   3309 //                         Setting the seed allows errors to be reproduced.
   3310 //       loop=nnn          Looping count.  Controls running time.
   3311 //                         -1:  run forever.
   3312 //                          0 or greater:  run length.
   3313 //
   3314 //       type = char | word | line | sent | title
   3315 //
   3316 //-------------------------------------------------------------------------------------------
   3317 
   3318 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3319     int32_t val = defaultVal;
   3320     name.append(" *= *(-?\\d+)");
   3321     UErrorCode status = U_ZERO_ERROR;
   3322     RegexMatcher m(name, params, 0, status);
   3323     if (m.find()) {
   3324         // The param exists.  Convert the string to an int.
   3325         char valString[100];
   3326         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3327         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3328             paramLength = (int32_t)(sizeof(valString)-2);
   3329         }
   3330         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3331         val = strtol(valString,  NULL, 10);
   3332 
   3333         // Delete this parameter from the params string.
   3334         m.reset();
   3335         params = m.replaceFirst("", status);
   3336     }
   3337     U_ASSERT(U_SUCCESS(status));
   3338     return val;
   3339 }
   3340 #endif
   3341 
   3342 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3343 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3344                                     BreakIterator *bi,
   3345                                     int expected[],
   3346                                     int expectedcount)
   3347 {
   3348     int count = 0;
   3349     int i = 0;
   3350     int forward[50];
   3351     bi->setText(ustr);
   3352     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3353         forward[count] = i;
   3354         if (count < expectedcount && expected[count] != i) {
   3355             test->errln("break forward test failed: expected %d but got %d",
   3356                         expected[count], i);
   3357             break;
   3358         }
   3359         count ++;
   3360     }
   3361     if (count != expectedcount) {
   3362         printStringBreaks(ustr, expected, expectedcount);
   3363         test->errln("break forward test failed: missed %d match",
   3364                     expectedcount - count);
   3365         return;
   3366     }
   3367     // testing boundaries
   3368     for (i = 1; i < expectedcount; i ++) {
   3369         int j = expected[i - 1];
   3370         if (!bi->isBoundary(j)) {
   3371             printStringBreaks(ustr, expected, expectedcount);
   3372             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3373             return;
   3374         }
   3375         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3376             if (bi->isBoundary(j)) {
   3377                 printStringBreaks(ustr, expected, expectedcount);
   3378                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3379                 return;
   3380             }
   3381         }
   3382     }
   3383 
   3384     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3385         count --;
   3386         if (forward[count] != i) {
   3387             printStringBreaks(ustr, expected, expectedcount);
   3388             test->errln("happy break test previous() failed: expected %d but got %d",
   3389                         forward[count], i);
   3390             break;
   3391         }
   3392     }
   3393     if (count != 0) {
   3394         printStringBreaks(ustr, expected, expectedcount);
   3395         test->errln("break test previous() failed: missed a match");
   3396         return;
   3397     }
   3398 
   3399     // testing preceding
   3400     for (i = 0; i < expectedcount - 1; i ++) {
   3401         // int j = expected[i] + 1;
   3402         int j = ustr.moveIndex32(expected[i], 1);
   3403         for (; j <= expected[i + 1]; j ++) {
   3404             if (bi->preceding(j) != expected[i]) {
   3405                 printStringBreaks(ustr, expected, expectedcount);
   3406                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3407                 return;
   3408             }
   3409         }
   3410     }
   3411 }
   3412 #endif
   3413 
   3414 void RBBITest::TestWordBreaks(void)
   3415 {
   3416 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3417 
   3418     Locale        locale("en");
   3419     UErrorCode    status = U_ZERO_ERROR;
   3420     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3421     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3422     // Replaced any C+J characters in a row with a random sequence of characters
   3423     // of the same length to make our C+J segmentation not get in the way.
   3424     static const char *strlist[] =
   3425     {
   3426     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3427     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3428     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3429     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3430     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3431     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3432     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3433     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3434     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3435     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3436     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3437     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3438     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3439     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3440     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3441     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3442     "\\u0027\\u11af\\U000e0057\\u0602",
   3443     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3444     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3445     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3446     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3447     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3448     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3449     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3450     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3451     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3452     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3453     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3454     "\\ua183\\u102d\\u0bec\\u003a",
   3455     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3456     "\\u003a\\u0e57\\u0fad\\u002e",
   3457     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3458     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3459     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3460     "\\u003a\\u0664\\u00b7\\u1fba",
   3461     "\\u003b\\u0027\\u00b7\\u47a3",
   3462     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3463     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3464     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3465     };
   3466     int loop;
   3467     if (U_FAILURE(status)) {
   3468         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3469         return;
   3470     }
   3471     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3472         // printf("looping %d\n", loop);
   3473         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3474         // RBBICharMonkey monkey;
   3475         RBBIWordMonkey monkey;
   3476 
   3477         int expected[50];
   3478         int expectedcount = 0;
   3479 
   3480         monkey.setText(ustr);
   3481         int i;
   3482         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3483             expected[expectedcount ++] = i;
   3484         }
   3485 
   3486         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3487     }
   3488     delete bi;
   3489 #endif
   3490 }
   3491 
   3492 void RBBITest::TestWordBoundary(void)
   3493 {
   3494     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3495     Locale        locale("en");
   3496     UErrorCode    status = U_ZERO_ERROR;
   3497     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3498     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3499     UChar         str[50];
   3500     static const char *strlist[] =
   3501     {
   3502     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3503     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3504     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3505     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3506     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3507     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3508     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3509     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3510     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3511     "\\u0027\\u11af\\U000e0057\\u0602",
   3512     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3513     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3514     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3515     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3516     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3517     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3518     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3519     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3520     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3521     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3522     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3523     "\\ua183\\u102d\\u0bec\\u003a",
   3524     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3525     "\\u003a\\u0e57\\u0fad\\u002e",
   3526     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3527     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3528     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3529     "\\u003a\\u0664\\u00b7\\u1fba",
   3530     "\\u003b\\u0027\\u00b7\\u47a3",
   3531     };
   3532     int loop;
   3533     if (U_FAILURE(status)) {
   3534         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3535         return;
   3536     }
   3537     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3538         // printf("looping %d\n", loop);
   3539         u_unescape(strlist[loop], str, 20);
   3540         UnicodeString ustr(str);
   3541         int forward[50];
   3542         int count = 0;
   3543 
   3544         bi->setText(ustr);
   3545         int prev = 0;
   3546         int i;
   3547         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3548             forward[count ++] = i;
   3549             if (i > prev) {
   3550                 int j;
   3551                 for (j = prev + 1; j < i; j ++) {
   3552                     if (bi->isBoundary(j)) {
   3553                         printStringBreaks(ustr, forward, count);
   3554                         errln("happy boundary test failed: expected %d not a boundary",
   3555                                j);
   3556                         return;
   3557                     }
   3558                 }
   3559             }
   3560             if (!bi->isBoundary(i)) {
   3561                 printStringBreaks(ustr, forward, count);
   3562                 errln("happy boundary test failed: expected %d a boundary",
   3563                        i);
   3564                 return;
   3565             }
   3566             prev = i;
   3567         }
   3568     }
   3569     delete bi;
   3570 }
   3571 
   3572 void RBBITest::TestLineBreaks(void)
   3573 {
   3574 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3575     Locale        locale("en");
   3576     UErrorCode    status = U_ZERO_ERROR;
   3577     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3578     const int32_t  STRSIZE = 50;
   3579     UChar         str[STRSIZE];
   3580     static const char *strlist[] =
   3581     {
   3582      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3583      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3584              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3585      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3586              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3587      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3588      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3589      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3590      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3591      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3592      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   3593      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3594      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3595      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3596      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3597      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3598      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3599      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3600      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3601      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3602      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   3603      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   3604      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   3605      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   3606      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   3607      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   3608      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   3609      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   3610      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   3611      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   3612      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   3613      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   3614      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   3615      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   3616      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   3617      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   3618      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   3619      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   3620      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   3621      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   3622      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   3623      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   3624          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   3625          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   3626          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   3627      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   3628          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   3629     };
   3630     int loop;
   3631     TEST_ASSERT_SUCCESS(status);
   3632     if (U_FAILURE(status)) {
   3633         return;
   3634     }
   3635     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3636         // printf("looping %d\n", loop);
   3637         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   3638         if (t >= STRSIZE) {
   3639             TEST_ASSERT(FALSE);
   3640             continue;
   3641         }
   3642 
   3643 
   3644         UnicodeString ustr(str);
   3645         RBBILineMonkey monkey;
   3646         if (U_FAILURE(monkey.deferredStatus)) {
   3647             continue;
   3648         }
   3649 
   3650         const int EXPECTEDSIZE = 50;
   3651         int expected[EXPECTEDSIZE];
   3652         int expectedcount = 0;
   3653 
   3654         monkey.setText(ustr);
   3655         int i;
   3656         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3657             if (expectedcount >= EXPECTEDSIZE) {
   3658                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3659                 return;
   3660             }
   3661             expected[expectedcount ++] = i;
   3662         }
   3663 
   3664         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3665     }
   3666     delete bi;
   3667 #endif
   3668 }
   3669 
   3670 void RBBITest::TestSentBreaks(void)
   3671 {
   3672 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3673     Locale        locale("en");
   3674     UErrorCode    status = U_ZERO_ERROR;
   3675     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   3676     UChar         str[200];
   3677     static const char *strlist[] =
   3678     {
   3679      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   3680      "This\n",
   3681      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   3682      "\"Sentence ending with a quote.\" Bye.",
   3683      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   3684      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   3685      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   3686      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   3687      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   3688      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   3689      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   3690              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   3691              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   3692              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   3693      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   3694              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   3695              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   3696              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   3697              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   3698              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   3699     };
   3700     int loop;
   3701     if (U_FAILURE(status)) {
   3702         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3703         return;
   3704     }
   3705     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3706         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3707         UnicodeString ustr(str);
   3708 
   3709         RBBISentMonkey monkey;
   3710         if (U_FAILURE(monkey.deferredStatus)) {
   3711             continue;
   3712         }
   3713 
   3714         const int EXPECTEDSIZE = 50;
   3715         int expected[EXPECTEDSIZE];
   3716         int expectedcount = 0;
   3717 
   3718         monkey.setText(ustr);
   3719         int i;
   3720         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3721             if (expectedcount >= EXPECTEDSIZE) {
   3722                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3723                 return;
   3724             }
   3725             expected[expectedcount ++] = i;
   3726         }
   3727 
   3728         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3729     }
   3730     delete bi;
   3731 #endif
   3732 }
   3733 
   3734 void RBBITest::TestMonkey(char *params) {
   3735 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3736 
   3737     UErrorCode     status    = U_ZERO_ERROR;
   3738     int32_t        loopCount = 500;
   3739     int32_t        seed      = 1;
   3740     UnicodeString  breakType = "all";
   3741     Locale         locale("en");
   3742     UBool          useUText  = FALSE;
   3743 
   3744     if (quick == FALSE) {
   3745         loopCount = 10000;
   3746     }
   3747 
   3748     if (params) {
   3749         UnicodeString p(params);
   3750         loopCount = getIntParam("loop", p, loopCount);
   3751         seed      = getIntParam("seed", p, seed);
   3752 
   3753         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   3754         if (m.find()) {
   3755             breakType = m.group(1, status);
   3756             m.reset();
   3757             p = m.replaceFirst("", status);
   3758         }
   3759 
   3760         RegexMatcher u(" *utext", p, 0, status);
   3761         if (u.find()) {
   3762             useUText = TRUE;
   3763             u.reset();
   3764             p = u.replaceFirst("", status);
   3765         }
   3766 
   3767 
   3768         // m.reset(p);
   3769         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   3770             // Each option is stripped out of the option string as it is processed.
   3771             // All options have been checked.  The option string should have been completely emptied..
   3772             char buf[100];
   3773             p.extract(buf, sizeof(buf), NULL, status);
   3774             buf[sizeof(buf)-1] = 0;
   3775             errln("Unrecognized or extra parameter:  %s\n", buf);
   3776             return;
   3777         }
   3778 
   3779     }
   3780 
   3781     if (breakType == "char" || breakType == "all") {
   3782         RBBICharMonkey  m;
   3783         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3784         if (U_SUCCESS(status)) {
   3785             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   3786             if (breakType == "all" && useUText==FALSE) {
   3787                 // Also run a quick test with UText when "all" is specified
   3788                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   3789             }
   3790         }
   3791         else {
   3792             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   3793         }
   3794         delete bi;
   3795     }
   3796 
   3797     if (breakType == "word" || breakType == "all") {
   3798         logln("Word Break Monkey Test");
   3799         RBBIWordMonkey  m;
   3800         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   3801         if (U_SUCCESS(status)) {
   3802             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   3803         }
   3804         else {
   3805             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   3806         }
   3807         delete bi;
   3808     }
   3809 
   3810     if (breakType == "line" || breakType == "all") {
   3811         logln("Line Break Monkey Test");
   3812         RBBILineMonkey  m;
   3813         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   3814         if (loopCount >= 10) {
   3815             loopCount = loopCount / 5;   // Line break runs slower than the others.
   3816         }
   3817         if (U_SUCCESS(status)) {
   3818             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   3819         }
   3820         else {
   3821             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3822         }
   3823         delete bi;
   3824     }
   3825 
   3826     if (breakType == "sent" || breakType == "all"  ) {
   3827         logln("Sentence Break Monkey Test");
   3828         RBBISentMonkey  m;
   3829         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   3830         if (loopCount >= 10) {
   3831             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   3832         }
   3833         if (U_SUCCESS(status)) {
   3834             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   3835         }
   3836         else {
   3837             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3838         }
   3839         delete bi;
   3840     }
   3841 
   3842 #endif
   3843 }
   3844 
   3845 //
   3846 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   3847 //    Parameters:
   3848 //       bi      - the break iterator to use
   3849 //       mk      - MonkeyKind, abstraction for obtaining expected results
   3850 //       name    - Name of test (char, word, etc.) for use in error messages
   3851 //       seed    - Seed for starting random number generator (parameter from user)
   3852 //       numIterations
   3853 //
   3854 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   3855                          int32_t numIterations, UBool useUText) {
   3856 
   3857 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3858 
   3859     const int32_t    TESTSTRINGLEN = 500;
   3860     UnicodeString    testText;
   3861     int32_t          numCharClasses;
   3862     UVector          *chClasses;
   3863     int              expected[TESTSTRINGLEN*2 + 1];
   3864     int              expectedCount = 0;
   3865     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   3866     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   3867     char             reverseBreaks[TESTSTRINGLEN*2+1];
   3868     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   3869     char             followingBreaks[TESTSTRINGLEN*2+1];
   3870     char             precedingBreaks[TESTSTRINGLEN*2+1];
   3871     int              i;
   3872     int              loopCount = 0;
   3873 
   3874     m_seed = seed;
   3875 
   3876     numCharClasses = mk.charClasses()->size();
   3877     chClasses      = mk.charClasses();
   3878 
   3879     // Check for errors that occured during the construction of the MonkeyKind object.
   3880     //  Can't report them where they occured because errln() is a method coming from intlTest,
   3881     //  and is not visible outside of RBBITest :-(
   3882     if (U_FAILURE(mk.deferredStatus)) {
   3883         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   3884         return;
   3885     }
   3886 
   3887     // Verify that the character classes all have at least one member.
   3888     for (i=0; i<numCharClasses; i++) {
   3889         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   3890         if (s == NULL || s->size() == 0) {
   3891             errln("Character Class #%d is null or of zero size.", i);
   3892             return;
   3893         }
   3894     }
   3895 
   3896     while (loopCount < numIterations || numIterations == -1) {
   3897         if (numIterations == -1 && loopCount % 10 == 0) {
   3898             // If test is running in an infinite loop, display a periodic tic so
   3899             //   we can tell that it is making progress.
   3900             fprintf(stderr, ".");
   3901         }
   3902         // Save current random number seed, so that we can recreate the random numbers
   3903         //   for this loop iteration in event of an error.
   3904         seed = m_seed;
   3905 
   3906         // Populate a test string with data.
   3907         testText.truncate(0);
   3908         for (i=0; i<TESTSTRINGLEN; i++) {
   3909             int32_t  aClassNum = m_rand() % numCharClasses;
   3910             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   3911             int32_t   charIdx = m_rand() % classSet->size();
   3912             UChar32   c = classSet->charAt(charIdx);
   3913             if (c < 0) {   // TODO:  deal with sets containing strings.
   3914                 errln("c < 0");
   3915                 break;
   3916             }
   3917             testText.append(c);
   3918         }
   3919 
   3920         // Calculate the expected results for this test string.
   3921         mk.setText(testText);
   3922         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   3923         expectedBreaks[0] = 1;
   3924         int32_t breakPos = 0;
   3925         expectedCount = 0;
   3926         for (;;) {
   3927             breakPos = mk.next(breakPos);
   3928             if (breakPos == -1) {
   3929                 break;
   3930             }
   3931             if (breakPos > testText.length()) {
   3932                 errln("breakPos > testText.length()");
   3933             }
   3934             expectedBreaks[breakPos] = 1;
   3935             U_ASSERT(expectedCount<testText.length());
   3936             expected[expectedCount ++] = breakPos;
   3937         }
   3938 
   3939         // Find the break positions using forward iteration
   3940         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   3941         if (useUText) {
   3942             UErrorCode status = U_ZERO_ERROR;
   3943             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   3944             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   3945             bi->setText(testUText, status);
   3946             TEST_ASSERT_SUCCESS(status);
   3947             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   3948                                       //  This UText can be closed immediately, so long as the
   3949                                       //  testText string continues to exist.
   3950         } else {
   3951             bi->setText(testText);
   3952         }
   3953 
   3954         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   3955             if (i < 0 || i > testText.length()) {
   3956                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   3957                 break;
   3958             }
   3959             forwardBreaks[i] = 1;
   3960         }
   3961 
   3962         // Find the break positions using reverse iteration
   3963         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   3964         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   3965             if (i < 0 || i > testText.length()) {
   3966                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   3967                 break;
   3968             }
   3969             reverseBreaks[i] = 1;
   3970         }
   3971 
   3972         // Find the break positions using isBoundary() tests.
   3973         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   3974         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   3975         for (i=0; i<=testText.length(); i++) {
   3976             isBoundaryBreaks[i] = bi->isBoundary(i);
   3977         }
   3978 
   3979 
   3980         // Find the break positions using the following() function.
   3981         // printf(".");
   3982         memset(followingBreaks, 0, sizeof(followingBreaks));
   3983         int32_t   lastBreakPos = 0;
   3984         followingBreaks[0] = 1;
   3985         for (i=0; i<testText.length(); i++) {
   3986             breakPos = bi->following(i);
   3987             if (breakPos <= i ||
   3988                 breakPos < lastBreakPos ||
   3989                 breakPos > testText.length() ||
   3990                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   3991                 errln("%s break monkey test: "
   3992                     "Out of range value returned by BreakIterator::following().\n"
   3993                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   3994                          name, seed, i, breakPos, lastBreakPos);
   3995                 break;
   3996             }
   3997             followingBreaks[breakPos] = 1;
   3998             lastBreakPos = breakPos;
   3999         }
   4000 
   4001         // Find the break positions using the preceding() function.
   4002         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4003         lastBreakPos = testText.length();
   4004         precedingBreaks[testText.length()] = 1;
   4005         for (i=testText.length(); i>0; i--) {
   4006             breakPos = bi->preceding(i);
   4007             if (breakPos >= i ||
   4008                 breakPos > lastBreakPos ||
   4009                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4010                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4011                 errln("%s break monkey test: "
   4012                     "Out of range value returned by BreakIterator::preceding().\n"
   4013                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4014                     name,  i, breakPos, lastBreakPos);
   4015                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4016                     precedingBreaks[i] = 2;   // Forces an error.
   4017                 }
   4018             } else {
   4019                 if (breakPos >= 0) {
   4020                     precedingBreaks[breakPos] = 1;
   4021                 }
   4022                 lastBreakPos = breakPos;
   4023             }
   4024         }
   4025 
   4026         // Compare the expected and actual results.
   4027         for (i=0; i<=testText.length(); i++) {
   4028             const char *errorType = NULL;
   4029             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4030                 errorType = "next()";
   4031             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4032                 errorType = "previous()";
   4033             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4034                 errorType = "isBoundary()";
   4035             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4036                 errorType = "following()";
   4037             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4038                 errorType = "preceding()";
   4039             }
   4040 
   4041 
   4042             if (errorType != NULL) {
   4043                 // Format a range of the test text that includes the failure as
   4044                 //  a data item that can be included in the rbbi test data file.
   4045 
   4046                 // Start of the range is the last point where expected and actual results
   4047                 //   both agreed that there was a break position.
   4048                 int startContext = i;
   4049                 int32_t count = 0;
   4050                 for (;;) {
   4051                     if (startContext==0) { break; }
   4052                     startContext --;
   4053                     if (expectedBreaks[startContext] != 0) {
   4054                         if (count == 2) break;
   4055                         count ++;
   4056                     }
   4057                 }
   4058 
   4059                 // End of range is two expected breaks past the start position.
   4060                 int endContext = i + 1;
   4061                 int ci;
   4062                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4063                     for (;;) {
   4064                         if (endContext >= testText.length()) {break;}
   4065                         if (expectedBreaks[endContext-1] != 0) {
   4066                             if (count == 0) break;
   4067                             count --;
   4068                         }
   4069                         endContext ++;
   4070                     }
   4071                 }
   4072 
   4073                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4074                 UnicodeString errorText = "<data>";
   4075                 /***if (strcmp(errorType, "next()") == 0) {
   4076                     startContext = 0;
   4077                     endContext = testText.length();
   4078 
   4079                     printStringBreaks(testText, expected, expectedCount);
   4080                 }***/
   4081 
   4082                 for (ci=startContext; ci<endContext;) {
   4083                     UnicodeString hexChars("0123456789abcdef");
   4084                     UChar32  c;
   4085                     int      bn;
   4086                     c = testText.char32At(ci);
   4087                     if (ci == i) {
   4088                         // This is the location of the error.
   4089                         errorText.append("<?>");
   4090                     } else if (expectedBreaks[ci] != 0) {
   4091                         // This a non-error expected break position.
   4092                         errorText.append("\\");
   4093                     }
   4094                     if (c < 0x10000) {
   4095                         errorText.append("\\u");
   4096                         for (bn=12; bn>=0; bn-=4) {
   4097                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4098                         }
   4099                     } else {
   4100                         errorText.append("\\U");
   4101                         for (bn=28; bn>=0; bn-=4) {
   4102                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4103                         }
   4104                     }
   4105                     ci = testText.moveIndex32(ci, 1);
   4106                 }
   4107                 errorText.append("\\");
   4108                 errorText.append("</data>\n");
   4109 
   4110                 // Output the error
   4111                 char  charErrorTxt[500];
   4112                 UErrorCode status = U_ZERO_ERROR;
   4113                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4114                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4115                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4116 
   4117                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4118                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4119                     errorType, seed, i, charErrorTxt);
   4120                 break;
   4121             }
   4122         }
   4123 
   4124         loopCount++;
   4125     }
   4126 #endif
   4127 }
   4128 
   4129 
   4130 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4131 //             This test checks the initial patch,
   4132 //             which is to just keep it from crashing.  Correct word boundaries
   4133 //             await a proper fix to the dictionary code.
   4134 //
   4135 void RBBITest::TestBug5532(void)  {
   4136    // Text includes a mixture of Thai and Latin.
   4137    const unsigned char utf8Data[] = {
   4138            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4139            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4140            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4141            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4142            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4143            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4144            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4145            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4146            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4147            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4148            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4149 
   4150     UErrorCode status = U_ZERO_ERROR;
   4151     UText utext=UTEXT_INITIALIZER;
   4152     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4153     TEST_ASSERT_SUCCESS(status);
   4154 
   4155     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4156     TEST_ASSERT_SUCCESS(status);
   4157     if (U_SUCCESS(status)) {
   4158         bi->setText(&utext, status);
   4159         TEST_ASSERT_SUCCESS(status);
   4160 
   4161         int32_t breakCount = 0;
   4162         int32_t previousBreak = -1;
   4163         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4164             // For now, just make sure that the break iterator doesn't hang.
   4165             TEST_ASSERT(previousBreak < bi->current());
   4166             previousBreak = bi->current();
   4167         }
   4168         TEST_ASSERT(breakCount > 0);
   4169     }
   4170     delete bi;
   4171     utext_close(&utext);
   4172 }
   4173 
   4174 
//
//  TestDebug    -  A place-holder test for debugging purposes.
//                  For putting in fragments of other tests that can be invoked
//                  for tracing  without a lot of unwanted extra stuff happening.
//
//                  The whole body is compiled out with #if 0, so as it stands
//                  this test does nothing.  The disabled fragment sets an
//                  unescaped string on a sentence break iterator and prints the
//                  result of isBoundary(8); the code after its early return
//                  (a backwards walk printing each position) is never reached.
//
void RBBITest::TestDebug(void) {
#if 0
    UErrorCode   status = U_ZERO_ERROR;
    int pos = 0;
    int ruleStatus = 0;

    RuleBasedBreakIterator* bi =
       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    // UnicodeString s("Aaa.  Bcd");
    s = s.unescape();
    bi->setText(s);
    UBool r = bi->isBoundary(8);
    printf("%s", r?"true":"false");
    return;
    pos = bi->last();
    do {
        // ruleStatus = bi->getRuleStatus();
        printf("%d\t%d\n", pos, ruleStatus);
        pos = bi->previous();
    } while (pos != BreakIterator::DONE);
#endif
}
   4205 
   4206 void RBBITest::TestProperties() {
   4207     UErrorCode errorCode = U_ZERO_ERROR;
   4208     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4209     if (!prependSet.isEmpty()) {
   4210         errln(
   4211             "[:GCB=Prepend:] is not empty any more. "
   4212             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4213             "change this test to the opposite condition.");
   4214     }
   4215 }
   4216 
   4217 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4218