Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2013, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 #include "unicode/regex.h"
     28 #endif
     29 #include "unicode/ustring.h"
     30 #include "unicode/utext.h"
     31 #include "intltest.h"
     32 #include "rbbitst.h"
     33 #include <string.h>
     34 #include "uvector.h"
     35 #include "uvectr32.h"
     36 #include <string.h>
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include "unicode/numfmt.h"
     40 #include "unicode/uscript.h"
     41 
     42 #define TEST_ASSERT(x) {if (!(x)) { \
     43     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     44 
     45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     46     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     47 
     48 
     49 //---------------------------------------------
     50 // runIndexedTest
     51 //---------------------------------------------
     52 
     53 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
     59 
     60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     61 {
     62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     63 
     64     switch (index) {
     65 #if !UCONFIG_NO_FILE_IO
     66         case 0: name = "TestBug4153072";
     67             if(exec) TestBug4153072();                         break;
     68 #else
     69         case 0: name = "skip";
     70             break;
     71 #endif
     72 
     73         case 1: name = "skip";
     74             break;
     75         case 2: name = "TestStatusReturn";
     76             if(exec) TestStatusReturn();                       break;
     77 
     78 #if !UCONFIG_NO_FILE_IO
     79         case 3: name = "TestUnicodeFiles";
     80             if(exec) TestUnicodeFiles();                       break;
     81         case 4: name = "TestEmptyString";
     82             if(exec) TestEmptyString();                        break;
     83 #else
     84         case 3: case 4: name = "skip";
     85             break;
     86 #endif
     87 
     88         case 5: name = "TestGetAvailableLocales";
     89             if(exec) TestGetAvailableLocales();                break;
     90 
     91         case 6: name = "TestGetDisplayName";
     92             if(exec) TestGetDisplayName();                     break;
     93 
     94 #if !UCONFIG_NO_FILE_IO
     95         case 7: name = "TestEndBehaviour";
     96             if(exec) TestEndBehaviour();                       break;
     97         case 8: case 9: case 10: name = "skip";
     98              break;
     99         case 11: name = "TestWordBreaks";
    100              if(exec) TestWordBreaks();                        break;
    101         case 12: name = "TestWordBoundary";
    102              if(exec) TestWordBoundary();                      break;
    103         case 13: name = "TestLineBreaks";
    104              if(exec) TestLineBreaks();                        break;
    105         case 14: name = "TestSentBreaks";
    106              if(exec) TestSentBreaks();                        break;
    107         case 15: name = "TestExtended";
    108              if(exec) TestExtended();                          break;
    109 #else
    110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    111              break;
    112 #endif
    113 
    114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    115         case 16:
    116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
    117 #else
    118         case 16:
    119              name = "skip";                                    break;
    120 #endif
    121 
    122 #if !UCONFIG_NO_FILE_IO
    123         case 17: name = "TestBug3818";
    124             if(exec) TestBug3818();                            break;
    125 #else
    126         case 17: name = "skip";
    127             break;
    128 #endif
    129 
    130         case 18: name = "skip";
    131             break;
    132         case 19: name = "TestDebug";
    133             if(exec) TestDebug();                              break;
    134         case 20: name = "skip";
    135             break;
    136 
    137 #if !UCONFIG_NO_FILE_IO
    138         case 21: name = "TestBug5775";
    139             if (exec) TestBug5775();                           break;
    140 #else
    141         case 21: name = "skip";
    142             break;
    143 #endif
    144 
    145         case 22: name = "TestBug9983";
    146             if (exec) TestBug9983();                           break;
    147         case 23: name = "TestDictRules";
    148             if (exec) TestDictRules();                         break;
    149         case 24: name = "TestBug5532";
    150             if (exec) TestBug5532();                           break;
    151         default: name = ""; break; //needed to end loop
    152     }
    153 }
    154 
    155 
    156 //---------------------------------------------------------------------------
    157 //
    158 //   class BITestData   Holds a set of Break iterator test data and results
    159 //                      Includes
    160 //                         - the string data to be broken
    161 //                         - a vector of the expected break positions.
    162 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
    164 //                         - The expected break tag values.
    165 //                         - Vectors of actual break positions and tag values.
    166 //                         - Functions for comparing actual with expected and
    167 //                            reporting errors.
    168 //
    169 //----------------------------------------------------------------------------
    170 class BITestData {
    171 public:
    172     UnicodeString    fDataToBreak;
    173     UVector          fExpectedBreakPositions;
    174     UVector          fExpectedTags;
    175     UVector          fLineNum;
    176     UVector          fActualBreakPositions;   // Test Results.
    177     UVector          fActualTags;
    178 
    179     BITestData(UErrorCode &status);
    180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    181     void             checkResults(const char *heading, RBBITest *test);
    182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    183     void             clearResults();
    184 };
    185 
    186 //
    187 // Constructor.
    188 //
    189 BITestData::BITestData(UErrorCode &status)
    190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    191   fActualTags(status)
    192 {
    193 }
    194 
    195 //
    196 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    197 //                 The macro form collects the line number, which is helpful
    198 //                 when tracking down failures.
    199 //
    200 //                 A null data item is inserted at the start of each test's data
    201 //                  to put the starting zero into the data list.  The position saved for
    202 //                  each non-null item is its ending position.
    203 //
    204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    206     if (U_FAILURE(status)) {return;}
    207     if (data != NULL) {
    208         fDataToBreak.append(CharsToUnicodeString(data));
    209     }
    210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    211     fExpectedTags.addElement(tag, status);
    212     fLineNum.addElement(lineNum, status);
    213 }
    214 
    215 
    216 //
    217 //  checkResults.   Compare the actual and expected break positions, report any differences.
    218 //
    219 void BITestData::checkResults(const char *heading, RBBITest *test) {
    220     int32_t   expectedIndex = 0;
    221     int32_t   actualIndex = 0;
    222 
    223     for (;;) {
    224         // If we've run through both the expected and actual results vectors, we're done.
    225         //   break out of the loop.
    226         if (expectedIndex >= fExpectedBreakPositions.size() &&
    227             actualIndex   >= fActualBreakPositions.size()) {
    228             break;
    229         }
    230 
    231 
    232         if (expectedIndex >= fExpectedBreakPositions.size()) {
    233             err(heading, test, expectedIndex-1, actualIndex);
    234             actualIndex++;
    235             continue;
    236         }
    237 
    238         if (actualIndex >= fActualBreakPositions.size()) {
    239             err(heading, test, expectedIndex, actualIndex-1);
    240             expectedIndex++;
    241             continue;
    242         }
    243 
    244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    245             err(heading, test, expectedIndex, actualIndex);
    246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    248                 actualIndex++;
    249             } else {
    250                 expectedIndex++;
    251             }
    252             continue;
    253         }
    254 
    255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    257                 heading, fLineNum.elementAt(expectedIndex),
    258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    259         }
    260 
    261         actualIndex++;
    262         expectedIndex++;
    263     }
    264 }
    265 
    266 //
    267 //  err   -  An error was found.  Report it, along with information about where the
    268 //                                incorrectly broken test data appeared in the source file.
    269 //
    270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    271 {
    272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    274     int32_t   o        = 0;
    275     int32_t   line     = fLineNum.elementAti(expectedIdx);
    276     if (expectedIdx > 0) {
    277         // The line numbers are off by one because a premature break occurs somewhere
    278         //    within the previous item, rather than at the start of the current (expected) item.
    279         //    We want to report the offset of the unexpected break from the start of
    280         //      this previous item.
    281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    282     }
    283     if (actual < expected) {
    284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    285     } else {
    286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    287     }
    288 }
    289 
    290 
    291 void BITestData::clearResults() {
    292     fActualBreakPositions.removeAllElements();
    293     fActualTags.removeAllElements();
    294 }
    295 
    296 
    297 //--------------------------------------------------------------------------------------
    298 //
    299 //    RBBITest    constructor and destructor
    300 //
    301 //--------------------------------------------------------------------------------------
    302 
    303 RBBITest::RBBITest() {
    304 }
    305 
    306 
    307 RBBITest::~RBBITest() {
    308 }
    309 
    310 //-----------------------------------------------------------------------------------
    311 //
    312 //   Test for status {tag} return value from break rules.
    313 //        TODO:  a more thorough test.
    314 //
    315 //-----------------------------------------------------------------------------------
    316 void RBBITest::TestStatusReturn() {
    317      UnicodeString rulesString1("$Letters = [:L:];\n"
    318                                   "$Numbers = [:N:];\n"
    319                                   "$Letters+{1};\n"
    320                                   "$Numbers+{2};\n"
    321                                   "Help\\ {4}/me\\!;\n"
    322                                   "[^$Letters $Numbers];\n"
    323                                   "!.*;\n", -1, US_INV);
    324      UnicodeString testString1  = "abc123..abc Help me Help me!";
    325                                 // 01234567890123456789012345678
    326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    328 
    329      UErrorCode status=U_ZERO_ERROR;
    330      UParseError    parseError;
    331 
    332      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    333      if(U_FAILURE(status)) {
    334          dataerrln("FAIL : in construction - %s", u_errorName(status));
    335      } else {
    336          int32_t  pos;
    337          int32_t  i = 0;
    338          bi->setText(testString1);
    339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    340              if (pos != bounds1[i]) {
    341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    342                  break;
    343              }
    344 
    345              int tag = bi->getRuleStatus();
    346              if (tag != brkStatus[i]) {
    347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    348                  break;
    349              }
    350              i++;
    351          }
    352      }
    353      delete bi;
    354 }
    355 
    356 
    357 static void printStringBreaks(UnicodeString ustr, int expected[],
    358                               int expectedcount)
    359 {
    360     UErrorCode status = U_ZERO_ERROR;
    361     char name[100];
    362     printf("code    alpha extend alphanum type word sent line name\n");
    363     int j;
    364     for (j = 0; j < ustr.length(); j ++) {
    365         if (expectedcount > 0) {
    366             int k;
    367             for (k = 0; k < expectedcount; k ++) {
    368                 if (j == expected[k]) {
    369                     printf("------------------------------------------------ %d\n",
    370                            j);
    371                 }
    372             }
    373         }
    374         UChar32 c = ustr.char32At(j);
    375         if (c > 0xffff) {
    376             j ++;
    377         }
    378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    380                            u_isUAlphabetic(c),
    381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    382                            u_isalnum(c),
    383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    384                                                   u_charType(c),
    385                                                   U_SHORT_PROPERTY_NAME),
    386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    387                                                   u_getIntPropertyValue(c,
    388                                                           UCHAR_WORD_BREAK),
    389                                                   U_SHORT_PROPERTY_NAME),
    390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    391                                    u_getIntPropertyValue(c,
    392                                            UCHAR_SENTENCE_BREAK),
    393                                    U_SHORT_PROPERTY_NAME),
    394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    395                                    u_getIntPropertyValue(c,
    396                                            UCHAR_LINE_BREAK),
    397                                    U_SHORT_PROPERTY_NAME),
    398                            name);
    399     }
    400 }
    401 
    402 
    403 void RBBITest::TestBug3818() {
    404     UErrorCode  status = U_ZERO_ERROR;
    405 
    406     // Four Thai words...
    407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    409     UnicodeString  thaiStr(thaiWordData);
    410 
    411     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
    412     if (U_FAILURE(status) || bi == NULL) {
    413         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    414         return;
    415     }
    416     bi->setText(thaiStr);
    417 
    418     int32_t  startOfSecondWord = bi->following(1);
    419     if (startOfSecondWord != 4) {
    420         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    421             __FILE__, __LINE__, startOfSecondWord);
    422     }
    423     startOfSecondWord = bi->following(0);
    424     if (startOfSecondWord != 4) {
    425         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    426             __FILE__, __LINE__, startOfSecondWord);
    427     }
    428     delete bi;
    429 }
    430 
    431 //----------------------------------------------------------------------------
    432 //
    433 // generalIteratorTest      Given a break iterator and a set of test data,
    434 //                          Run the tests and report the results.
    435 //
    436 //----------------------------------------------------------------------------
    437 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    438 {
    439 
    440     bi.setText(td.fDataToBreak);
    441 
    442     testFirstAndNext(bi, td);
    443 
    444     testLastAndPrevious(bi, td);
    445 
    446     testFollowing(bi, td);
    447     testPreceding(bi, td);
    448     testIsBoundary(bi, td);
    449     doMultipleSelectionTest(bi, td);
    450 }
    451 
    452 
    453 //
    454 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    455 //                       kind of loop.
    456 //
    457 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    458 {
    459     UErrorCode  status = U_ZERO_ERROR;
    460     int32_t     p;
    461     int32_t     lastP = -1;
    462     int32_t     tag;
    463 
    464     logln("Test first and next");
    465     bi.setText(td.fDataToBreak);
    466     td.clearResults();
    467 
    468     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    469         td.fActualBreakPositions.addElement(p, status);  // Save result.
    470         tag = bi.getRuleStatus();
    471         td.fActualTags.addElement(tag, status);
    472         if (p <= lastP) {
    473             // If the iterator is not making forward progress, stop.
    474             //  No need to raise an error here, it'll be detected in the normal check of results.
    475             break;
    476         }
    477         lastP = p;
    478     }
    479     td.checkResults("testFirstAndNext", this);
    480 }
    481 
    482 
    483 //
    484 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    485 //
    486 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    487 {
    488     UErrorCode  status = U_ZERO_ERROR;
    489     int32_t     p;
    490     int32_t     lastP  = 0x7ffffffe;
    491     int32_t     tag;
    492 
    493     logln("Test last and previous");
    494     bi.setText(td.fDataToBreak);
    495     td.clearResults();
    496 
    497     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    498         // Save break position.  Insert it at start of vector of results, shoving
    499         //    already-saved results further towards the end.
    500         td.fActualBreakPositions.insertElementAt(p, 0, status);
    501         // bi.previous();   // TODO:  Why does this fix things up????
    502         // bi.next();
    503         tag = bi.getRuleStatus();
    504         td.fActualTags.insertElementAt(tag, 0, status);
    505         if (p >= lastP) {
    506             // If the iterator is not making progress, stop.
    507             //  No need to raise an error here, it'll be detected in the normal check of results.
    508             break;
    509         }
    510         lastP = p;
    511     }
    512     td.checkResults("testLastAndPrevious", this);
    513 }
    514 
    515 
    516 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    517 {
    518     UErrorCode  status = U_ZERO_ERROR;
    519     int32_t     p;
    520     int32_t     tag;
    521     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    522                                  //   cannot be -1; that is returned for DONE.
    523     int         i;
    524 
    525     logln("testFollowing():");
    526     bi.setText(td.fDataToBreak);
    527     td.clearResults();
    528 
    529     // Save the starting point, since we won't get that out of following.
    530     p = bi.first();
    531     td.fActualBreakPositions.addElement(p, status);  // Save result.
    532     tag = bi.getRuleStatus();
    533     td.fActualTags.addElement(tag, status);
    534 
    535     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    536         p = bi.following(i);
    537         if (p != lastP) {
    538             if (p == RuleBasedBreakIterator::DONE) {
    539                 break;
    540             }
    541             // We've reached a new break position.  Save it.
    542             td.fActualBreakPositions.addElement(p, status);  // Save result.
    543             tag = bi.getRuleStatus();
    544             td.fActualTags.addElement(tag, status);
    545             lastP = p;
    546         }
    547     }
    548     // The loop normally exits by means of the break in the middle.
    549     // Make sure that the index was at the correct position for the break iterator to have
    550     //   returned DONE.
    551     if (i != td.fDataToBreak.length()) {
    552         errln("testFollowing():  iterator returned DONE prematurely.");
    553     }
    554 
    555     // Full check of all results.
    556     td.checkResults("testFollowing", this);
    557 }
    558 
    559 
    560 
    561 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    562     UErrorCode  status = U_ZERO_ERROR;
    563     int32_t     p;
    564     int32_t     tag;
    565     int32_t     lastP  = 0x7ffffffe;
    566     int         i;
    567 
    568     logln("testPreceding():");
    569     bi.setText(td.fDataToBreak);
    570     td.clearResults();
    571 
    572     p = bi.last();
    573     td.fActualBreakPositions.addElement(p, status);
    574     tag = bi.getRuleStatus();
    575     td.fActualTags.addElement(tag, status);
    576 
    577     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    578         p = bi.preceding(i);
    579         if (p != lastP) {
    580             if (p == RuleBasedBreakIterator::DONE) {
    581                 break;
    582             }
    583             // We've reached a new break position.  Save it.
    584             td.fActualBreakPositions.insertElementAt(p, 0, status);
    585             lastP = p;
    586             tag = bi.getRuleStatus();
    587             td.fActualTags.insertElementAt(tag, 0, status);
    588         }
    589     }
    590     // The loop normally exits by means of the break in the middle.
    591     // Make sure that the index was at the correct position for the break iterator to have
    592     //   returned DONE.
    593     if (i != 0) {
    594         errln("testPreceding():  iterator returned DONE prematurely.");
    595     }
    596 
    597     // Full check of all results.
    598     td.checkResults("testPreceding", this);
    599 }
    600 
    601 
    602 
    603 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    604     UErrorCode  status = U_ZERO_ERROR;
    605     int         i;
    606     int32_t     tag;
    607 
    608     logln("testIsBoundary():");
    609     bi.setText(td.fDataToBreak);
    610     td.clearResults();
    611 
    612     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    613         if (bi.isBoundary(i)) {
    614             td.fActualBreakPositions.addElement(i, status);  // Save result.
    615             tag = bi.getRuleStatus();
    616             td.fActualTags.addElement(tag, status);
    617         }
    618     }
    619     td.checkResults("testIsBoundary: ", this);
    620 }
    621 
    622 
    623 
    624 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    625 {
    626     iterator.setText(td.fDataToBreak);
    627 
    628     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    629     int32_t offset = iterator.first();
    630     int32_t testOffset;
    631     int32_t count = 0;
    632 
    633     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    634 
    635     if (*testIterator != iterator)
    636         errln("clone() or operator!= failed: two clones compared unequal");
    637 
    638     do {
    639         testOffset = testIterator->first();
    640         testOffset = testIterator->next(count);
    641         if (offset != testOffset)
    642             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    643 
    644         if (offset != RuleBasedBreakIterator::DONE) {
    645             count++;
    646             offset = iterator.next();
    647 
    648             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    649                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    650                 if (count > 10000 || offset == -1) {
    651                     errln("operator== failed too many times. Stopping test.");
    652                     if (offset == -1) {
    653                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    654                     }
    655                     return;
    656                 }
    657             }
    658         }
    659     } while (offset != RuleBasedBreakIterator::DONE);
    660 
    661     // now do it backwards...
    662     offset = iterator.last();
    663     count = 0;
    664 
    665     do {
    666         testOffset = testIterator->last();
    667         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    668         if (offset != testOffset)
    669             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    670 
    671         if (offset != RuleBasedBreakIterator::DONE) {
    672             count--;
    673             offset = iterator.previous();
    674         }
    675     } while (offset != RuleBasedBreakIterator::DONE);
    676 
    677     delete testIterator;
    678 }
    679 
    680 
    681 //---------------------------------------------
    682 //
    683 //     other tests
    684 //
    685 //---------------------------------------------
    686 void RBBITest::TestEmptyString()
    687 {
    688     UnicodeString text = "";
    689     UErrorCode status = U_ZERO_ERROR;
    690 
    691     BITestData x(status);
    692     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    693     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    694     if (U_FAILURE(status))
    695     {
    696         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    697         return;
    698     }
    699     generalIteratorTest(*bi, x);
    700     delete bi;
    701 }
    702 
    703 void RBBITest::TestGetAvailableLocales()
    704 {
    705     int32_t locCount = 0;
    706     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    707 
    708     if (locCount == 0)
    709         dataerrln("getAvailableLocales() returned an empty list!");
    710     // Just make sure that it's returning good memory.
    711     int32_t i;
    712     for (i = 0; i < locCount; ++i) {
    713         logln(locList[i].getName());
    714     }
    715 }
    716 
    717 //Testing the BreakIterator::getDisplayName() function
    718 void RBBITest::TestGetDisplayName()
    719 {
    720     UnicodeString   result;
    721 
    722     BreakIterator::getDisplayName(Locale::getUS(), result);
    723     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    724         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    725                 + result);
    726 
    727     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    728     if (result != "French (France)")
    729         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    730                 + result);
    731 }
    732 /**
    733  * Test End Behaviour
    734  * @bug 4068137
    735  */
    736 void RBBITest::TestEndBehaviour()
    737 {
    738     UErrorCode status = U_ZERO_ERROR;
    739     UnicodeString testString("boo.");
    740     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    741     if (U_FAILURE(status))
    742     {
    743         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    744         return;
    745     }
    746     wb->setText(testString);
    747 
    748     if (wb->first() != 0)
    749         errln("Didn't get break at beginning of string.");
    750     if (wb->next() != 3)
    751         errln("Didn't get break before period in \"boo.\"");
    752     if (wb->current() != 4 && wb->next() != 4)
    753         errln("Didn't get break at end of string.");
    754     delete wb;
    755 }
    756 /*
    757  * @bug 4153072
    758  */
    759 void RBBITest::TestBug4153072() {
    760     UErrorCode status = U_ZERO_ERROR;
    761     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    762     if (U_FAILURE(status))
    763     {
    764         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    765         return;
    766     }
    767     UnicodeString str("...Hello, World!...");
    768     int32_t begin = 3;
    769     int32_t end = str.length() - 3;
    770     UBool onBoundary;
    771 
    772     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    773     iter->adoptText(textIterator);
    774     int index;
    775     // Note: with the switch to UText, there is no way to restrict the
    776     //       iteration range to begin at an index other than zero.
    777     //       String character iterators created with a non-zero bound are
    778     //         treated by RBBI as being empty.
    779     for (index = -1; index < begin + 1; ++index) {
    780         onBoundary = iter->isBoundary(index);
    781         if (index == 0?  !onBoundary : onBoundary) {
    782             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    783                             " and begin index = " + begin);
    784         }
    785     }
    786     delete iter;
    787 }
    788 
    789 
    790 //
    791 // Test for problem reported by Ashok Matoria on 9 July 2007
    792 //    One.<kSoftHyphen><kSpace>Two.
    793 //
    794 //    Sentence break at start (0) and then on calling next() it breaks at
    795 //   'T' of "Two". Now, at this point if I do next() and
    796 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    797 //
    798 void RBBITest::TestBug5775() {
    799     UErrorCode status = U_ZERO_ERROR;
    800     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    801     TEST_ASSERT_SUCCESS(status);
    802     if (U_FAILURE(status)) {
    803         return;
    804     }
    805 // Check for status first for better handling of no data errors.
    806     TEST_ASSERT(bi != NULL);
    807     if (bi == NULL) {
    808         return;
    809     }
    810 
    811     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    812     //               01234      56789
    813     s = s.unescape();
    814     bi->setText(s);
    815     int pos = bi->next();
    816     TEST_ASSERT(pos == 6);
    817     pos = bi->next();
    818     TEST_ASSERT(pos == 10);
    819     pos = bi->previous();
    820     TEST_ASSERT(pos == 6);
    821     delete bi;
    822 }
    823 
    824 
    825 
    826 //------------------------------------------------------------------------------
    827 //
    828 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    829 //
    830 //------------------------------------------------------------------------------
    831 
    832 struct TestParams {                  // Inputs for one RBBITest::executeTest() run; filled in by RBBITest::TestExtended().
    833     BreakIterator   *bi;             // Iterator under test; executeTest() returns immediately when this is NULL.
    834     UnicodeString    dataToBreak;    // Text that executeTest() passes to bi->setText().
    835     UVector32       *expectedBreaks; // Indexed by text offset: 0 = no break expected, -1 = break with rule status 0, other = break with that rule status.
    836     UVector32       *srcLine;        // Indexed by text offset: test-file line number for that position; only used in error messages.
    837     UVector32       *srcCol;         // Indexed by text offset: test-file column for that position; only used in error messages.
    838 };
    839 
    840 void RBBITest::executeTest(TestParams *t) {
    841     int32_t    bp;
    842     int32_t    prevBP;
    843     int32_t    i;
    844 
    845     if (t->bi == NULL) {
    846         return;
    847     }
    848 
    849     t->bi->setText(t->dataToBreak);
    850     //
    851     //  Run the iterator forward
    852     //
    853     prevBP = -1;
    854     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
    855         if (prevBP ==  bp) {
    856             // Fail for lack of forward progress.
    857             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
    858                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    859             break;
    860         }
    861 
    862         // Check that there were we didn't miss an expected break between the last one
    863         //  and this one.
    864         for (i=prevBP+1; i<bp; i++) {
    865             if (t->expectedBreaks->elementAti(i) != 0) {
    866                 int expected[] = {0, i};
    867                 printStringBreaks(t->dataToBreak, expected, 2);
    868                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    869                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    870             }
    871         }
    872 
    873         // Check that the break we did find was expected
    874         if (t->expectedBreaks->elementAti(bp) == 0) {
    875             int expected[] = {0, bp};
    876             printStringBreaks(t->dataToBreak, expected, 2);
    877             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    878                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    879         } else {
    880             // The break was expected.
    881             //   Check that the {nnn} tag value is correct.
    882             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    883             if (expectedTagVal == -1) {
    884                 expectedTagVal = 0;
    885             }
    886             int32_t line = t->srcLine->elementAti(bp);
    887             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    888             if (rs != expectedTagVal) {
    889                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
    890                       "          Actual, Expected status = %4d, %4d",
    891                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    892             }
    893         }
    894 
    895 
    896         prevBP = bp;
    897     }
    898 
    899     // Verify that there were no missed expected breaks after the last one found
    900     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
    901         if (t->expectedBreaks->elementAti(i) != 0) {
    902             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    903                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    904         }
    905     }
    906 
    907     //
    908     //  Run the iterator backwards, verify that the same breaks are found.
    909     //
    910     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    911     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
    912         if (prevBP ==  bp) {
    913             // Fail for lack of progress.
    914             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
    915                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    916             break;
    917         }
    918 
    919         // Check that there were we didn't miss an expected break between the last one
    920         //  and this one.  (UVector returns zeros for index out of bounds.)
    921         for (i=prevBP-1; i>bp; i--) {
    922             if (t->expectedBreaks->elementAti(i) != 0) {
    923                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    924                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    925             }
    926         }
    927 
    928         // Check that the break we did find was expected
    929         if (t->expectedBreaks->elementAti(bp) == 0) {
    930             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
    931                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
    932         } else {
    933             // The break was expected.
    934             //   Check that the {nnn} tag value is correct.
    935             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
    936             if (expectedTagVal == -1) {
    937                 expectedTagVal = 0;
    938             }
    939             int line = t->srcLine->elementAti(bp);
    940             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
    941             if (rs != expectedTagVal) {
    942                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
    943                       "          Actual, Expected status = %4d, %4d",
    944                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
    945             }
    946         }
    947 
    948         prevBP = bp;
    949     }
    950 
    951     // Verify that there were no missed breaks prior to the last one found
    952     for (i=prevBP-1; i>=0; i--) {
    953         if (t->expectedBreaks->elementAti(i) != 0) {
    954             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
    955                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
    956         }
    957     }
    958 
    959     // Check isBoundary()
    960     for (i=0; i<t->expectedBreaks->size(); i++) {
    961         UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
    962         UBool boundaryFound    = t->bi->isBoundary(i);
    963         if (boundaryExpected != boundaryFound) {
    964             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
    965                   "        Expected, Actual= %s, %s",
    966                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
    967                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
    968         }
    969     }
    970 
    971     // Check following()
    972     for (i=0; i<t->expectedBreaks->size(); i++) {
    973         int32_t actualBreak = t->bi->following(i);
    974         int32_t expectedBreak = BreakIterator::DONE;
    975         for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
    976             if (t->expectedBreaks->elementAti(j) != 0) {
    977                 expectedBreak = j;
    978                 break;
    979             }
    980         }
    981         if (expectedBreak != actualBreak) {
    982             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
    983                   "        Expected, Actual= %d, %d",
    984                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
    985         }
    986     }
    987 
    988     // Check preceding()
    989     for (i=t->expectedBreaks->size(); i>=0; i--) {
    990         int32_t actualBreak = t->bi->preceding(i);
    991         int32_t expectedBreak = BreakIterator::DONE;
    992 
    993         for (int32_t j=i-1; j >= 0; j--) {
    994             if (t->expectedBreaks->elementAti(j) != 0) {
    995                 expectedBreak = j;
    996                 break;
    997             }
    998         }
    999         if (expectedBreak != actualBreak) {
   1000             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1001                   "        Expected, Actual= %d, %d",
   1002                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
   1003         }
   1004     }
   1005 }
   1006 
   1007 
   1008 void RBBITest::TestExtended() {
   1009 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1010     UErrorCode      status  = U_ZERO_ERROR;
   1011     Locale          locale("");
   1012 
   1013     UnicodeString       rules;
   1014     TestParams          tp;
   1015     tp.bi             = NULL;
   1016     tp.expectedBreaks = new UVector32(status);
   1017     tp.srcLine        = new UVector32(status);
   1018     tp.srcCol         = new UVector32(status);
   1019 
   1020     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1021     if (U_FAILURE(status)) {
   1022         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1023     }
   1024 
   1025 
   1026     //
   1027     //  Open and read the test data file.
   1028     //
   1029     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1030     char testFileName[1000];
   1031     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1032         errln("Can't open test data.  Path too long.");
   1033         return;
   1034     }
   1035     strcpy(testFileName, testDataDirectory);
   1036     strcat(testFileName, "rbbitst.txt");
   1037 
   1038     int    len;
   1039     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1040     if (U_FAILURE(status)) {
   1041         return; /* something went wrong, error already output */
   1042     }
   1043 
   1044 
   1045 
   1046 
   1047     //
   1048     //  Put the test data into a UnicodeString
   1049     //
   1050     UnicodeString testString(FALSE, testFile, len);
   1051 
   1052     enum EParseState{
   1053         PARSE_COMMENT,
   1054         PARSE_TAG,
   1055         PARSE_DATA,
   1056         PARSE_NUM
   1057     }
   1058     parseState = PARSE_TAG;
   1059 
   1060     EParseState savedState = PARSE_TAG;
   1061 
   1062     static const UChar CH_LF        = 0x0a;
   1063     static const UChar CH_CR        = 0x0d;
   1064     static const UChar CH_HASH      = 0x23;
   1065     /*static const UChar CH_PERIOD    = 0x2e;*/
   1066     static const UChar CH_LT        = 0x3c;
   1067     static const UChar CH_GT        = 0x3e;
   1068     static const UChar CH_BACKSLASH = 0x5c;
   1069     static const UChar CH_BULLET    = 0x2022;
   1070 
   1071     int32_t    lineNum  = 1;
   1072     int32_t    colStart = 0;
   1073     int32_t    column   = 0;
   1074     int32_t    charIdx  = 0;
   1075 
   1076     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1077 
   1078     for (charIdx = 0; charIdx < len; ) {
   1079         status = U_ZERO_ERROR;
   1080         UChar  c = testString.charAt(charIdx);
   1081         charIdx++;
   1082         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1083             // treat CRLF as a unit
   1084             c = CH_LF;
   1085             charIdx++;
   1086         }
   1087         if (c == CH_LF || c == CH_CR) {
   1088             lineNum++;
   1089             colStart = charIdx;
   1090         }
   1091         column = charIdx - colStart + 1;
   1092 
   1093         switch (parseState) {
   1094         case PARSE_COMMENT:
   1095             if (c == 0x0a || c == 0x0d) {
   1096                 parseState = savedState;
   1097             }
   1098             break;
   1099 
   1100         case PARSE_TAG:
   1101             {
   1102             if (c == CH_HASH) {
   1103                 parseState = PARSE_COMMENT;
   1104                 savedState = PARSE_TAG;
   1105                 break;
   1106             }
   1107             if (u_isUWhiteSpace(c)) {
   1108                 break;
   1109             }
   1110             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1111                 delete tp.bi;
   1112                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1113                 charIdx += 5;
   1114                 break;
   1115             }
   1116             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1117                 delete tp.bi;
   1118                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1119                 charIdx += 5;
   1120                 break;
   1121             }
   1122             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1123                 delete tp.bi;
   1124                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1125                 charIdx += 5;
   1126                 break;
   1127             }
   1128             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1129                 delete tp.bi;
   1130                 tp.bi = NULL;
   1131                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1132                 charIdx += 5;
   1133                 break;
   1134             }
   1135             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1136                 delete tp.bi;
   1137                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1138                 charIdx += 6;
   1139                 break;
   1140             }
   1141 
   1142             // <locale  loc_name>
   1143             localeMatcher.reset(testString);
   1144             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1145                 UnicodeString localeName = localeMatcher.group(1, status);
   1146                 char localeName8[100];
   1147                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1148                 locale = Locale::createFromName(localeName8);
   1149                 charIdx += localeMatcher.group(0, status).length() - 1;
   1150                 TEST_ASSERT_SUCCESS(status);
   1151                 break;
   1152             }
   1153             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1154                 parseState = PARSE_DATA;
   1155                 charIdx += 5;
   1156                 tp.dataToBreak = "";
   1157                 tp.expectedBreaks->removeAllElements();
   1158                 tp.srcCol ->removeAllElements();
   1159                 tp.srcLine->removeAllElements();
   1160                 break;
   1161             }
   1162 
   1163             errln("line %d: Tag expected in test file.", lineNum);
   1164             parseState = PARSE_COMMENT;
   1165             savedState = PARSE_DATA;
   1166             goto end_test; // Stop the test.
   1167             }
   1168             break;
   1169 
   1170         case PARSE_DATA:
   1171             if (c == CH_BULLET) {
   1172                 int32_t  breakIdx = tp.dataToBreak.length();
   1173                 tp.expectedBreaks->setSize(breakIdx+1);
   1174                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1175                 tp.srcLine->setSize(breakIdx+1);
   1176                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1177                 tp.srcCol ->setSize(breakIdx+1);
   1178                 tp.srcCol ->setElementAt(column, breakIdx);
   1179                 break;
   1180             }
   1181 
   1182             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1183                 // Add final entry to mappings from break location to source file position.
   1184                 //  Need one extra because last break position returned is after the
   1185                 //    last char in the data, not at the last char.
   1186                 tp.srcLine->addElement(lineNum, status);
   1187                 tp.srcCol ->addElement(column, status);
   1188 
   1189                 parseState = PARSE_TAG;
   1190                 charIdx += 6;
   1191 
   1192                 // RUN THE TEST!
   1193                 executeTest(&tp);
   1194                 break;
   1195             }
   1196 
   1197             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1198                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1199                 // Get the code point from the name and insert it into the test data.
   1200                 //   (Damn, no API takes names in Unicode  !!!
   1201                 //    we've got to take it back to char *)
   1202                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1203                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1204                 char charNameBuf[200];
   1205                 UChar32 theChar = -1;
   1206                 if (nameEndIdx != -1) {
   1207                     UErrorCode status = U_ZERO_ERROR;
   1208                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1209                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1210                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1211                     if (U_FAILURE(status)) {
   1212                         theChar = -1;
   1213                     }
   1214                 }
   1215                 if (theChar == -1) {
   1216                     errln("Error in named character in test file at line %d, col %d",
   1217                         lineNum, column);
   1218                 } else {
   1219                     // Named code point was recognized.  Insert it
   1220                     //   into the test data.
   1221                     tp.dataToBreak.append(theChar);
   1222                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1223                         tp.srcLine->addElement(lineNum, status);
   1224                         tp.srcCol ->addElement(column, status);
   1225                     }
   1226                 }
   1227                 if (nameEndIdx > charIdx) {
   1228                     charIdx = nameEndIdx+1;
   1229 
   1230                 }
   1231                 break;
   1232             }
   1233 
   1234 
   1235 
   1236 
   1237             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1238                 charIdx++;
   1239                 int32_t  breakIdx = tp.dataToBreak.length();
   1240                 tp.expectedBreaks->setSize(breakIdx+1);
   1241                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1242                 tp.srcLine->setSize(breakIdx+1);
   1243                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1244                 tp.srcCol ->setSize(breakIdx+1);
   1245                 tp.srcCol ->setElementAt(column, breakIdx);
   1246                 break;
   1247             }
   1248 
   1249             if (c == CH_LT) {
   1250                 tagValue   = 0;
   1251                 parseState = PARSE_NUM;
   1252                 break;
   1253             }
   1254 
   1255             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1256                 parseState = PARSE_COMMENT;
   1257                 savedState = PARSE_DATA;
   1258                 break;
   1259             }
   1260 
   1261             if (c == CH_BACKSLASH) {
   1262                 // Check for \ at end of line, a line continuation.
   1263                 //     Advance over (discard) the newline
   1264                 UChar32 cp = testString.char32At(charIdx);
   1265                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1266                     // We have a CR LF
   1267                     //  Need an extra increment of the input ptr to move over both of them
   1268                     charIdx++;
   1269                 }
   1270                 if (cp == CH_LF || cp == CH_CR) {
   1271                     lineNum++;
   1272                     colStart = charIdx;
   1273                     charIdx++;
   1274                     break;
   1275                 }
   1276 
   1277                 // Let unescape handle the back slash.
   1278                 cp = testString.unescapeAt(charIdx);
   1279                 if (cp != -1) {
   1280                     // Escape sequence was recognized.  Insert the char
   1281                     //   into the test data.
   1282                     tp.dataToBreak.append(cp);
   1283                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1284                         tp.srcLine->addElement(lineNum, status);
   1285                         tp.srcCol ->addElement(column, status);
   1286                     }
   1287                     break;
   1288                 }
   1289 
   1290 
   1291                 // Not a recognized backslash escape sequence.
   1292                 // Take the next char as a literal.
   1293                 //  TODO:  Should this be an error?
   1294                 c = testString.charAt(charIdx);
   1295                 charIdx = testString.moveIndex32(charIdx, 1);
   1296             }
   1297 
   1298             // Normal, non-escaped data char.
   1299             tp.dataToBreak.append(c);
   1300 
   1301             // Save the mapping from offset in the data to line/column numbers in
   1302             //   the original input file.  Will be used for better error messages only.
   1303             //   If there's an expected break before this char, the slot in the mapping
   1304             //     vector will already be set for this char; don't overwrite it.
   1305             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1306                 tp.srcLine->addElement(lineNum, status);
   1307                 tp.srcCol ->addElement(column, status);
   1308             }
   1309             break;
   1310 
   1311 
   1312         case PARSE_NUM:
   1313             // We are parsing an expected numeric tag value, like <1234>,
   1314             //   within a chunk of data.
   1315             if (u_isUWhiteSpace(c)) {
   1316                 break;
   1317             }
   1318 
   1319             if (c == CH_GT) {
   1320                 // Finished the number.  Add the info to the expected break data,
   1321                 //   and switch parse state back to doing plain data.
   1322                 parseState = PARSE_DATA;
   1323                 if (tagValue == 0) {
   1324                     tagValue = -1;
   1325                 }
   1326                 int32_t  breakIdx = tp.dataToBreak.length();
   1327                 tp.expectedBreaks->setSize(breakIdx+1);
   1328                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1329                 tp.srcLine->setSize(breakIdx+1);
   1330                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1331                 tp.srcCol ->setSize(breakIdx+1);
   1332                 tp.srcCol ->setElementAt(column, breakIdx);
   1333                 break;
   1334             }
   1335 
   1336             if (u_isdigit(c)) {
   1337                 tagValue = tagValue*10 + u_charDigitValue(c);
   1338                 break;
   1339             }
   1340 
   1341             errln("Syntax Error in test file at line %d, col %d",
   1342                 lineNum, column);
   1343             parseState = PARSE_COMMENT;
   1344             goto end_test; // Stop the test
   1345             break;
   1346         }
   1347 
   1348 
   1349         if (U_FAILURE(status)) {
   1350             dataerrln("ICU Error %s while parsing test file at line %d.",
   1351                 u_errorName(status), lineNum);
   1352             status = U_ZERO_ERROR;
   1353             goto end_test; // Stop the test
   1354         }
   1355 
   1356     }
   1357 
   1358 end_test:
   1359     delete tp.bi;
   1360     delete tp.expectedBreaks;
   1361     delete tp.srcLine;
   1362     delete tp.srcCol;
   1363     delete [] testFile;
   1364 #endif
   1365 }
   1366 
   1367 
   1368 //-------------------------------------------------------------------------------
   1369 //
   1370 //  TestDictRules   create a break iterator from source rules that includes a
   1371 //                  dictionary range.   Regression for bug #7130.  Source rules
   1372 //                  do not declare a break iterator type (word, line, sentence, etc.
   1373 //                  but the dictionary code, without a type, would loop.
   1374 //
   1375 //-------------------------------------------------------------------------------
   1376 void RBBITest::TestDictRules() {
   1377     const char *rules =  "$dictionary = [a-z]; \n"
   1378                          "!!forward; \n"
   1379                          "$dictionary $dictionary; \n"
   1380                          "!!reverse; \n"
   1381                          "$dictionary $dictionary; \n";
   1382     const char *text = "aa";
   1383     UErrorCode status = U_ZERO_ERROR;
   1384     UParseError parseError;
   1385 
   1386     RuleBasedBreakIterator bi(rules, parseError, status);
   1387     if (U_SUCCESS(status)) {
   1388         UnicodeString utext = text;
   1389         bi.setText(utext);
   1390         int32_t position;
   1391         int32_t loops;
   1392         for (loops = 0; loops<10; loops++) {
   1393             position = bi.next();
   1394             if (position == RuleBasedBreakIterator::DONE) {
   1395                 break;
   1396             }
   1397         }
   1398         TEST_ASSERT(loops == 1);
   1399     } else {
   1400         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1401     }
   1402 }
   1403 
   1404 
   1405 
   1406 //-------------------------------------------------------------------------------
   1407 //
   1408 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1409 //    return the data in one big UChar * buffer, which the caller must delete.
   1410 //
   1411 //    parameters:
   1412 //          fileName:   the name of the file, with no directory part.  The test data directory
   1413 //                      is assumed.
   1414 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1415 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1416 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1417 //                      Pass NULL for the system default encoding.
   1418 //          status
   1419 //    returns:
   1420 //                      The file data, converted to UChar.
   1421 //                      The caller must delete this when done with
   1422 //                           delete [] theBuffer;
   1423 //
   1424 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1425 //           Move this function to some common place.
   1426 //
   1427 //--------------------------------------------------------------------------------
   1428 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1429     UChar       *retPtr  = NULL;
   1430     char        *fileBuf = NULL;
   1431     UConverter* conv     = NULL;
   1432     FILE        *f       = NULL;
   1433 
   1434     ulen = 0;
   1435     if (U_FAILURE(status)) {
   1436         return retPtr;
   1437     }
   1438 
   1439     //
   1440     //  Open the file.
   1441     //
   1442     f = fopen(fileName, "rb");
   1443     if (f == 0) {
   1444         dataerrln("Error opening test data file %s\n", fileName);
   1445         status = U_FILE_ACCESS_ERROR;
   1446         return NULL;
   1447     }
   1448     //
   1449     //  Read it in
   1450     //
   1451     int   fileSize;
   1452     int   amt_read;
   1453 
   1454     fseek( f, 0, SEEK_END);
   1455     fileSize = ftell(f);
   1456     fileBuf = new char[fileSize];
   1457     fseek(f, 0, SEEK_SET);
   1458     amt_read = fread(fileBuf, 1, fileSize, f);
   1459     if (amt_read != fileSize || fileSize <= 0) {
   1460         errln("Error reading test data file.");
   1461         goto cleanUpAndReturn;
   1462     }
   1463 
   1464     //
   1465     // Look for a Unicode Signature (BOM) on the data just read
   1466     //
   1467     int32_t        signatureLength;
   1468     const char *   fileBufC;
   1469     const char*    bomEncoding;
   1470 
   1471     fileBufC = fileBuf;
   1472     bomEncoding = ucnv_detectUnicodeSignature(
   1473         fileBuf, fileSize, &signatureLength, &status);
   1474     if(bomEncoding!=NULL ){
   1475         fileBufC  += signatureLength;
   1476         fileSize  -= signatureLength;
   1477         encoding = bomEncoding;
   1478     }
   1479 
   1480     //
   1481     // Open a converter to take the rule file to UTF-16
   1482     //
   1483     conv = ucnv_open(encoding, &status);
   1484     if (U_FAILURE(status)) {
   1485         goto cleanUpAndReturn;
   1486     }
   1487 
   1488     //
   1489     // Convert the rules to UChar.
   1490     //  Preflight first to determine required buffer size.
   1491     //
   1492     ulen = ucnv_toUChars(conv,
   1493         NULL,           //  dest,
   1494         0,              //  destCapacity,
   1495         fileBufC,
   1496         fileSize,
   1497         &status);
   1498     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1499         // Buffer Overflow is expected from the preflight operation.
   1500         status = U_ZERO_ERROR;
   1501 
   1502         retPtr = new UChar[ulen+1];
   1503         ucnv_toUChars(conv,
   1504             retPtr,       //  dest,
   1505             ulen+1,
   1506             fileBufC,
   1507             fileSize,
   1508             &status);
   1509     }
   1510 
   1511 cleanUpAndReturn:
   1512     fclose(f);
   1513     delete []fileBuf;
   1514     ucnv_close(conv);
   1515     if (U_FAILURE(status)) {
   1516         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1517         delete []retPtr;
   1518         retPtr = 0;
   1519         ulen   = 0;
   1520     };
   1521     return retPtr;
   1522 }
   1523 
   1524 
   1525 
   1526 //--------------------------------------------------------------------------------------------
   1527 //
   1528 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1529 //
   1530 //-------------------------------------------------------------------------------------------
   1531 void RBBITest::TestUnicodeFiles() {
   1532     RuleBasedBreakIterator  *bi;
   1533     UErrorCode               status = U_ZERO_ERROR;
   1534 
   1535     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1536     TEST_ASSERT_SUCCESS(status);
   1537     if (U_SUCCESS(status)) {
   1538         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1539     }
   1540     delete bi;
   1541 
   1542     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1543     TEST_ASSERT_SUCCESS(status);
   1544     if (U_SUCCESS(status)) {
   1545         runUnicodeTestData("WordBreakTest.txt", bi);
   1546     }
   1547     delete bi;
   1548 
   1549     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1550     TEST_ASSERT_SUCCESS(status);
   1551     if (U_SUCCESS(status)) {
   1552         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1553     }
   1554     delete bi;
   1555 
   1556     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1557     TEST_ASSERT_SUCCESS(status);
   1558     if (U_SUCCESS(status)) {
   1559         runUnicodeTestData("LineBreakTest.txt", bi);
   1560     }
   1561     delete bi;
   1562 }
   1563 
   1564 
   1565 //--------------------------------------------------------------------------------------------
   1566 //
   1567 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1568 //
   1569 //-------------------------------------------------------------------------------------------
   1570 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1571 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1572     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
   1573     UBool isTicket7270Fixed = !logKnownIssue("7270");
   1574     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   1575     UErrorCode  status = U_ZERO_ERROR;
   1576 
   1577     //
   1578     //  Open and read the test data file, put it into a UnicodeString.
   1579     //
   1580     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1581     char testFileName[1000];
   1582     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1583         dataerrln("Can't open test data.  Path too long.");
   1584         return;
   1585     }
   1586     strcpy(testFileName, testDataDirectory);
   1587     strcat(testFileName, fileName);
   1588 
   1589     logln("Opening data file %s\n", fileName);
   1590 
   1591     int    len;
   1592     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1593     if (status != U_FILE_ACCESS_ERROR) {
   1594         TEST_ASSERT_SUCCESS(status);
   1595         TEST_ASSERT(testFile != NULL);
   1596     }
   1597     if (U_FAILURE(status) || testFile == NULL) {
   1598         return; /* something went wrong, error already output */
   1599     }
   1600     UnicodeString testFileAsString(TRUE, testFile, len);
   1601 
   1602     //
   1603     //  Parse the test data file using a regular expression.
   1604     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1605     //     is identified by which group had a match.
   1606     //
   1607     //    Caputure Group #                  1          2            3            4           5
   1608     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1609     //
   1610     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1611     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1612     UnicodeString   testString;
   1613     UVector32       breakPositions(status);
   1614     int             lineNumber = 1;
   1615     TEST_ASSERT_SUCCESS(status);
   1616     if (U_FAILURE(status)) {
   1617         return;
   1618     }
   1619 
   1620     //
   1621     //  Scan through each test case, building up the string to be broken in testString,
   1622     //   and the positions that should be boundaries in the breakPositions vector.
   1623     //
   1624     int spin = 0;
   1625     while (tokenMatcher.find()) {
   1626       	if(tokenMatcher.hitEnd()) {
   1627           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1628              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1629              and caused an infinite loop here on EBCDIC systems!
   1630           */
   1631           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1632           //	   return;
   1633       	}
   1634         if (tokenMatcher.start(1, status) >= 0) {
   1635             // Scanned a divide sign, indicating a break position in the test data.
   1636             if (testString.length()>0) {
   1637                 breakPositions.addElement(testString.length(), status);
   1638             }
   1639         }
   1640         else if (tokenMatcher.start(2, status) >= 0) {
   1641             // Scanned an 'x', meaning no break at this position in the test data
   1642             //   Nothing to be done here.
   1643             }
   1644         else if (tokenMatcher.start(3, status) >= 0) {
   1645             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1646             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1647             int length = hexNumber.length();
   1648             if (length<=8) {
   1649                 char buf[10];
   1650                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1651                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1652                 if (c<=0x10ffff) {
   1653                     testString.append(c);
   1654                 } else {
   1655                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1656                        fileName, lineNumber);
   1657                 }
   1658             } else {
   1659                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1660                        fileName, lineNumber);
   1661              }
   1662         }
   1663         else if (tokenMatcher.start(4, status) >= 0) {
   1664             // Scanned to end of a line, possibly skipping over a comment in the process.
   1665             //   If the line from the file contained test data, run the test now.
   1666             //
   1667             if (testString.length() > 0) {
   1668 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
   1669 //             Rule 8
   1670 //                ZW SP* <break>
   1671 //             is not yet implemented.
   1672 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
   1673                                             5202 == lineNumber ||
   1674                                             5214 == lineNumber ||
   1675                                             5246 == lineNumber ||
   1676                                             5298 == lineNumber ||
   1677                                             5302 == lineNumber ))) {
   1678                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1679 }
   1680             }
   1681 
   1682             // Clear out this test case.
   1683             //    The string and breakPositions vector will be refilled as the next
   1684             //       test case is parsed.
   1685             testString.remove();
   1686             breakPositions.removeAllElements();
   1687             lineNumber++;
   1688         } else {
   1689             // Scanner catchall.  Something unrecognized appeared on the line.
   1690             char token[16];
   1691             UnicodeString uToken = tokenMatcher.group(0, status);
   1692             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1693             token[sizeof(token)-1] = 0;
   1694             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1695 
   1696             // Clean up, in preparation for continuing with the next line.
   1697             testString.remove();
   1698             breakPositions.removeAllElements();
   1699             lineNumber++;
   1700         }
   1701         TEST_ASSERT_SUCCESS(status);
   1702         if (U_FAILURE(status)) {
   1703             break;
   1704         }
   1705     }
   1706 
   1707     delete [] testFile;
   1708  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1709 }
   1710 
   1711 //--------------------------------------------------------------------------------------------
   1712 //
   1713 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1714 //                            test data files.  Do only a simple, forward-only check -
   1715 //                            this test is mostly to check that ICU and the Unicode
   1716 //                            data agree with each other.
   1717 //
   1718 //--------------------------------------------------------------------------------------------
   1719 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1720                          const UnicodeString &testString,   // Text data to be broken
   1721                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1722                          RuleBasedBreakIterator *bi) {
   1723     int32_t pos;                 // Break Position in the test string
   1724     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1725     int32_t expectedPos;         // Expected break position (index into test string)
   1726 
   1727     bi->setText(testString);
   1728     pos = bi->first();
   1729     pos = bi->next();
   1730 
   1731     while (pos != BreakIterator::DONE) {
   1732         if (expectedI >= breakPositions->size()) {
   1733             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1734                 testFileName, lineNumber, pos);
   1735             break;
   1736         }
   1737         expectedPos = breakPositions->elementAti(expectedI);
   1738         if (pos < expectedPos) {
   1739             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1740                 testFileName, lineNumber, pos);
   1741             break;
   1742         }
   1743         if (pos > expectedPos) {
   1744             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1745                 testFileName, lineNumber, expectedPos);
   1746             break;
   1747         }
   1748         pos = bi->next();
   1749         expectedI++;
   1750     }
   1751 
   1752     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1753         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1754             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1755     }
   1756 }
   1757 
   1758 
   1759 
   1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1761 //---------------------------------------------------------------------------------------
   1762 //
   1763 //   classs RBBIMonkeyKind
   1764 //
   1765 //      Monkey Test for Break Iteration
   1766 //      Abstract interface class.   Concrete derived classes independently
   1767 //      implement the break rules for different iterator types.
   1768 //
   1769 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   1770 //      testing, but works purely in terms of the interface defined here.
   1771 //
   1772 //---------------------------------------------------------------------------------------
   1773 class RBBIMonkeyKind {
   1774 public:
   1775     // Return a UVector of UnicodeSets, representing the character classes used
   1776     //   for this type of iterator.
   1777     virtual  UVector  *charClasses() = 0;
   1778 
   1779     // Set the test text on which subsequent calls to next() will operate
   1780     virtual  void      setText(const UnicodeString &s) = 0;
   1781 
   1782     // Find the next break postion, starting from the prev break position, or from zero.
   1783     // Return -1 after reaching end of string.
   1784     virtual  int32_t   next(int32_t i) = 0;
   1785 
   1786     virtual ~RBBIMonkeyKind();
   1787     UErrorCode       deferredStatus;
   1788 
   1789 
   1790 protected:
   1791     RBBIMonkeyKind();
   1792 
   1793 private:
   1794 };
   1795 
   1796 RBBIMonkeyKind::RBBIMonkeyKind() {
   1797     deferredStatus = U_ZERO_ERROR;
   1798 }
   1799 
   1800 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1801 }
   1802 
   1803 
   1804 //----------------------------------------------------------------------------------------
   1805 //
   1806 //   Random Numbers.  Similar to standard lib rand() and srand()
   1807 //                    Not using library to
   1808 //                      1.  Get same results on all platforms.
   1809 //                      2.  Get access to current seed, to more easily reproduce failures.
   1810 //
   1811 //---------------------------------------------------------------------------------------
   1812 static uint32_t m_seed = 1;
   1813 
   1814 static uint32_t m_rand()
   1815 {
   1816     m_seed = m_seed * 1103515245 + 12345;
   1817     return (uint32_t)(m_seed/65536) % 32768;
   1818 }
   1819 
   1820 
   1821 //------------------------------------------------------------------------------------------
   1822 //
   1823 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1824 //                             of RBBIMonkeyKind.
   1825 //
   1826 //------------------------------------------------------------------------------------------
   1827 class RBBICharMonkey: public RBBIMonkeyKind {
   1828 public:
   1829     RBBICharMonkey();
   1830     virtual          ~RBBICharMonkey();
   1831     virtual  UVector *charClasses();
   1832     virtual  void     setText(const UnicodeString &s);
   1833     virtual  int32_t  next(int32_t i);
   1834 private:
   1835     UVector   *fSets;
   1836 
   1837     UnicodeSet  *fCRLFSet;
   1838     UnicodeSet  *fControlSet;
   1839     UnicodeSet  *fExtendSet;
   1840     UnicodeSet  *fRegionalIndicatorSet;
   1841     UnicodeSet  *fPrependSet;
   1842     UnicodeSet  *fSpacingSet;
   1843     UnicodeSet  *fLSet;
   1844     UnicodeSet  *fVSet;
   1845     UnicodeSet  *fTSet;
   1846     UnicodeSet  *fLVSet;
   1847     UnicodeSet  *fLVTSet;
   1848     UnicodeSet  *fHangulSet;
   1849     UnicodeSet  *fAnySet;
   1850 
   1851     const UnicodeString *fText;
   1852 };
   1853 
   1854 
   1855 RBBICharMonkey::RBBICharMonkey() {
   1856     UErrorCode  status = U_ZERO_ERROR;
   1857 
   1858     fText = NULL;
   1859 
   1860     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   1861     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   1862     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   1863     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   1864     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   1865     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   1866     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   1867     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   1868     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   1869     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   1870     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   1871     fHangulSet  = new UnicodeSet();
   1872     fHangulSet->addAll(*fLSet);
   1873     fHangulSet->addAll(*fVSet);
   1874     fHangulSet->addAll(*fTSet);
   1875     fHangulSet->addAll(*fLVSet);
   1876     fHangulSet->addAll(*fLVTSet);
   1877     fAnySet     = new UnicodeSet(0, 0x10ffff);
   1878 
   1879     fSets       = new UVector(status);
   1880     fSets->addElement(fCRLFSet,    status);
   1881     fSets->addElement(fControlSet, status);
   1882     fSets->addElement(fExtendSet,  status);
   1883     fSets->addElement(fRegionalIndicatorSet, status);
   1884     if (!fPrependSet->isEmpty()) {
   1885         fSets->addElement(fPrependSet, status);
   1886     }
   1887     fSets->addElement(fSpacingSet, status);
   1888     fSets->addElement(fHangulSet,  status);
   1889     fSets->addElement(fAnySet,     status);
   1890     if (U_FAILURE(status)) {
   1891         deferredStatus = status;
   1892     }
   1893 }
   1894 
   1895 
   1896 void RBBICharMonkey::setText(const UnicodeString &s) {
   1897     fText = &s;
   1898 }
   1899 
   1900 
   1901 
   1902 int32_t RBBICharMonkey::next(int32_t prevPos) {
   1903     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   1904                               //   break position being tested.  The candidate break
   1905                               //   location is before p2.
   1906 
   1907     int     breakPos = -1;
   1908 
   1909     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   1910 
   1911     if (U_FAILURE(deferredStatus)) {
   1912         return -1;
   1913     }
   1914 
   1915     // Previous break at end of string.  return DONE.
   1916     if (prevPos >= fText->length()) {
   1917         return -1;
   1918     }
   1919     p0 = p1 = p2 = p3 = prevPos;
   1920     c3 =  fText->char32At(prevPos);
   1921     c0 = c1 = c2 = 0;
   1922     (void)p0;   // suppress set but not used warning.
   1923     (void)c0;
   1924 
   1925     // Loop runs once per "significant" character position in the input text.
   1926     for (;;) {
   1927         // Move all of the positions forward in the input string.
   1928         p0 = p1;  c0 = c1;
   1929         p1 = p2;  c1 = c2;
   1930         p2 = p3;  c2 = c3;
   1931 
   1932         // Advancd p3 by one codepoint
   1933         p3 = fText->moveIndex32(p3, 1);
   1934         c3 = fText->char32At(p3);
   1935 
   1936         if (p1 == p2) {
   1937             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   1938             continue;
   1939         }
   1940         if (p2 == fText->length()) {
   1941             // Reached end of string.  Always a break position.
   1942             break;
   1943         }
   1944 
   1945         // Rule  GB3   CR x LF
   1946         //     No Extend or Format characters may appear between the CR and LF,
   1947         //     which requires the additional check for p2 immediately following p1.
   1948         //
   1949         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   1950             continue;
   1951         }
   1952 
   1953         // Rule (GB4).   ( Control | CR | LF ) <break>
   1954         if (fControlSet->contains(c1) ||
   1955             c1 == 0x0D ||
   1956             c1 == 0x0A)  {
   1957             break;
   1958         }
   1959 
   1960         // Rule (GB5)    <break>  ( Control | CR | LF )
   1961         //
   1962         if (fControlSet->contains(c2) ||
   1963             c2 == 0x0D ||
   1964             c2 == 0x0A)  {
   1965             break;
   1966         }
   1967 
   1968 
   1969         // Rule (GB6)  L x ( L | V | LV | LVT )
   1970         if (fLSet->contains(c1) &&
   1971                (fLSet->contains(c2)  ||
   1972                 fVSet->contains(c2)  ||
   1973                 fLVSet->contains(c2) ||
   1974                 fLVTSet->contains(c2))) {
   1975             continue;
   1976         }
   1977 
   1978         // Rule (GB7)    ( LV | V )  x  ( V | T )
   1979         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   1980             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   1981             continue;
   1982         }
   1983 
   1984         // Rule (GB8)    ( LVT | T)  x T
   1985         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   1986             fTSet->contains(c2))  {
   1987             continue;
   1988         }
   1989 
   1990         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
   1991         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   1992             continue;
   1993         }
   1994 
   1995         // Rule (GB9)    Numeric x ALetter
   1996         if (fExtendSet->contains(c2))  {
   1997             continue;
   1998         }
   1999 
   2000         // Rule (GB9a)   x  SpacingMark
   2001         if (fSpacingSet->contains(c2)) {
   2002             continue;
   2003         }
   2004 
   2005         // Rule (GB9b)   Prepend x
   2006         if (fPrependSet->contains(c1)) {
   2007             continue;
   2008         }
   2009 
   2010         // Rule (GB10)  Any  <break>  Any
   2011         break;
   2012     }
   2013 
   2014     breakPos = p2;
   2015     return breakPos;
   2016 }
   2017 
   2018 
   2019 
   2020 UVector  *RBBICharMonkey::charClasses() {
   2021     return fSets;
   2022 }
   2023 
   2024 
   2025 RBBICharMonkey::~RBBICharMonkey() {
   2026     delete fSets;
   2027     delete fCRLFSet;
   2028     delete fControlSet;
   2029     delete fExtendSet;
   2030     delete fRegionalIndicatorSet;
   2031     delete fPrependSet;
   2032     delete fSpacingSet;
   2033     delete fLSet;
   2034     delete fVSet;
   2035     delete fTSet;
   2036     delete fLVSet;
   2037     delete fLVTSet;
   2038     delete fHangulSet;
   2039     delete fAnySet;
   2040 }
   2041 
   2042 //------------------------------------------------------------------------------------------
   2043 //
   2044 //   class RBBIWordMonkey      Word Break specific implementation
   2045 //                             of RBBIMonkeyKind.
   2046 //
   2047 //------------------------------------------------------------------------------------------
   2048 class RBBIWordMonkey: public RBBIMonkeyKind {
   2049 public:
   2050     RBBIWordMonkey();
   2051     virtual          ~RBBIWordMonkey();
   2052     virtual  UVector *charClasses();
   2053     virtual  void     setText(const UnicodeString &s);
   2054     virtual int32_t   next(int32_t i);
   2055 private:
   2056     UVector      *fSets;
   2057 
   2058     UnicodeSet  *fCRSet;
   2059     UnicodeSet  *fLFSet;
   2060     UnicodeSet  *fNewlineSet;
   2061     UnicodeSet  *fRegionalIndicatorSet;
   2062     UnicodeSet  *fKatakanaSet;
   2063     UnicodeSet  *fHebrew_LetterSet;
   2064     UnicodeSet  *fALetterSet;
   2065     // TODO(jungshik): Do we still need this change?
   2066     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   2067     UnicodeSet  *fSingle_QuoteSet;
   2068     UnicodeSet  *fDouble_QuoteSet;
   2069     UnicodeSet  *fMidNumLetSet;
   2070     UnicodeSet  *fMidLetterSet;
   2071     UnicodeSet  *fMidNumSet;
   2072     UnicodeSet  *fNumericSet;
   2073     UnicodeSet  *fFormatSet;
   2074     UnicodeSet  *fOtherSet;
   2075     UnicodeSet  *fExtendSet;
   2076     UnicodeSet  *fExtendNumLetSet;
   2077     UnicodeSet  *fDictionaryCjkSet;
   2078 
   2079     const UnicodeString  *fText;
   2080 };
   2081 
   2082 
   2083 RBBIWordMonkey::RBBIWordMonkey()
   2084 {
   2085     UErrorCode  status = U_ZERO_ERROR;
   2086 
   2087     fSets            = new UVector(status);
   2088 
   2089     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2090     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2091     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2092     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
   2093     // Exclude Hangul syllables from ALetterSet during testing.
   2094     // Leave CJK dictionary characters out from the monkey tests!
   2095 #if 0
   2096     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2097                                       "[\\p{Line_Break = Complex_Context}"
   2098                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2099                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2100                                       "]]",
   2101                                       status);
   2102 #endif
   2103     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2104     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2105     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
   2106     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2107     fALetterSet->removeAll(*fDictionaryCjkSet);
   2108     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
   2109     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
   2110     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2111     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2112     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2113     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
   2114     // we should figure out why
   2115     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2116     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2117     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2118     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2119 
   2120     fOtherSet        = new UnicodeSet();
   2121     if(U_FAILURE(status)) {
   2122       deferredStatus = status;
   2123       return;
   2124     }
   2125 
   2126     fOtherSet->complement();
   2127     fOtherSet->removeAll(*fCRSet);
   2128     fOtherSet->removeAll(*fLFSet);
   2129     fOtherSet->removeAll(*fNewlineSet);
   2130     fOtherSet->removeAll(*fKatakanaSet);
   2131     fOtherSet->removeAll(*fHebrew_LetterSet);
   2132     fOtherSet->removeAll(*fALetterSet);
   2133     fOtherSet->removeAll(*fSingle_QuoteSet);
   2134     fOtherSet->removeAll(*fDouble_QuoteSet);
   2135     fOtherSet->removeAll(*fMidLetterSet);
   2136     fOtherSet->removeAll(*fMidNumSet);
   2137     fOtherSet->removeAll(*fNumericSet);
   2138     fOtherSet->removeAll(*fExtendNumLetSet);
   2139     fOtherSet->removeAll(*fFormatSet);
   2140     fOtherSet->removeAll(*fExtendSet);
   2141     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2142     // Inhibit dictionary characters from being tested at all.
   2143     fOtherSet->removeAll(*fDictionaryCjkSet);
   2144     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2145 
   2146     fSets->addElement(fCRSet,                status);
   2147     fSets->addElement(fLFSet,                status);
   2148     fSets->addElement(fNewlineSet,           status);
   2149     fSets->addElement(fRegionalIndicatorSet, status);
   2150     fSets->addElement(fHebrew_LetterSet,     status);
   2151     fSets->addElement(fALetterSet,           status);
   2152     fSets->addElement(fSingle_QuoteSet,      status);
   2153     fSets->addElement(fDouble_QuoteSet,      status);
   2154     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
   2155     fSets->addElement(fMidLetterSet,         status);
   2156     fSets->addElement(fMidNumLetSet,         status);
   2157     fSets->addElement(fMidNumSet,            status);
   2158     fSets->addElement(fNumericSet,           status);
   2159     fSets->addElement(fFormatSet,            status);
   2160     fSets->addElement(fExtendSet,            status);
   2161     fSets->addElement(fOtherSet,             status);
   2162     fSets->addElement(fExtendNumLetSet,      status);
   2163 
   2164     if (U_FAILURE(status)) {
   2165         deferredStatus = status;
   2166     }
   2167 }
   2168 
   2169 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2170     fText       = &s;
   2171 }
   2172 
   2173 
   2174 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2175     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2176                               //   break position being tested.  The candidate break
   2177                               //   location is before p2.
   2178 
   2179     int     breakPos = -1;
   2180 
   2181     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2182 
   2183     if (U_FAILURE(deferredStatus)) {
   2184         return -1;
   2185     }
   2186 
   2187     // Prev break at end of string.  return DONE.
   2188     if (prevPos >= fText->length()) {
   2189         return -1;
   2190     }
   2191     p0 = p1 = p2 = p3 = prevPos;
   2192     c3 =  fText->char32At(prevPos);
   2193     c0 = c1 = c2 = 0;
   2194     (void)p0;       // Suppress set but not used warning.
   2195 
   2196     // Loop runs once per "significant" character position in the input text.
   2197     for (;;) {
   2198         // Move all of the positions forward in the input string.
   2199         p0 = p1;  c0 = c1;
   2200         p1 = p2;  c1 = c2;
   2201         p2 = p3;  c2 = c3;
   2202 
   2203         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2204         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2205         do {
   2206             p3 = fText->moveIndex32(p3, 1);
   2207             c3 = fText->char32At(p3);
   2208             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2209                break;
   2210             };
   2211         }
   2212         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2213 
   2214 
   2215         if (p1 == p2) {
   2216             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2217             continue;
   2218         }
   2219         if (p2 == fText->length()) {
   2220             // Reached end of string.  Always a break position.
   2221             break;
   2222         }
   2223 
   2224         // Rule  (3)   CR x LF
   2225         //     No Extend or Format characters may appear between the CR and LF,
   2226         //     which requires the additional check for p2 immediately following p1.
   2227         //
   2228         if (c1==0x0D && c2==0x0A) {
   2229             continue;
   2230         }
   2231 
   2232         // Rule (3a)  Break before and after newlines (including CR and LF)
   2233         //
   2234         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2235             break;
   2236         };
   2237         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2238             break;
   2239         };
   2240 
   2241         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
   2242         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2243             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2244             continue;
   2245         }
   2246 
   2247         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
   2248         //
   2249         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
   2250              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
   2251              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
   2252             continue;
   2253         }
   2254 
   2255         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
   2256         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
   2257             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
   2258             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
   2259             continue;
   2260         }
   2261 
   2262         // Rule (7a)     Hebrew_Letter x Single_Quote
   2263         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
   2264             continue;
   2265         }
   2266 
   2267         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
   2268         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
   2269             continue;
   2270         }
   2271 
   2272         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
   2273         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
   2274             continue;
   2275         }
   2276 
   2277         // Rule (8)    Numeric x Numeric
   2278         if (fNumericSet->contains(c1) &&
   2279             fNumericSet->contains(c2))  {
   2280             continue;
   2281         }
   2282 
   2283         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
   2284         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2285             fNumericSet->contains(c2))  {
   2286             continue;
   2287         }
   2288 
   2289         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
   2290         if (fNumericSet->contains(c1) &&
   2291             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2292             continue;
   2293         }
   2294 
   2295         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
   2296         if (fNumericSet->contains(c0) &&
   2297             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
   2298             fNumericSet->contains(c2)) {
   2299             continue;
   2300         }
   2301 
   2302         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
   2303         if (fNumericSet->contains(c1) &&
   2304             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
   2305             fNumericSet->contains(c3)) {
   2306             continue;
   2307         }
   2308 
   2309         // Rule (13)  Katakana x Katakana
   2310         if (fKatakanaSet->contains(c1) &&
   2311             fKatakanaSet->contains(c2))  {
   2312             continue;
   2313         }
   2314 
   2315         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
   2316         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
   2317              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2318              fExtendNumLetSet->contains(c2)) {
   2319                 continue;
   2320         }
   2321 
   2322         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
   2323         if (fExtendNumLetSet->contains(c1) &&
   2324                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
   2325                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
   2326             continue;
   2327         }
   2328 
   2329         // Rule 13c
   2330         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2331             continue;
   2332         }
   2333 
   2334         // Rule 14.  Break found here.
   2335         break;
   2336     }
   2337 
   2338     breakPos = p2;
   2339     return breakPos;
   2340 }
   2341 
   2342 
   2343 UVector  *RBBIWordMonkey::charClasses() {
   2344     return fSets;
   2345 }
   2346 
   2347 
   2348 RBBIWordMonkey::~RBBIWordMonkey() {
   2349     delete fSets;
   2350     delete fCRSet;
   2351     delete fLFSet;
   2352     delete fNewlineSet;
   2353     delete fKatakanaSet;
   2354     delete fHebrew_LetterSet;
   2355     delete fALetterSet;
   2356     delete fSingle_QuoteSet;
   2357     delete fDouble_QuoteSet;
   2358     delete fMidNumLetSet;
   2359     delete fMidLetterSet;
   2360     delete fMidNumSet;
   2361     delete fNumericSet;
   2362     delete fFormatSet;
   2363     delete fExtendSet;
   2364     delete fExtendNumLetSet;
   2365     delete fRegionalIndicatorSet;
   2366     delete fDictionaryCjkSet;
   2367     delete fOtherSet;
   2368 }
   2369 
   2370 
   2371 
   2372 
   2373 //------------------------------------------------------------------------------------------
   2374 //
   2375 //   class RBBISentMonkey      Sentence Break specific implementation
   2376 //                             of RBBIMonkeyKind.
   2377 //
   2378 //------------------------------------------------------------------------------------------
   2379 class RBBISentMonkey: public RBBIMonkeyKind {
   2380 public:
   2381     RBBISentMonkey();
   2382     virtual          ~RBBISentMonkey();
   2383     virtual  UVector *charClasses();
   2384     virtual  void     setText(const UnicodeString &s);
   2385     virtual int32_t   next(int32_t i);
   2386 private:
   2387     int               moveBack(int posFrom);
   2388     int               moveForward(int posFrom);
   2389     UChar32           cAt(int pos);
   2390 
   2391     UVector      *fSets;
   2392 
   2393     UnicodeSet  *fSepSet;
   2394     UnicodeSet  *fFormatSet;
   2395     UnicodeSet  *fSpSet;
   2396     UnicodeSet  *fLowerSet;
   2397     UnicodeSet  *fUpperSet;
   2398     UnicodeSet  *fOLetterSet;
   2399     UnicodeSet  *fNumericSet;
   2400     UnicodeSet  *fATermSet;
   2401     UnicodeSet  *fSContinueSet;
   2402     UnicodeSet  *fSTermSet;
   2403     UnicodeSet  *fCloseSet;
   2404     UnicodeSet  *fOtherSet;
   2405     UnicodeSet  *fExtendSet;
   2406 
   2407     const UnicodeString  *fText;
   2408 
   2409 };
   2410 
   2411 RBBISentMonkey::RBBISentMonkey()
   2412 {
   2413     UErrorCode  status = U_ZERO_ERROR;
   2414 
   2415     fSets            = new UVector(status);
   2416 
   2417     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2418     //                       set and made into character classes of their own.  For the monkey impl,
   2419     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2420     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2421     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2422     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2423     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2424     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2425     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2426     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2427     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2428     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2429     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2430     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2431     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2432     fOtherSet        = new UnicodeSet();
   2433 
   2434     if(U_FAILURE(status)) {
   2435       deferredStatus = status;
   2436       return;
   2437     }
   2438 
   2439     fOtherSet->complement();
   2440     fOtherSet->removeAll(*fSepSet);
   2441     fOtherSet->removeAll(*fFormatSet);
   2442     fOtherSet->removeAll(*fSpSet);
   2443     fOtherSet->removeAll(*fLowerSet);
   2444     fOtherSet->removeAll(*fUpperSet);
   2445     fOtherSet->removeAll(*fOLetterSet);
   2446     fOtherSet->removeAll(*fNumericSet);
   2447     fOtherSet->removeAll(*fATermSet);
   2448     fOtherSet->removeAll(*fSContinueSet);
   2449     fOtherSet->removeAll(*fSTermSet);
   2450     fOtherSet->removeAll(*fCloseSet);
   2451     fOtherSet->removeAll(*fExtendSet);
   2452 
   2453     fSets->addElement(fSepSet,       status);
   2454     fSets->addElement(fFormatSet,    status);
   2455     fSets->addElement(fSpSet,        status);
   2456     fSets->addElement(fLowerSet,     status);
   2457     fSets->addElement(fUpperSet,     status);
   2458     fSets->addElement(fOLetterSet,   status);
   2459     fSets->addElement(fNumericSet,   status);
   2460     fSets->addElement(fATermSet,     status);
   2461     fSets->addElement(fSContinueSet, status);
   2462     fSets->addElement(fSTermSet,     status);
   2463     fSets->addElement(fCloseSet,     status);
   2464     fSets->addElement(fOtherSet,     status);
   2465     fSets->addElement(fExtendSet,    status);
   2466 
   2467     if (U_FAILURE(status)) {
   2468         deferredStatus = status;
   2469     }
   2470 }
   2471 
   2472 
   2473 
   2474 void RBBISentMonkey::setText(const UnicodeString &s) {
   2475     fText       = &s;
   2476 }
   2477 
   2478 UVector  *RBBISentMonkey::charClasses() {
   2479     return fSets;
   2480 }
   2481 
   2482 
   2483 //  moveBack()   Find the "significant" code point preceding the index i.
   2484 //               Skips over ($Extend | $Format)* .
   2485 //
   2486 int RBBISentMonkey::moveBack(int i) {
   2487     if (i <= 0) {
   2488         return -1;
   2489     }
   2490     UChar32   c;
   2491     int32_t   j = i;
   2492     do {
   2493         j = fText->moveIndex32(j, -1);
   2494         c = fText->char32At(j);
   2495     }
   2496     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2497     return j;
   2498 
   2499  }
   2500 
   2501 
   2502 int RBBISentMonkey::moveForward(int i) {
   2503     if (i>=fText->length()) {
   2504         return fText->length();
   2505     }
   2506     UChar32   c;
   2507     int32_t   j = i;
   2508     do {
   2509         j = fText->moveIndex32(j, 1);
   2510         c = cAt(j);
   2511     }
   2512     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2513     return j;
   2514 }
   2515 
   2516 UChar32 RBBISentMonkey::cAt(int pos) {
   2517     if (pos<0 || pos>=fText->length()) {
   2518         return -1;
   2519     } else {
   2520         return fText->char32At(pos);
   2521     }
   2522 }
   2523 
   2524 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2525     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2526                               //   break position being tested.  The candidate break
   2527                               //   location is before p2.
   2528 
   2529     int     breakPos = -1;
   2530 
   2531     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2532     UChar32 c;
   2533 
   2534     if (U_FAILURE(deferredStatus)) {
   2535         return -1;
   2536     }
   2537 
   2538     // Prev break at end of string.  return DONE.
   2539     if (prevPos >= fText->length()) {
   2540         return -1;
   2541     }
   2542     p0 = p1 = p2 = p3 = prevPos;
   2543     c3 =  fText->char32At(prevPos);
   2544     c0 = c1 = c2 = 0;
   2545     (void)p0;     // Suppress set but not used warning.
   2546 
   2547     // Loop runs once per "significant" character position in the input text.
   2548     for (;;) {
   2549         // Move all of the positions forward in the input string.
   2550         p0 = p1;  c0 = c1;
   2551         p1 = p2;  c1 = c2;
   2552         p2 = p3;  c2 = c3;
   2553 
   2554         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2555         p3 = moveForward(p3);
   2556         c3 = cAt(p3);
   2557 
   2558         // Rule (3)  CR x LF
   2559         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2560             continue;
   2561         }
   2562 
   2563         // Rule (4).   Sep  <break>
   2564         if (fSepSet->contains(c1)) {
   2565             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2566             break;
   2567         }
   2568 
   2569         if (p2 >= fText->length()) {
   2570             // Reached end of string.  Always a break position.
   2571             break;
   2572         }
   2573 
   2574         if (p2 == prevPos) {
   2575             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2576             continue;
   2577         }
   2578 
   2579         // Rule (6).   ATerm x Numeric
   2580         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2581             continue;
   2582         }
   2583 
   2584         // Rule (7).  Upper ATerm  x  Uppper
   2585         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2586             continue;
   2587         }
   2588 
   2589         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2590         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2591         //                  note to the Unicode 5.0 documents.
   2592         int p8 = p1;
   2593         while (fSpSet->contains(cAt(p8))) {
   2594             p8 = moveBack(p8);
   2595         }
   2596         while (fCloseSet->contains(cAt(p8))) {
   2597             p8 = moveBack(p8);
   2598         }
   2599         if (fATermSet->contains(cAt(p8))) {
   2600             p8=p2;
   2601             for (;;) {
   2602                 c = cAt(p8);
   2603                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2604                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2605                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2606                     break;
   2607                 }
   2608                 p8 = moveForward(p8);
   2609             }
   2610             if (fLowerSet->contains(cAt(p8))) {
   2611                 continue;
   2612             }
   2613         }
   2614 
   2615         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2616         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2617             p8 = p1;
   2618             while (fSpSet->contains(cAt(p8))) {
   2619                 p8 = moveBack(p8);
   2620             }
   2621             while (fCloseSet->contains(cAt(p8))) {
   2622                 p8 = moveBack(p8);
   2623             }
   2624             c = cAt(p8);
   2625             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2626                 continue;
   2627             }
   2628         }
   2629 
   2630         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2631         int p9 = p1;
   2632         while (fCloseSet->contains(cAt(p9))) {
   2633             p9 = moveBack(p9);
   2634         }
   2635         c = cAt(p9);
   2636         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2637             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2638                 continue;
   2639             }
   2640         }
   2641 
   2642         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2643         int p10 = p1;
   2644         while (fSpSet->contains(cAt(p10))) {
   2645             p10 = moveBack(p10);
   2646         }
   2647         while (fCloseSet->contains(cAt(p10))) {
   2648             p10 = moveBack(p10);
   2649         }
   2650         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2651             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2652                 continue;
   2653             }
   2654         }
   2655 
   2656         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2657         int p11 = p1;
   2658         if (fSepSet->contains(cAt(p11))) {
   2659             p11 = moveBack(p11);
   2660         }
   2661         while (fSpSet->contains(cAt(p11))) {
   2662             p11 = moveBack(p11);
   2663         }
   2664         while (fCloseSet->contains(cAt(p11))) {
   2665             p11 = moveBack(p11);
   2666         }
   2667         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2668             break;
   2669         }
   2670 
   2671         //  Rule (12)  Any x Any
   2672         continue;
   2673     }
   2674     breakPos = p2;
   2675     return breakPos;
   2676 }
   2677 
   2678 RBBISentMonkey::~RBBISentMonkey() {
   2679     delete fSets;
   2680     delete fSepSet;
   2681     delete fFormatSet;
   2682     delete fSpSet;
   2683     delete fLowerSet;
   2684     delete fUpperSet;
   2685     delete fOLetterSet;
   2686     delete fNumericSet;
   2687     delete fATermSet;
   2688     delete fSContinueSet;
   2689     delete fSTermSet;
   2690     delete fCloseSet;
   2691     delete fOtherSet;
   2692     delete fExtendSet;
   2693 }
   2694 
   2695 
   2696 
   2697 //-------------------------------------------------------------------------------------------
   2698 //
   2699 //  RBBILineMonkey
   2700 //
   2701 //-------------------------------------------------------------------------------------------
   2702 
   2703 class RBBILineMonkey: public RBBIMonkeyKind {
   2704 public:
   2705     RBBILineMonkey();
   2706     virtual          ~RBBILineMonkey();
   2707     virtual  UVector *charClasses();
   2708     virtual  void     setText(const UnicodeString &s);
   2709     virtual  int32_t  next(int32_t i);
   2710     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2711 private:
   2712     UVector      *fSets;
   2713 
   2714     UnicodeSet  *fBK;
   2715     UnicodeSet  *fCR;
   2716     UnicodeSet  *fLF;
   2717     UnicodeSet  *fCM;
   2718     UnicodeSet  *fNL;
   2719     UnicodeSet  *fSG;
   2720     UnicodeSet  *fWJ;
   2721     UnicodeSet  *fZW;
   2722     UnicodeSet  *fGL;
   2723     UnicodeSet  *fCB;
   2724     UnicodeSet  *fSP;
   2725     UnicodeSet  *fB2;
   2726     UnicodeSet  *fBA;
   2727     UnicodeSet  *fBB;
   2728     UnicodeSet  *fHY;
   2729     UnicodeSet  *fH2;
   2730     UnicodeSet  *fH3;
   2731     UnicodeSet  *fCL;
   2732     UnicodeSet  *fCP;
   2733     UnicodeSet  *fEX;
   2734     UnicodeSet  *fIN;
   2735     UnicodeSet  *fJL;
   2736     UnicodeSet  *fJV;
   2737     UnicodeSet  *fJT;
   2738     UnicodeSet  *fNS;
   2739     UnicodeSet  *fOP;
   2740     UnicodeSet  *fQU;
   2741     UnicodeSet  *fIS;
   2742     UnicodeSet  *fNU;
   2743     UnicodeSet  *fPO;
   2744     UnicodeSet  *fPR;
   2745     UnicodeSet  *fSY;
   2746     UnicodeSet  *fAI;
   2747     UnicodeSet  *fAL;
   2748     UnicodeSet  *fCJ;
   2749     UnicodeSet  *fHL;
   2750     UnicodeSet  *fID;
   2751     UnicodeSet  *fRI;
   2752     UnicodeSet  *fSA;
   2753     UnicodeSet  *fXX;
   2754 
   2755     BreakIterator        *fCharBI;
   2756     const UnicodeString  *fText;
   2757     RegexMatcher         *fNumberMatcher;
   2758 };
   2759 
   2760 
   2761 RBBILineMonkey::RBBILineMonkey()
   2762 {
   2763     UErrorCode  status = U_ZERO_ERROR;
   2764 
   2765     fSets  = new UVector(status);
   2766 
   2767     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   2768     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   2769     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   2770     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   2771     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   2772     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   2773     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   2774     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   2775     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   2776     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   2777     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   2778     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   2779     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   2780     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   2781     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   2782     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   2783     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   2784     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   2785     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   2786     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   2787     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   2788     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   2789     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   2790     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   2791     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   2792     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   2793     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   2794     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   2795     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   2796     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   2797     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   2798     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   2799     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   2800     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   2801     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   2802     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   2803     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   2804     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   2805     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   2806     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   2807 
   2808     if (U_FAILURE(status)) {
   2809         deferredStatus = status;
   2810         fCharBI = NULL;
   2811         fNumberMatcher = NULL;
   2812         return;
   2813     }
   2814 
   2815     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   2816     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   2817     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   2818     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   2819 
   2820     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   2821 
   2822     fSets->addElement(fBK, status);
   2823     fSets->addElement(fCR, status);
   2824     fSets->addElement(fLF, status);
   2825     fSets->addElement(fCM, status);
   2826     fSets->addElement(fNL, status);
   2827     fSets->addElement(fWJ, status);
   2828     fSets->addElement(fZW, status);
   2829     fSets->addElement(fGL, status);
   2830     fSets->addElement(fCB, status);
   2831     fSets->addElement(fSP, status);
   2832     fSets->addElement(fB2, status);
   2833     fSets->addElement(fBA, status);
   2834     fSets->addElement(fBB, status);
   2835     fSets->addElement(fHY, status);
   2836     fSets->addElement(fH2, status);
   2837     fSets->addElement(fH3, status);
   2838     fSets->addElement(fCL, status);
   2839     fSets->addElement(fCP, status);
   2840     fSets->addElement(fEX, status);
   2841     fSets->addElement(fIN, status);
   2842     fSets->addElement(fJL, status);
   2843     fSets->addElement(fJT, status);
   2844     fSets->addElement(fJV, status);
   2845     fSets->addElement(fNS, status);
   2846     fSets->addElement(fOP, status);
   2847     fSets->addElement(fQU, status);
   2848     fSets->addElement(fIS, status);
   2849     fSets->addElement(fNU, status);
   2850     fSets->addElement(fPO, status);
   2851     fSets->addElement(fPR, status);
   2852     fSets->addElement(fSY, status);
   2853     fSets->addElement(fAI, status);
   2854     fSets->addElement(fAL, status);
   2855     fSets->addElement(fHL, status);
   2856     fSets->addElement(fID, status);
   2857     fSets->addElement(fWJ, status);
   2858     fSets->addElement(fRI, status);
   2859     fSets->addElement(fSA, status);
   2860     fSets->addElement(fSG, status);
   2861 
   2862     const char *rules =
   2863             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   2864             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   2865             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   2866             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   2867             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   2868             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   2869 
   2870     fNumberMatcher = new RegexMatcher(
   2871         UnicodeString(rules, -1, US_INV), 0, status);
   2872 
   2873     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2874 
   2875     if (U_FAILURE(status)) {
   2876         deferredStatus = status;
   2877     }
   2878 }
   2879 
   2880 
   2881 void RBBILineMonkey::setText(const UnicodeString &s) {
   2882     fText       = &s;
   2883     fCharBI->setText(s);
   2884     fNumberMatcher->reset(s);
   2885 }
   2886 
   2887 //
   2888 //  rule9Adjust
   2889 //     Line Break TR rules 9 and 10 implementation.
   2890 //     This deals with combining marks and other sequences
   2891 //     that must be treated as if they were something other than what they actually are.
   2892 //
   2893 //     This is factored out into a separate function because it must be applied twice for
   2894 //     each potential break, once to the chars before the position being checked, then
   2895 //     again to the text following the possible break.
   2896 //
   2897 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
            // pos       index of the character being adjusted; -1 means "no character yet".
            // posChar   in/out: the character at pos; rewritten to 'A' if it is itself a CM.
            // nextPos   in/out: index of the character after pos; advanced past attached CMs.
            // nextChar  out:    the character found at the final *nextPos.

            // Nothing to adjust while next() is still warming up: it passes -1 until
            //   it has a real position to hand in.
   2898     if (pos == -1) {
   2899         return;
   2900     }

            // LB 9  Keep combining sequences together.
            //  A run of CM characters attaches to the preceding character unless that
            //  character is a space, a hard break (BK, CR, LF, NL) or a zero-width space.
            //  Note that Line Break CM is different from the normal Grapheme Extend property.
   2901     const bool baseRejectsMarks =
   2902             fSP->contains(*posChar) || fBK->contains(*posChar) ||
   2903             *posChar == 0x0d        || *posChar == 0x0a        ||
   2904             fNL->contains(*posChar) || fZW->contains(*posChar);

   2905     int32_t scanIdx = *nextPos;
   2906     if (!baseRejectsMarks) {
                // Step over every CM that follows; *nextChar tracks the char being examined.
   2907         while (fCM->contains(*nextChar = fText->char32At(scanIdx))) {
   2908             scanIdx = fText->moveIndex32(scanIdx, 1);
   2909         }
   2910     }

            // LB 9  "Treat X CM* as if it were X" needs no explicit action here.

            // LB 10  Treat any remaining combining mark as AL.
   2911     if (fCM->contains(*posChar)) {
   2912         *posChar = 0x41;   // 'A'
   2913     }

            // Hand the updated position and the character found there back to the caller.
            // This only makes a difference if a combining sequence was consumed above.
   2914     *nextPos  = scanIdx;
   2915     *nextChar = fText->char32At(scanIdx);
   2934 }
   2935 
   2936 
   2937 
   2938 int32_t RBBILineMonkey::next(int32_t startPos) {
            // Reference line-break finder used by the monkey test.
            // Walks forward from startPos one (adjusted) character pair at a time and
            // applies the line break rules coded below in order: a rule that forbids a
            // break does "continue", a rule that requires one does "break".
            //
            // Returns the index of the first break found (this may be the end of the
            // text), or -1 if deferredStatus holds a failure or startPos is at or past
            // the end of the text given to setText().
   2939     UErrorCode status = U_ZERO_ERROR;
   2940     int32_t    pos;       //  Index of the char following a potential break position
   2941     UChar32    thisChar;  //  Character at above position "pos"
   2942
   2943     int32_t    prevPos;   //  Index of the char preceding a potential break position
   2944     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   2945                           //   and thisChar may not be adjacent because combining
   2946                           //   characters between them will be ignored.
   2947
   2948     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   2949     UChar32    prevCharX2;
   2950
   2951     int32_t    nextPos;   //  Index of the next character following pos.
   2952                           //     Usually skips over combining marks.
   2953     int32_t    nextCPPos; //  Index of the code point following "pos."
   2954                           //     May point to a combining mark.
   2955     int32_t    tPos;      //  temp value.
   2956     UChar32    c;
   2957
   2958     if (U_FAILURE(deferredStatus)) {
   2959         return -1;
   2960     }
   2961
   2962     if (startPos >= fText->length()) {
   2963         return -1;
   2964     }
   2965
   2966
   2967     // Initial values for loop.  Loop will run the first time without finding breaks,
   2968     //                           while the invalid values shift out and the "this" and
   2969     //                           "prev" positions are filled in with good values.
   2970     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   2971     thisChar = prevChar  = prevCharX2 = 0;
   2972     nextPos  = nextCPPos = startPos;
   2973
   2974
   2975     // Loop runs once per position in the test text, until a break position
   2976     //  is found.
   2977     for (;;) {
                // Shift the window one character forward: X2 <- prev <- this <- next.
   2978         prevPosX2 = prevPos;
   2979         prevCharX2 = prevChar;
   2980
   2981         prevPos   = pos;
   2982         prevChar  = thisChar;
   2983
   2984         pos       = nextPos;
   2985         thisChar  = fText->char32At(pos);
   2986
   2987         nextCPPos = fText->moveIndex32(pos, 1);
   2988         nextPos   = nextCPPos;
   2989
   2990         // Rule LB2 - Break at end of text.
   2991         if (pos >= fText->length()) {
   2992             break;
   2993         }
   2994
   2995         // Rule LB 9 - adjust for combining sequences.
   2996         //             We do this one out-of-order because the adjustment does not change anything
   2997         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   2998         //             be applied.
   2999         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3000         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3001         c = fText->char32At(nextPos);
   3002         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3003
   3004         // If the loop is still warming up - if we haven't shifted the initial
   3005         //   -1 positions out of prevPos yet - loop back to advance the
   3006         //    position in the input without any further looking for breaks.
   3007         if (prevPos == -1) {
   3008             continue;
   3009         }
   3010
   3011         // LB 4  Always break after hard line breaks,
   3012         if (fBK->contains(prevChar)) {
   3013             break;
   3014         }
   3015
   3016         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3017         if (prevChar == 0x0d && thisChar == 0x0a) {
   3018             continue;
   3019         }
   3020         if (prevChar == 0x0d ||
   3021             prevChar == 0x0a ||
   3022             prevChar == 0x85)  {
   3023             break;
   3024         }
   3025
   3026         // LB 6  Don't break before hard line breaks
   3027         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3028             fBK->contains(thisChar)) {
   3029                 continue;
   3030         }
   3031
   3032
   3033         // LB 7  Don't break before spaces or zero-width space.
   3034         if (fSP->contains(thisChar)) {
   3035             continue;
   3036         }
   3037
   3038         if (fZW->contains(thisChar)) {
   3039             continue;
   3040         }
   3041
   3042         // LB 8  Break after zero width space
   3043         if (fZW->contains(prevChar)) {
   3044             break;
   3045         }
   3046
   3047         // LB 9, 10  Already done, at top of loop.
   3048         //
   3049
   3050
   3051         // LB 11  Do not break before or after WORD JOINER and related characters.
   3052         //    x  WJ
   3053         //    WJ  x
   3054         //
   3055         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3056             continue;
   3057         }
   3058
   3059         // LB 12
   3060         //    GL  x
   3061         if (fGL->contains(prevChar)) {
   3062             continue;
   3063         }
   3064
   3065         // LB 12a
   3066         //    [^SP BA HY] x GL
   3067         if (!(fSP->contains(prevChar) ||
   3068               fBA->contains(prevChar) ||
   3069               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3070             continue;
   3071         }
   3072
   3073
   3074
   3075         // LB 13  Don't break before closings.
   3076         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3077         //        fall into LB 25 and the more general number regular expression.
                //        (NU x SY is excluded in the same way by the last clause below.)
   3078         //
   3079         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3080             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3081                                          fEX->contains(thisChar)  ||
   3082             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3083             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3084             continue;
   3085         }
   3086
   3087         // LB 14 Don't break after OP SP*
   3088         //       Scan backwards, checking for this sequence.
   3089         //       The OP char could include combining marks, so we actually check for
   3090         //           OP CM* SP*
   3091         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3092         //       sequence into a ID char, so before scanning back through spaces,
   3093         //       verify that prevChar is indeed a space.  The prevChar variable
   3094         //       may differ from fText[prevPos]
   3095         tPos = prevPos;
   3096         if (fSP->contains(prevChar)) {
   3097             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3098                 tPos=fText->moveIndex32(tPos, -1);
   3099             }
   3100         }
   3101         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3102             tPos=fText->moveIndex32(tPos, -1);
   3103         }
   3104         if (fOP->contains(fText->char32At(tPos))) {
   3105             continue;
   3106         }
   3107
   3108
   3109         // LB 15    QU SP* x OP
   3110         if (fOP->contains(thisChar)) {
   3111             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
                    // NOTE: this block-local tPos shadows the function-level tPos declared above.
   3112             int tPos = prevPos;
   3113             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3114                 tPos = fText->moveIndex32(tPos, -1);
   3115             }
   3116             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3117                 tPos = fText->moveIndex32(tPos, -1);
   3118             }
   3119             if (fQU->contains(fText->char32At(tPos))) {
   3120                 continue;
   3121             }
   3122         }
   3123
   3124
   3125
   3126         // LB 16   (CL | CP) SP* x NS
   3127         //    Scan backwards for SP* CM* (CL | CP)
   3128         if (fNS->contains(thisChar)) {
                    // NOTE: this block-local tPos shadows the function-level tPos declared above.
   3129             int tPos = prevPos;
   3130             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3131                 tPos = fText->moveIndex32(tPos, -1);
   3132             }
   3133             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3134                 tPos = fText->moveIndex32(tPos, -1);
   3135             }
   3136             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3137                 continue;
   3138             }
   3139         }
   3140
   3141
   3142         // LB 17        B2 SP* x B2
   3143         if (fB2->contains(thisChar)) {
   3144             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3145             tPos = prevPos;
   3146             if (fSP->contains(prevChar)) {
   3147                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3148                     tPos=fText->moveIndex32(tPos, -1);
   3149                 }
   3150             }
   3151             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3152                 tPos=fText->moveIndex32(tPos, -1);
   3153             }
   3154             if (fB2->contains(fText->char32At(tPos))) {
   3155                 continue;
   3156             }
   3157         }
   3158
   3159
   3160         // LB 18    break after space
   3161         if (fSP->contains(prevChar)) {
   3162             break;
   3163         }
   3164
   3165         // LB 19
   3166         //    x   QU
   3167         //    QU  x
   3168         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3169             continue;
   3170         }
   3171
   3172         // LB 20  Break around a CB
   3173         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3174             break;
   3175         }
   3176
   3177         // LB 21
                //    x (BA | HY | NS)
                //    BB x
   3178         if (fBA->contains(thisChar) ||
   3179             fHY->contains(thisChar) ||
   3180             fNS->contains(thisChar) ||
   3181             fBB->contains(prevChar) )   {
   3182             continue;
   3183         }
   3184
   3185         // LB 21a
   3186         //   HL (HY | BA) x
   3187         if (fHL->contains(prevCharX2) &&
   3188                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3189             continue;
   3190         }
   3191
   3192         // LB 21b
   3193         //   SY x HL
   3194         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
   3195             continue;
   3196         }
   3197
   3198         // LB 22
                //    (AL | HL | ID | IN | NU) x IN
   3199         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3200             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3201             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3202             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3203             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3204             continue;
   3205         }
   3206
   3207
   3208         // LB 23    ID x PO
   3209         //          AL x NU
   3210         //          HL x NU
   3211         //          NU x AL
                //          NU x HL
   3212         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3213             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3214             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
   3215             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
   3216             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
   3217             continue;
   3218         }
   3219
   3220         // LB 24  Do not break between prefix and letters or ideographs.
   3221         //        PR x ID
   3222         //        PR x (AL | HL)
   3223         //        PO x (AL | HL)
   3224         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3225             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
   3226             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
   3227             continue;
   3228         }
   3229
   3230
   3231
   3232         // LB 25    Numbers
                //          Try to match the number regular expression starting at prevPos.
   3233         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3234             if (U_FAILURE(status)) {
                        // Regex error: leave the loop; pos is returned as it stands.
   3235                 break;
   3236             }
   3237             // Matched a number.  But could have been just a single digit, which would
   3238             //    not represent a "no break here" between prevChar and thisChar
   3239             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3240             if (numEndIdx > pos) {
   3241                 // Number match includes at least our two chars being checked
   3242                 if (numEndIdx > nextPos) {
   3243                     // Number match includes additional chars.  Update pos and nextPos
   3244                     //   so that next loop iteration will continue at the end of the number,
   3245                     //   checking for breaks between last char in number & whatever follows.
   3246                     pos = nextPos = numEndIdx;
                            // Back up from the end of the match to the last non-CM character.
   3247                     do {
   3248                         pos = fText->moveIndex32(pos, -1);
   3249                         thisChar = fText->char32At(pos);
   3250                     } while (fCM->contains(thisChar));
   3251                 }
   3252                 continue;
   3253             }
   3254         }
   3255
   3256
   3257         // LB 26 Do not break a Korean syllable.
                //        JL x (JL | JV | H2 | H3)
   3258         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3259                                         fJV->contains(thisChar) ||
   3260                                         fH2->contains(thisChar) ||
   3261                                         fH3->contains(thisChar))) {
   3262                                             continue;
   3263                                         }
   3264
                //        (JV | H2) x (JV | JT)
   3265         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3266             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3267                 continue;
   3268         }
   3269
                //        (JT | H3) x JT
   3270         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3271             fJT->contains(thisChar)) {
   3272                 continue;
   3273         }
   3274
   3275         // LB 27 Treat a Korean Syllable Block the same as ID.
                //        (JL | JV | JT | H2 | H3) x IN
   3276         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3277             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3278             fIN->contains(thisChar)) {
   3279                 continue;
   3280             }
                //        (JL | JV | JT | H2 | H3) x PO
   3281         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3282             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3283             fPO->contains(thisChar)) {
   3284                 continue;
   3285             }
                //        PR x (JL | JV | JT | H2 | H3)
   3286         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3287             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3288                 continue;
   3289             }
   3290
   3291
   3292
   3293         // LB 28  Do not break between alphabetics ("at").
   3294         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3295             continue;
   3296         }
   3297
   3298         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3299         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3300             continue;
   3301         }
   3302
   3303         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3304         //          (AL | HL | NU) x OP
   3305         //          CP x (AL | HL | NU)
   3306         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3307             continue;
   3308         }
   3309         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3310             continue;
   3311         }
   3312
   3313         // LB30a  Do not break between regional indicators.
   3314         //        RI x RI
   3315         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3316             continue;
   3317         }
   3318
   3319         // LB 31    Break everywhere else
   3320         break;
   3321
   3322     }
   3323
   3324     return pos;
   3325 }
   3326 
   3327 
   3328 UVector  *RBBILineMonkey::charClasses() {
            // Exposes the vector of character-class sets.  The vector remains owned by
            // this object (the destructor deletes fSets); callers must not free it.
   3329     return this->fSets;
   3330 }
   3331 
   3332 
   3333 RBBILineMonkey::~RBBILineMonkey() {
            // The container goes first.  Each character-class set is then deleted
            // individually below, so fSets presumably does not own its elements.
   3334     delete fSets;

            // Character-class sets, in the original release order.
   3335     delete fBK;  delete fCR;  delete fLF;  delete fCM;  delete fNL;
   3336     delete fWJ;  delete fZW;  delete fGL;  delete fCB;  delete fSP;
   3337     delete fB2;  delete fBA;  delete fBB;  delete fHY;  delete fH2;
   3338     delete fH3;  delete fCL;  delete fCP;  delete fEX;  delete fIN;
   3339     delete fJL;  delete fJV;  delete fJT;  delete fNS;  delete fOP;
   3340     delete fQU;  delete fIS;  delete fNU;  delete fPO;  delete fPR;
   3341     delete fSY;  delete fAI;  delete fAL;  delete fCJ;  delete fHL;
   3342     delete fID;  delete fRI;  delete fSA;  delete fSG;  delete fXX;

            // Helper objects that setText() re-targets at each new text.
   3343     delete fCharBI;
   3344     delete fNumberMatcher;
   3379 }
   3380 
   3381 
   3382 //-------------------------------------------------------------------------------------------
   3383 //
   3384 //   TestMonkey
   3385 //
   3386 //     params
   3387 //       seed=nnnnn        Random number starting seed.
   3388 //                         Setting the seed allows errors to be reproduced.
   3389 //       loop=nnn          Looping count.  Controls running time.
   3390 //                         -1:  run forever.
   3391 //                          0 or greater:  run length.
   3392 //
   3393 //       type = char | word | line | sent | title
   3394 //
   3395 //-------------------------------------------------------------------------------------------
   3396 
   3397 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
            // Looks for "name = <integer>" in params.
            //   name        parameter name; taken by value because the value pattern is appended to it.
            //   params      in/out: the parameter string; a parameter that is found is removed from it.
            //   defaultVal  value returned when the parameter is absent.
            // Returns the parsed integer, or defaultVal if name does not occur in params.
   3398     UErrorCode status = U_ZERO_ERROR;
   3399     name.append(" *= *(-?\\d+)");
   3400     RegexMatcher matcher(name, params, 0, status);
   3401     if (!matcher.find()) {
                // Not present: params is left untouched.
   3402         U_ASSERT(U_SUCCESS(status));
   3403         return defaultVal;
   3404     }

            // Copy capture group 1 (the digits) into a char buffer for strtol(),
            // clamping the length so that it always fits.
   3405     char digits[100];
   3406     const int32_t digitsStart = matcher.start(1, status);
   3407     int32_t digitsLen = matcher.end(1, status) - digitsStart;
   3408     if (digitsLen > (int32_t)(sizeof(digits)-2)) {
   3409         digitsLen = (int32_t)(sizeof(digits)-2);
   3410     }
   3411     params.extract(digitsStart, digitsLen, digits, sizeof(digits));
   3412     const int32_t parsed = strtol(digits,  NULL, 10);

            // Delete this parameter from the params string.
   3413     matcher.reset();
   3414     params = matcher.replaceFirst("", status);
   3415     U_ASSERT(U_SUCCESS(status));
   3416     return parsed;
   3418 }
   3419 #endif
   3420 
   3421 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3422 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3423                                     BreakIterator *bi,
   3424                                     int expected[],
   3425                                     int expectedcount)
   3426 {
            // Checks that bi, applied to ustr, agrees with the boundary list expected[]
            // (expectedcount entries) through four access paths: forward iteration with
            // next(), isBoundary(), backward iteration with previous(), and preceding().
            // Every mismatch is reported through test->errln(); nothing is returned.
   3427     int count = 0;
   3428     int i = 0;
   3429     int forward[50];
            const int forwardCapacity = (int)(sizeof(forward) / sizeof(forward[0]));
   3430     bi->setText(ustr);
   3431     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
                if (count >= forwardCapacity) {
                    // More boundaries than forward[] can hold: report it rather than
                    // writing past the end of the array.
                    printStringBreaks(ustr, expected, expectedcount);
                    test->errln("break forward test failed: more than %d boundaries found",
                                forwardCapacity);
                    return;
                }
   3432         forward[count] = i;
   3433         if (count < expectedcount && expected[count] != i) {
   3434             test->errln("break forward test failed: expected %d but got %d",
   3435                         expected[count], i);
   3436             break;
   3437         }
   3438         count ++;
   3439     }
   3440     if (count != expectedcount) {
   3441         printStringBreaks(ustr, expected, expectedcount);
   3442         test->errln("break forward test failed: missed %d match",
   3443                     expectedcount - count);
   3444         return;
   3445     }
   3446     // testing boundaries
   3447     for (i = 1; i < expectedcount; i ++) {
   3448         int j = expected[i - 1];
   3449         if (!bi->isBoundary(j)) {
   3450             printStringBreaks(ustr, expected, expectedcount);
   3451             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3452             return;
   3453         }
                // No position strictly between two expected boundaries may be a boundary.
   3454         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3455             if (bi->isBoundary(j)) {
   3456                 printStringBreaks(ustr, expected, expectedcount);
   3457                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3458                 return;
   3459             }
   3460         }
   3461     }
   3462
            // Walk backwards; count runs down from expectedcount and must land on 0.
   3463     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
                if (count <= 0) {
                    // previous() produced more boundaries than next() did.  Stop here
                    // instead of indexing forward[] with a negative value.
                    printStringBreaks(ustr, expected, expectedcount);
                    test->errln("break test previous() failed: unexpected extra boundary at %d", i);
                    return;
                }
   3464         count --;
   3465         if (forward[count] != i) {
   3466             printStringBreaks(ustr, expected, expectedcount);
   3467             test->errln("happy break test previous() failed: expected %d but got %d",
   3468                         forward[count], i);
   3469             break;
   3470         }
   3471     }
   3472     if (count != 0) {
   3473         printStringBreaks(ustr, expected, expectedcount);
   3474         test->errln("break test previous() failed: missed a match");
   3475         return;
   3476     }
   3477
   3478     // testing preceding
            // For every offset after expected[i], up to and including expected[i + 1],
            // preceding() must return expected[i].
   3479     for (i = 0; i < expectedcount - 1; i ++) {
   3480         // int j = expected[i] + 1;
   3481         int j = ustr.moveIndex32(expected[i], 1);
   3482         for (; j <= expected[i + 1]; j ++) {
   3483             if (bi->preceding(j) != expected[i]) {
   3484                 printStringBreaks(ustr, expected, expectedcount);
   3485                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3486                 return;
   3487             }
   3488         }
   3489     }
   3490 }
   3491 #endif
   3492 
   3493 void RBBITest::TestWordBreaks(void)
   3494 {
            // For each escaped test string: compute the expected word boundaries with the
            // RBBIWordMonkey reference implementation, then check the "en" word break
            // iterator against them with testBreakBoundPreceding().
            // Compiled to an empty function when regular expressions are configured out.
   3495 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3496
   3497     Locale        locale("en");
   3498     UErrorCode    status = U_ZERO_ERROR;
   3499     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3500     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3501     // Replaced any C+J characters in a row with a random sequence of characters
   3502     // of the same length to make our C+J segmentation not get in the way.
   3503     static const char *strlist[] =
   3504     {
   3505     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3506     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3507     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3508     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3509     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3510     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3511     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3512     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3513     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3514     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3515     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3516     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3517     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3518     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3519     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3520     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3521     "\\u0027\\u11af\\U000e0057\\u0602",
            // NOTE(review): the second escape in the next entry has only 7 hex digits; it
            // looks like a truncated copy of the entry two lines below - confirm intent.
   3522     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3523     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3524     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3525     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3526     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3527     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3528     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3529     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3530     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3531     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3532     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3533     "\\ua183\\u102d\\u0bec\\u003a",
   3534     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3535     "\\u003a\\u0e57\\u0fad\\u002e",
   3536     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3537     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3538     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3539     "\\u003a\\u0664\\u00b7\\u1fba",
   3540     "\\u003b\\u0027\\u00b7\\u47a3",
   3541     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3542     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3543     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3544     };
   3545     int loop;
   3546     if (U_FAILURE(status)) {
   3547         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
                delete bi;   // harmless if NULL; avoids a leak if an object was still returned
   3548         return;
   3549     }
   3550     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3551         // printf("looping %d\n", loop);
   3552         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3553         // RBBICharMonkey monkey;
   3554         RBBIWordMonkey monkey;
   3555
   3556         int expected[50];
                const int expectedCapacity = (int)(sizeof(expected) / sizeof(expected[0]));
   3557         int expectedcount = 0;
   3558
   3559         monkey.setText(ustr);
   3560         int i;
                // Collect the reference boundaries, starting with 0, until the monkey
                // reports no further break.
   3561         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
                    if (expectedcount >= expectedCapacity) {
                        // Never write past the end of expected[].
                        errln("TestWordBreaks: string %d has more than %d boundaries",
                              loop, expectedCapacity);
                        delete bi;
                        return;
                    }
   3562             expected[expectedcount ++] = i;
   3563         }
   3564
   3565         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3566     }
   3567     delete bi;
   3568 #endif
   3569 }
   3570 
   3571 void RBBITest::TestWordBoundary(void)
   3572 {
            // Self-consistency check of the "en" word break iterator: for each test string,
            // every offset returned by first()/next() must be reported as a boundary by
            // isBoundary(), and no offset strictly between two consecutive results may be.
   3573     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3574     Locale        locale("en");
   3575     UErrorCode    status = U_ZERO_ERROR;
   3576     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3577     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3578     UChar         str[50];
   3579     static const char *strlist[] =
   3580     {
   3581     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3582     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3583     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3584     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3585     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3586     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3587     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3588     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3589     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3590     "\\u0027\\u11af\\U000e0057\\u0602",
            // NOTE(review): the second escape in the next entry has only 7 hex digits; it
            // looks like a truncated copy of the entry two lines below - confirm intent.
   3591     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3592     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3593     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3594     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3595     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3596     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3597     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3598     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3599     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3600     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3601     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3602     "\\ua183\\u102d\\u0bec\\u003a",
   3603     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3604     "\\u003a\\u0e57\\u0fad\\u002e",
   3605     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3606     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3607     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3608     "\\u003a\\u0664\\u00b7\\u1fba",
   3609     "\\u003b\\u0027\\u00b7\\u47a3",
   3610     };
   3611     int loop;
   3612     if (U_FAILURE(status)) {
   3613         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
                delete bi;   // harmless if NULL; avoids a leak if an object was still returned
   3614         return;
   3615     }
   3616     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3617         // printf("looping %d\n", loop);
                // Unescape into the whole buffer; the capacity is derived from str itself
                // rather than hard-coded, so the two cannot drift apart.
   3618         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3619         UnicodeString ustr(str);
   3620         int forward[50];
                const int forwardCapacity = (int)(sizeof(forward) / sizeof(forward[0]));
   3621         int count = 0;
   3622
   3623         bi->setText(ustr);
   3624         int prev = 0;
   3625         int i;
   3626         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
                    if (count >= forwardCapacity) {
                        // Never write past the end of forward[].
                        errln("happy boundary test failed: more than %d boundaries in string %d",
                              forwardCapacity, loop);
                        delete bi;
                        return;
                    }
   3627             forward[count ++] = i;
   3628             if (i > prev) {
   3629                 int j;
                        // Offsets strictly between two consecutive boundaries must not be boundaries.
   3630                 for (j = prev + 1; j < i; j ++) {
   3631                     if (bi->isBoundary(j)) {
   3632                         printStringBreaks(ustr, forward, count);
   3633                         errln("happy boundary test failed: expected %d not a boundary",
   3634                                j);
                                delete bi;   // release the iterator on this early exit too
   3635                         return;
   3636                     }
   3637                 }
   3638             }
   3639             if (!bi->isBoundary(i)) {
   3640                 printStringBreaks(ustr, forward, count);
   3641                 errln("happy boundary test failed: expected %d a boundary",
   3642                        i);
                        delete bi;   // release the iterator on this early exit too
   3643                 return;
   3644             }
   3645             prev = i;
   3646         }
   3647     }
   3648     delete bi;
   3649 }
   3650 
   3651 void RBBITest::TestLineBreaks(void)
   3652 {
   3653 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3654     Locale        locale("en");
   3655     UErrorCode    status = U_ZERO_ERROR;
   3656     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3657     const int32_t  STRSIZE = 50;
   3658     UChar         str[STRSIZE];
   3659     static const char *strlist[] =
   3660     {
   3661      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3662      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3663              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3664      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3665              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3666      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3667      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3668      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3669      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3670      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3671      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   3672      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3673      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3674      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3675      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3676      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3677      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3678      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3679      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3680      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3681      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   3682      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   3683      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   3684      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   3685      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   3686      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   3687      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   3688      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   3689      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   3690      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   3691      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   3692      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   3693      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   3694      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   3695      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   3696      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   3697      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   3698      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   3699      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   3700      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   3701      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   3702      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   3703          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   3704          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   3705          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   3706      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   3707          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   3708     };
   3709     int loop;
   3710     TEST_ASSERT_SUCCESS(status);
   3711     if (U_FAILURE(status)) {
   3712         return;
   3713     }
   3714     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3715         // printf("looping %d\n", loop);
   3716         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   3717         if (t >= STRSIZE) {
   3718             TEST_ASSERT(FALSE);
   3719             continue;
   3720         }
   3721 
   3722 
   3723         UnicodeString ustr(str);
   3724         RBBILineMonkey monkey;
   3725         if (U_FAILURE(monkey.deferredStatus)) {
   3726             continue;
   3727         }
   3728 
   3729         const int EXPECTEDSIZE = 50;
   3730         int expected[EXPECTEDSIZE];
   3731         int expectedcount = 0;
   3732 
   3733         monkey.setText(ustr);
   3734         int i;
   3735         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3736             if (expectedcount >= EXPECTEDSIZE) {
   3737                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3738                 return;
   3739             }
   3740             expected[expectedcount ++] = i;
   3741         }
   3742 
   3743         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3744     }
   3745     delete bi;
   3746 #endif
   3747 }
   3748 
   3749 void RBBITest::TestSentBreaks(void)
   3750 {
   3751 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3752     Locale        locale("en");
   3753     UErrorCode    status = U_ZERO_ERROR;
   3754     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   3755     UChar         str[200];
   3756     static const char *strlist[] =
   3757     {
   3758      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   3759      "This\n",
   3760      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   3761      "\"Sentence ending with a quote.\" Bye.",
   3762      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   3763      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   3764      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   3765      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   3766      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   3767      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   3768      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   3769              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   3770              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   3771              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   3772      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   3773              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   3774              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   3775              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   3776              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   3777              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   3778     };
   3779     int loop;
   3780     if (U_FAILURE(status)) {
   3781         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3782         return;
   3783     }
   3784     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3785         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3786         UnicodeString ustr(str);
   3787 
   3788         RBBISentMonkey monkey;
   3789         if (U_FAILURE(monkey.deferredStatus)) {
   3790             continue;
   3791         }
   3792 
   3793         const int EXPECTEDSIZE = 50;
   3794         int expected[EXPECTEDSIZE];
   3795         int expectedcount = 0;
   3796 
   3797         monkey.setText(ustr);
   3798         int i;
   3799         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3800             if (expectedcount >= EXPECTEDSIZE) {
   3801                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3802                 return;
   3803             }
   3804             expected[expectedcount ++] = i;
   3805         }
   3806 
   3807         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3808     }
   3809     delete bi;
   3810 #endif
   3811 }
   3812 
   3813 void RBBITest::TestMonkey(char *params) {
   3814 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3815 
   3816     UErrorCode     status    = U_ZERO_ERROR;
   3817     int32_t        loopCount = 500;
   3818     int32_t        seed      = 1;
   3819     UnicodeString  breakType = "all";
   3820     Locale         locale("en");
   3821     UBool          useUText  = FALSE;
   3822 
   3823     if (quick == FALSE) {
   3824         loopCount = 10000;
   3825     }
   3826 
   3827     if (params) {
   3828         UnicodeString p(params);
   3829         loopCount = getIntParam("loop", p, loopCount);
   3830         seed      = getIntParam("seed", p, seed);
   3831 
   3832         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   3833         if (m.find()) {
   3834             breakType = m.group(1, status);
   3835             m.reset();
   3836             p = m.replaceFirst("", status);
   3837         }
   3838 
   3839         RegexMatcher u(" *utext", p, 0, status);
   3840         if (u.find()) {
   3841             useUText = TRUE;
   3842             u.reset();
   3843             p = u.replaceFirst("", status);
   3844         }
   3845 
   3846 
   3847         // m.reset(p);
   3848         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   3849             // Each option is stripped out of the option string as it is processed.
   3850             // All options have been checked.  The option string should have been completely emptied..
   3851             char buf[100];
   3852             p.extract(buf, sizeof(buf), NULL, status);
   3853             buf[sizeof(buf)-1] = 0;
   3854             errln("Unrecognized or extra parameter:  %s\n", buf);
   3855             return;
   3856         }
   3857 
   3858     }
   3859 
   3860     if (breakType == "char" || breakType == "all") {
   3861         RBBICharMonkey  m;
   3862         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3863         if (U_SUCCESS(status)) {
   3864             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   3865             if (breakType == "all" && useUText==FALSE) {
   3866                 // Also run a quick test with UText when "all" is specified
   3867                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   3868             }
   3869         }
   3870         else {
   3871             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   3872         }
   3873         delete bi;
   3874     }
   3875 
   3876     if (breakType == "word" || breakType == "all") {
   3877         logln("Word Break Monkey Test");
   3878         RBBIWordMonkey  m;
   3879         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   3880         if (U_SUCCESS(status)) {
   3881             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   3882         }
   3883         else {
   3884             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   3885         }
   3886         delete bi;
   3887     }
   3888 
   3889     if (breakType == "line" || breakType == "all") {
   3890         logln("Line Break Monkey Test");
   3891         RBBILineMonkey  m;
   3892         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   3893         if (loopCount >= 10) {
   3894             loopCount = loopCount / 5;   // Line break runs slower than the others.
   3895         }
   3896         if (U_SUCCESS(status)) {
   3897             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   3898         }
   3899         else {
   3900             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3901         }
   3902         delete bi;
   3903     }
   3904 
   3905     if (breakType == "sent" || breakType == "all"  ) {
   3906         logln("Sentence Break Monkey Test");
   3907         RBBISentMonkey  m;
   3908         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   3909         if (loopCount >= 10) {
   3910             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   3911         }
   3912         if (U_SUCCESS(status)) {
   3913             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   3914         }
   3915         else {
   3916             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   3917         }
   3918         delete bi;
   3919     }
   3920 
   3921 #endif
   3922 }
   3923 
   3924 //
   3925 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   3926 //    Parameters:
   3927 //       bi      - the break iterator to use
   3928 //       mk      - MonkeyKind, abstraction for obtaining expected results
   3929 //       name    - Name of test (char, word, etc.) for use in error messages
   3930 //       seed    - Seed for starting random number generator (parameter from user)
   3931 //       numIterations
   3932 //
   3933 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   3934                          int32_t numIterations, UBool useUText) {
   3935 
   3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3937 
   3938     const int32_t    TESTSTRINGLEN = 500;
   3939     UnicodeString    testText;
   3940     int32_t          numCharClasses;
   3941     UVector          *chClasses;
   3942     int              expected[TESTSTRINGLEN*2 + 1];
   3943     int              expectedCount = 0;
   3944     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   3945     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   3946     char             reverseBreaks[TESTSTRINGLEN*2+1];
   3947     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   3948     char             followingBreaks[TESTSTRINGLEN*2+1];
   3949     char             precedingBreaks[TESTSTRINGLEN*2+1];
   3950     int              i;
   3951     int              loopCount = 0;
   3952 
   3953     m_seed = seed;
   3954 
   3955     numCharClasses = mk.charClasses()->size();
   3956     chClasses      = mk.charClasses();
   3957 
   3958     // Check for errors that occured during the construction of the MonkeyKind object.
   3959     //  Can't report them where they occured because errln() is a method coming from intlTest,
   3960     //  and is not visible outside of RBBITest :-(
   3961     if (U_FAILURE(mk.deferredStatus)) {
   3962         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   3963         return;
   3964     }
   3965 
   3966     // Verify that the character classes all have at least one member.
   3967     for (i=0; i<numCharClasses; i++) {
   3968         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   3969         if (s == NULL || s->size() == 0) {
   3970             errln("Character Class #%d is null or of zero size.", i);
   3971             return;
   3972         }
   3973     }
   3974 
   3975     while (loopCount < numIterations || numIterations == -1) {
   3976         if (numIterations == -1 && loopCount % 10 == 0) {
   3977             // If test is running in an infinite loop, display a periodic tic so
   3978             //   we can tell that it is making progress.
   3979             fprintf(stderr, ".");
   3980         }
   3981         // Save current random number seed, so that we can recreate the random numbers
   3982         //   for this loop iteration in event of an error.
   3983         seed = m_seed;
   3984 
   3985         // Populate a test string with data.
   3986         testText.truncate(0);
   3987         for (i=0; i<TESTSTRINGLEN; i++) {
   3988             int32_t  aClassNum = m_rand() % numCharClasses;
   3989             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   3990             int32_t   charIdx = m_rand() % classSet->size();
   3991             UChar32   c = classSet->charAt(charIdx);
   3992             if (c < 0) {   // TODO:  deal with sets containing strings.
   3993                 errln("c < 0");
   3994                 break;
   3995             }
   3996             testText.append(c);
   3997         }
   3998 
   3999         // Calculate the expected results for this test string.
   4000         mk.setText(testText);
   4001         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4002         expectedBreaks[0] = 1;
   4003         int32_t breakPos = 0;
   4004         expectedCount = 0;
   4005         for (;;) {
   4006             breakPos = mk.next(breakPos);
   4007             if (breakPos == -1) {
   4008                 break;
   4009             }
   4010             if (breakPos > testText.length()) {
   4011                 errln("breakPos > testText.length()");
   4012             }
   4013             expectedBreaks[breakPos] = 1;
   4014             U_ASSERT(expectedCount<testText.length());
   4015             expected[expectedCount ++] = breakPos;
   4016             (void)expected;   // Set but not used warning.
   4017                               // TODO (andy): check it out.
   4018         }
   4019 
   4020         // Find the break positions using forward iteration
   4021         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4022         if (useUText) {
   4023             UErrorCode status = U_ZERO_ERROR;
   4024             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4025             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4026             bi->setText(testUText, status);
   4027             TEST_ASSERT_SUCCESS(status);
   4028             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4029                                       //  This UText can be closed immediately, so long as the
   4030                                       //  testText string continues to exist.
   4031         } else {
   4032             bi->setText(testText);
   4033         }
   4034 
   4035         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4036             if (i < 0 || i > testText.length()) {
   4037                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4038                 break;
   4039             }
   4040             forwardBreaks[i] = 1;
   4041         }
   4042 
   4043         // Find the break positions using reverse iteration
   4044         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4045         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4046             if (i < 0 || i > testText.length()) {
   4047                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4048                 break;
   4049             }
   4050             reverseBreaks[i] = 1;
   4051         }
   4052 
   4053         // Find the break positions using isBoundary() tests.
   4054         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4055         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4056         for (i=0; i<=testText.length(); i++) {
   4057             isBoundaryBreaks[i] = bi->isBoundary(i);
   4058         }
   4059 
   4060 
   4061         // Find the break positions using the following() function.
   4062         // printf(".");
   4063         memset(followingBreaks, 0, sizeof(followingBreaks));
   4064         int32_t   lastBreakPos = 0;
   4065         followingBreaks[0] = 1;
   4066         for (i=0; i<testText.length(); i++) {
   4067             breakPos = bi->following(i);
   4068             if (breakPos <= i ||
   4069                 breakPos < lastBreakPos ||
   4070                 breakPos > testText.length() ||
   4071                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4072                 errln("%s break monkey test: "
   4073                     "Out of range value returned by BreakIterator::following().\n"
   4074                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4075                          name, seed, i, breakPos, lastBreakPos);
   4076                 break;
   4077             }
   4078             followingBreaks[breakPos] = 1;
   4079             lastBreakPos = breakPos;
   4080         }
   4081 
   4082         // Find the break positions using the preceding() function.
   4083         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4084         lastBreakPos = testText.length();
   4085         precedingBreaks[testText.length()] = 1;
   4086         for (i=testText.length(); i>0; i--) {
   4087             breakPos = bi->preceding(i);
   4088             if (breakPos >= i ||
   4089                 breakPos > lastBreakPos ||
   4090                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4091                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4092                 errln("%s break monkey test: "
   4093                     "Out of range value returned by BreakIterator::preceding().\n"
   4094                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4095                     name,  i, breakPos, lastBreakPos);
   4096                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4097                     precedingBreaks[i] = 2;   // Forces an error.
   4098                 }
   4099             } else {
   4100                 if (breakPos >= 0) {
   4101                     precedingBreaks[breakPos] = 1;
   4102                 }
   4103                 lastBreakPos = breakPos;
   4104             }
   4105         }
   4106 
   4107         // Compare the expected and actual results.
   4108         for (i=0; i<=testText.length(); i++) {
   4109             const char *errorType = NULL;
   4110             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4111                 errorType = "next()";
   4112             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4113                 errorType = "previous()";
   4114             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4115                 errorType = "isBoundary()";
   4116             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4117                 errorType = "following()";
   4118             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4119                 errorType = "preceding()";
   4120             }
   4121 
   4122 
   4123             if (errorType != NULL) {
   4124                 // Format a range of the test text that includes the failure as
   4125                 //  a data item that can be included in the rbbi test data file.
   4126 
   4127                 // Start of the range is the last point where expected and actual results
   4128                 //   both agreed that there was a break position.
   4129                 int startContext = i;
   4130                 int32_t count = 0;
   4131                 for (;;) {
   4132                     if (startContext==0) { break; }
   4133                     startContext --;
   4134                     if (expectedBreaks[startContext] != 0) {
   4135                         if (count == 2) break;
   4136                         count ++;
   4137                     }
   4138                 }
   4139 
   4140                 // End of range is two expected breaks past the start position.
   4141                 int endContext = i + 1;
   4142                 int ci;
   4143                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4144                     for (;;) {
   4145                         if (endContext >= testText.length()) {break;}
   4146                         if (expectedBreaks[endContext-1] != 0) {
   4147                             if (count == 0) break;
   4148                             count --;
   4149                         }
   4150                         endContext ++;
   4151                     }
   4152                 }
   4153 
   4154                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4155                 UnicodeString errorText = "<data>";
   4156                 /***if (strcmp(errorType, "next()") == 0) {
   4157                     startContext = 0;
   4158                     endContext = testText.length();
   4159 
   4160                     printStringBreaks(testText, expected, expectedCount);
   4161                 }***/
   4162 
   4163                 for (ci=startContext; ci<endContext;) {
   4164                     UnicodeString hexChars("0123456789abcdef");
   4165                     UChar32  c;
   4166                     int      bn;
   4167                     c = testText.char32At(ci);
   4168                     if (ci == i) {
   4169                         // This is the location of the error.
   4170                         errorText.append("<?>");
   4171                     } else if (expectedBreaks[ci] != 0) {
   4172                         // This a non-error expected break position.
   4173                         errorText.append("\\");
   4174                     }
   4175                     if (c < 0x10000) {
   4176                         errorText.append("\\u");
   4177                         for (bn=12; bn>=0; bn-=4) {
   4178                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4179                         }
   4180                     } else {
   4181                         errorText.append("\\U");
   4182                         for (bn=28; bn>=0; bn-=4) {
   4183                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4184                         }
   4185                     }
   4186                     ci = testText.moveIndex32(ci, 1);
   4187                 }
   4188                 errorText.append("\\");
   4189                 errorText.append("</data>\n");
   4190 
   4191                 // Output the error
   4192                 char  charErrorTxt[500];
   4193                 UErrorCode status = U_ZERO_ERROR;
   4194                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4195                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4196                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4197 
   4198                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4199                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4200                     errorType, seed, i, charErrorTxt);
   4201                 break;
   4202             }
   4203         }
   4204 
   4205         loopCount++;
   4206     }
   4207 #endif
   4208 }
   4209 
   4210 
   4211 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4212 //             This test checks the initial patch,
   4213 //             which is to just keep it from crashing.  Correct word boundaries
   4214 //             await a proper fix to the dictionary code.
   4215 //
   4216 void RBBITest::TestBug5532(void)  {
   4217    // Text includes a mixture of Thai and Latin.
   4218    const unsigned char utf8Data[] = {
   4219            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4220            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4221            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4222            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4223            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4224            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4225            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4226            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4227            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4228            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4229            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4230 
   4231     UErrorCode status = U_ZERO_ERROR;
   4232     UText utext=UTEXT_INITIALIZER;
   4233     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4234     TEST_ASSERT_SUCCESS(status);
   4235 
   4236     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4237     TEST_ASSERT_SUCCESS(status);
   4238     if (U_SUCCESS(status)) {
   4239         bi->setText(&utext, status);
   4240         TEST_ASSERT_SUCCESS(status);
   4241 
   4242         int32_t breakCount = 0;
   4243         int32_t previousBreak = -1;
   4244         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4245             // For now, just make sure that the break iterator doesn't hang.
   4246             TEST_ASSERT(previousBreak < bi->current());
   4247             previousBreak = bi->current();
   4248         }
   4249         TEST_ASSERT(breakCount > 0);
   4250     }
   4251     delete bi;
   4252     utext_close(&utext);
   4253 }
   4254 
   4255 
   4256 void RBBITest::TestBug9983(void)  {
   4257     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4258                                        "\\uFF65"  //   Other
   4259                                        "\\u309C"  //   Katakana
   4260                                        "\\uFF9F"  //   Extend
   4261                                        "\\uFF65"  //   Other
   4262                                        "\\u0020"  //   Other
   4263                                        "\\u0000").unescape();
   4264 
   4265     UErrorCode status = U_ZERO_ERROR;
   4266     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4267         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4268     TEST_ASSERT_SUCCESS(status);
   4269     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
   4270         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
   4271     TEST_ASSERT_SUCCESS(status);
   4272     if (U_FAILURE(status)) {
   4273         return;
   4274     }
   4275     int32_t offset, rstatus, iterationCount;
   4276 
   4277     brkiter->setText(text);
   4278     brkiter->last();
   4279     iterationCount = 0;
   4280     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4281         iterationCount++;
   4282         rstatus = brkiter->getRuleStatus();
   4283         (void)rstatus;     // Suppress set but not used warning.
   4284         if (iterationCount >= 10) {
   4285            break;
   4286         }
   4287     }
   4288     TEST_ASSERT(iterationCount == 6);
   4289 
   4290     brkiterPOSIX->setText(text);
   4291     brkiterPOSIX->last();
   4292     iterationCount = 0;
   4293     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
   4294         iterationCount++;
   4295         rstatus = brkiterPOSIX->getRuleStatus();
   4296         (void)rstatus;     // Suppress set but not used warning.
   4297         if (iterationCount >= 10) {
   4298            break;
   4299         }
   4300     }
   4301     TEST_ASSERT(iterationCount == 6);
   4302 }
   4303 
   4304 
   4305 //
   4306 //  TestDebug    -  A place-holder test for debugging purposes.
   4307 //                  For putting in fragments of other tests that can be invoked
   4308 //                  for tracing  without a lot of unwanted extra stuff happening.
   4309 //
   4310 void RBBITest::TestDebug(void) {
   4311 #if 0
   4312     UErrorCode   status = U_ZERO_ERROR;
   4313     int pos = 0;
   4314     int ruleStatus = 0;
   4315 
   4316     RuleBasedBreakIterator* bi =
   4317        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4318        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4319        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4320     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4321     // UnicodeString s("Aaa.  Bcd");
   4322     s = s.unescape();
   4323     bi->setText(s);
   4324     UBool r = bi->isBoundary(8);
   4325     printf("%s", r?"true":"false");
   4326     return;
   4327     pos = bi->last();
   4328     do {
   4329         // ruleStatus = bi->getRuleStatus();
   4330         printf("%d\t%d\n", pos, ruleStatus);
   4331         pos = bi->previous();
   4332     } while (pos != BreakIterator::DONE);
   4333 #endif
   4334 }
   4335 
   4336 void RBBITest::TestProperties() {
   4337     UErrorCode errorCode = U_ZERO_ERROR;
   4338     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4339     if (!prependSet.isEmpty()) {
   4340         errln(
   4341             "[:GCB=Prepend:] is not empty any more. "
   4342             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4343             "change this test to the opposite condition.");
   4344     }
   4345 }
   4346 
   4347 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4348