Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 #include "unicode/regex.h"
     28 #endif
     29 #include "unicode/ustring.h"
     30 #include "unicode/utext.h"
     31 #include "intltest.h"
     32 #include "rbbitst.h"
     33 #include <string.h>
     34 #include "charstr.h"
     35 #include "uvector.h"
     36 #include "uvectr32.h"
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include "unicode/numfmt.h"
     40 #include "unicode/uscript.h"
     41 #include "cmemory.h"
     42 
     43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
     44 #include "unicode/filteredbrk.h"
     45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
     46 
// TEST_ASSERT(x): when the condition x is false, calls errln() with the
//   source file name and line number of the macro invocation.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, calls
//   errcheckln() with the file, line and the u_errorName() of the code.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     52 
     53 
     54 //---------------------------------------------
     55 // runIndexedTest
     56 //---------------------------------------------
     57 
     58 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
     64 
     65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     66 {
     67     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     68 
     69     switch (index) {
     70 #if !UCONFIG_NO_FILE_IO
     71         case 0: name = "TestBug4153072";
     72             if(exec) TestBug4153072();                         break;
     73 #else
     74         case 0: name = "skip";
     75             break;
     76 #endif
     77 
     78         case 1: name = "skip";
     79             break;
     80         case 2: name = "TestStatusReturn";
     81             if(exec) TestStatusReturn();                       break;
     82 
     83 #if !UCONFIG_NO_FILE_IO
     84         case 3: name = "TestUnicodeFiles";
     85             if(exec) TestUnicodeFiles();                       break;
     86         case 4: name = "TestEmptyString";
     87             if(exec) TestEmptyString();                        break;
     88 #else
     89         case 3: case 4: name = "skip";
     90             break;
     91 #endif
     92 
     93         case 5: name = "TestGetAvailableLocales";
     94             if(exec) TestGetAvailableLocales();                break;
     95 
     96         case 6: name = "TestGetDisplayName";
     97             if(exec) TestGetDisplayName();                     break;
     98 
     99 #if !UCONFIG_NO_FILE_IO
    100         case 7: name = "TestEndBehaviour";
    101             if(exec) TestEndBehaviour();                       break;
    102         case 8: case 9: case 10: name = "skip";
    103              break;
    104         case 11: name = "TestWordBreaks";
    105              if(exec) TestWordBreaks();                        break;
    106         case 12: name = "TestWordBoundary";
    107              if(exec) TestWordBoundary();                      break;
    108         case 13: name = "TestLineBreaks";
    109              if(exec) TestLineBreaks();                        break;
    110         case 14: name = "TestSentBreaks";
    111              if(exec) TestSentBreaks();                        break;
    112         case 15: name = "TestExtended";
    113              if(exec) TestExtended();                          break;
    114 #else
    115         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    116              break;
    117 #endif
    118 
    119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    120         case 16:
    121             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
    122 #else
    123         case 16:
    124              name = "skip";                                    break;
    125 #endif
    126 
    127 #if !UCONFIG_NO_FILE_IO
    128         case 17: name = "TestBug3818";
    129             if(exec) TestBug3818();                            break;
    130 #else
    131         case 17: name = "skip";
    132             break;
    133 #endif
    134 
    135         case 18: name = "skip";
    136             break;
    137         case 19: name = "TestDebug";
    138             if(exec) TestDebug();                              break;
    139         case 20: name = "skip";
    140             break;
    141 
    142 #if !UCONFIG_NO_FILE_IO
    143         case 21: name = "TestBug5775";
    144             if (exec) TestBug5775();                           break;
    145 #else
    146         case 21: name = "skip";
    147             break;
    148 #endif
    149 
    150         case 22: name = "TestBug9983";
    151             if (exec) TestBug9983();                           break;
    152         case 23: name = "TestDictRules";
    153             if (exec) TestDictRules();                         break;
    154         case 24: name = "TestBug5532";
    155             if (exec) TestBug5532();                           break;
    156         default: name = ""; break; //needed to end loop
    157     }
    158 }
    159 
    160 
//---------------------------------------------------------------------------
//
//   class BITestData   Holds a set of Break iterator test data and results
//                      Includes
//                         - the string data to be broken
//                         - a vector of the expected break positions.
//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
//                         - The expected break tag values.
//                         - Vectors of actual break positions and tag values.
//                         - Functions for comparing actual with expected and
//                            reporting errors.
//
//----------------------------------------------------------------------------
class BITestData {
public:
    UnicodeString    fDataToBreak;            // All data chunks added so far, concatenated.
    UVector          fExpectedBreakPositions; // Length of fDataToBreak after each chunk was added.
    UVector          fExpectedTags;           // Tag value supplied with each chunk.
    UVector          fLineNum;                // Line number supplied with each chunk.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;             // Rule status recorded with each actual break.

    // Constructs all of the vectors with the given status.
    BITestData(UErrorCode &status);
    // Appends one chunk.  NOTE: status is taken by value, so a failure that
    //   occurs inside this function is not reported back to the caller.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    // Compares actual against expected results and reports differences through test.
    void             checkResults(const char *heading, RBBITest *test);
    // Reports one break-position mismatch through test.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    // Empties the actual-results vectors; the expected data is left alone.
    void             clearResults();
};
    190 
    191 //
    192 // Constructor.
    193 //
    194 BITestData::BITestData(UErrorCode &status)
    195 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    196   fActualTags(status)
    197 {
    198 }
    199 
    200 //
    201 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    202 //                 The macro form collects the line number, which is helpful
    203 //                 when tracking down failures.
    204 //
    205 //                 A null data item is inserted at the start of each test's data
    206 //                  to put the starting zero into the data list.  The position saved for
    207 //                  each non-null item is its ending position.
    208 //
    209 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    211     if (U_FAILURE(status)) {return;}
    212     if (data != NULL) {
    213         fDataToBreak.append(CharsToUnicodeString(data));
    214     }
    215     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    216     fExpectedTags.addElement(tag, status);
    217     fLineNum.addElement(lineNum, status);
    218 }
    219 
    220 
    221 //
    222 //  checkResults.   Compare the actual and expected break positions, report any differences.
    223 //
    224 void BITestData::checkResults(const char *heading, RBBITest *test) {
    225     int32_t   expectedIndex = 0;
    226     int32_t   actualIndex = 0;
    227 
    228     for (;;) {
    229         // If we've run through both the expected and actual results vectors, we're done.
    230         //   break out of the loop.
    231         if (expectedIndex >= fExpectedBreakPositions.size() &&
    232             actualIndex   >= fActualBreakPositions.size()) {
    233             break;
    234         }
    235 
    236 
    237         if (expectedIndex >= fExpectedBreakPositions.size()) {
    238             err(heading, test, expectedIndex-1, actualIndex);
    239             actualIndex++;
    240             continue;
    241         }
    242 
    243         if (actualIndex >= fActualBreakPositions.size()) {
    244             err(heading, test, expectedIndex, actualIndex-1);
    245             expectedIndex++;
    246             continue;
    247         }
    248 
    249         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    250             err(heading, test, expectedIndex, actualIndex);
    251             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    252             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    253                 actualIndex++;
    254             } else {
    255                 expectedIndex++;
    256             }
    257             continue;
    258         }
    259 
    260         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    261             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    262                 heading, fLineNum.elementAt(expectedIndex),
    263                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    264         }
    265 
    266         actualIndex++;
    267         expectedIndex++;
    268     }
    269 }
    270 
    271 //
    272 //  err   -  An error was found.  Report it, along with information about where the
    273 //                                incorrectly broken test data appeared in the source file.
    274 //
    275 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    276 {
    277     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    278     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    279     int32_t   o        = 0;
    280     int32_t   line     = fLineNum.elementAti(expectedIdx);
    281     if (expectedIdx > 0) {
    282         // The line numbers are off by one because a premature break occurs somewhere
    283         //    within the previous item, rather than at the start of the current (expected) item.
    284         //    We want to report the offset of the unexpected break from the start of
    285         //      this previous item.
    286         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    287     }
    288     if (actual < expected) {
    289         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    290     } else {
    291         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    292     }
    293 }
    294 
    295 
    296 void BITestData::clearResults() {
    297     fActualBreakPositions.removeAllElements();
    298     fActualTags.removeAllElements();
    299 }
    300 
    301 
    302 //--------------------------------------------------------------------------------------
    303 //
    304 //    RBBITest    constructor and destructor
    305 //
    306 //--------------------------------------------------------------------------------------
    307 
    308 RBBITest::RBBITest() {
    309 }
    310 
    311 
    312 RBBITest::~RBBITest() {
    313 }
    314 
    315 //-----------------------------------------------------------------------------------
    316 //
    317 //   Test for status {tag} return value from break rules.
    318 //        TODO:  a more thorough test.
    319 //
    320 //-----------------------------------------------------------------------------------
    321 void RBBITest::TestStatusReturn() {
    322      UnicodeString rulesString1("$Letters = [:L:];\n"
    323                                   "$Numbers = [:N:];\n"
    324                                   "$Letters+{1};\n"
    325                                   "$Numbers+{2};\n"
    326                                   "Help\\ {4}/me\\!;\n"
    327                                   "[^$Letters $Numbers];\n"
    328                                   "!.*;\n", -1, US_INV);
    329      UnicodeString testString1  = "abc123..abc Help me Help me!";
    330                                 // 01234567890123456789012345678
    331      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    332      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    333 
    334      UErrorCode status=U_ZERO_ERROR;
    335      UParseError    parseError;
    336 
    337      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    338      if(U_FAILURE(status)) {
    339          dataerrln("FAIL : in construction - %s", u_errorName(status));
    340      } else {
    341          int32_t  pos;
    342          int32_t  i = 0;
    343          bi->setText(testString1);
    344          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    345              if (pos != bounds1[i]) {
    346                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    347                  break;
    348              }
    349 
    350              int tag = bi->getRuleStatus();
    351              if (tag != brkStatus[i]) {
    352                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    353                  break;
    354              }
    355              i++;
    356          }
    357      }
    358      delete bi;
    359 }
    360 
    361 
    362 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
    363     UErrorCode status = U_ZERO_ERROR;
    364     char name[100];
    365     printf("code    alpha extend alphanum type word sent line name\n");
    366     int nextExpectedIndex = 0;
    367     utext_setNativeIndex(tstr, 0);
    368     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
    369         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
    370             printf("------------------------------------------------ %d\n", j);
    371             ++nextExpectedIndex;
    372         }
    373 
    374         UChar32 c = utext_next32(tstr);
    375         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    376         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    377                            u_isUAlphabetic(c),
    378                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    379                            u_isalnum(c),
    380                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    381                                                   u_charType(c),
    382                                                   U_SHORT_PROPERTY_NAME),
    383                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    384                                                   u_getIntPropertyValue(c,
    385                                                           UCHAR_WORD_BREAK),
    386                                                   U_SHORT_PROPERTY_NAME),
    387                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    388                                    u_getIntPropertyValue(c,
    389                                            UCHAR_SENTENCE_BREAK),
    390                                    U_SHORT_PROPERTY_NAME),
    391                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    392                                    u_getIntPropertyValue(c,
    393                                            UCHAR_LINE_BREAK),
    394                                    U_SHORT_PROPERTY_NAME),
    395                            name);
    396     }
    397 }
    398 
    399 
    400 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
    401    UErrorCode status = U_ZERO_ERROR;
    402    UText *tstr = NULL;
    403    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
    404    if (U_FAILURE(status)) {
    405        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
    406        return;
    407     }
    408    printStringBreaks(tstr, expected, expectedCount);
    409    utext_close(tstr);
    410 }
    411 
    412 
    413 void RBBITest::TestBug3818() {
    414     UErrorCode  status = U_ZERO_ERROR;
    415 
    416     // Four Thai words...
    417     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    418                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    419     UnicodeString  thaiStr(thaiWordData);
    420 
    421     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
    422     if (U_FAILURE(status) || bi == NULL) {
    423         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    424         return;
    425     }
    426     bi->setText(thaiStr);
    427 
    428     int32_t  startOfSecondWord = bi->following(1);
    429     if (startOfSecondWord != 4) {
    430         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    431             __FILE__, __LINE__, startOfSecondWord);
    432     }
    433     startOfSecondWord = bi->following(0);
    434     if (startOfSecondWord != 4) {
    435         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    436             __FILE__, __LINE__, startOfSecondWord);
    437     }
    438     delete bi;
    439 }
    440 
    441 //----------------------------------------------------------------------------
    442 //
    443 // generalIteratorTest      Given a break iterator and a set of test data,
    444 //                          Run the tests and report the results.
    445 //
    446 //----------------------------------------------------------------------------
    447 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    448 {
    449 
    450     bi.setText(td.fDataToBreak);
    451 
    452     testFirstAndNext(bi, td);
    453 
    454     testLastAndPrevious(bi, td);
    455 
    456     testFollowing(bi, td);
    457     testPreceding(bi, td);
    458     testIsBoundary(bi, td);
    459     doMultipleSelectionTest(bi, td);
    460 }
    461 
    462 
    463 //
    464 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    465 //                       kind of loop.
    466 //
    467 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    468 {
    469     UErrorCode  status = U_ZERO_ERROR;
    470     int32_t     p;
    471     int32_t     lastP = -1;
    472     int32_t     tag;
    473 
    474     logln("Test first and next");
    475     bi.setText(td.fDataToBreak);
    476     td.clearResults();
    477 
    478     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    479         td.fActualBreakPositions.addElement(p, status);  // Save result.
    480         tag = bi.getRuleStatus();
    481         td.fActualTags.addElement(tag, status);
    482         if (p <= lastP) {
    483             // If the iterator is not making forward progress, stop.
    484             //  No need to raise an error here, it'll be detected in the normal check of results.
    485             break;
    486         }
    487         lastP = p;
    488     }
    489     td.checkResults("testFirstAndNext", this);
    490 }
    491 
    492 
    493 //
    494 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    495 //
    496 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    497 {
    498     UErrorCode  status = U_ZERO_ERROR;
    499     int32_t     p;
    500     int32_t     lastP  = 0x7ffffffe;
    501     int32_t     tag;
    502 
    503     logln("Test last and previous");
    504     bi.setText(td.fDataToBreak);
    505     td.clearResults();
    506 
    507     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    508         // Save break position.  Insert it at start of vector of results, shoving
    509         //    already-saved results further towards the end.
    510         td.fActualBreakPositions.insertElementAt(p, 0, status);
    511         // bi.previous();   // TODO:  Why does this fix things up????
    512         // bi.next();
    513         tag = bi.getRuleStatus();
    514         td.fActualTags.insertElementAt(tag, 0, status);
    515         if (p >= lastP) {
    516             // If the iterator is not making progress, stop.
    517             //  No need to raise an error here, it'll be detected in the normal check of results.
    518             break;
    519         }
    520         lastP = p;
    521     }
    522     td.checkResults("testLastAndPrevious", this);
    523 }
    524 
    525 
    526 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    527 {
    528     UErrorCode  status = U_ZERO_ERROR;
    529     int32_t     p;
    530     int32_t     tag;
    531     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    532                                  //   cannot be -1; that is returned for DONE.
    533     int         i;
    534 
    535     logln("testFollowing():");
    536     bi.setText(td.fDataToBreak);
    537     td.clearResults();
    538 
    539     // Save the starting point, since we won't get that out of following.
    540     p = bi.first();
    541     td.fActualBreakPositions.addElement(p, status);  // Save result.
    542     tag = bi.getRuleStatus();
    543     td.fActualTags.addElement(tag, status);
    544 
    545     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    546         p = bi.following(i);
    547         if (p != lastP) {
    548             if (p == RuleBasedBreakIterator::DONE) {
    549                 break;
    550             }
    551             // We've reached a new break position.  Save it.
    552             td.fActualBreakPositions.addElement(p, status);  // Save result.
    553             tag = bi.getRuleStatus();
    554             td.fActualTags.addElement(tag, status);
    555             lastP = p;
    556         }
    557     }
    558     // The loop normally exits by means of the break in the middle.
    559     // Make sure that the index was at the correct position for the break iterator to have
    560     //   returned DONE.
    561     if (i != td.fDataToBreak.length()) {
    562         errln("testFollowing():  iterator returned DONE prematurely.");
    563     }
    564 
    565     // Full check of all results.
    566     td.checkResults("testFollowing", this);
    567 }
    568 
    569 
    570 
    571 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    572     UErrorCode  status = U_ZERO_ERROR;
    573     int32_t     p;
    574     int32_t     tag;
    575     int32_t     lastP  = 0x7ffffffe;
    576     int         i;
    577 
    578     logln("testPreceding():");
    579     bi.setText(td.fDataToBreak);
    580     td.clearResults();
    581 
    582     p = bi.last();
    583     td.fActualBreakPositions.addElement(p, status);
    584     tag = bi.getRuleStatus();
    585     td.fActualTags.addElement(tag, status);
    586 
    587     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    588         p = bi.preceding(i);
    589         if (p != lastP) {
    590             if (p == RuleBasedBreakIterator::DONE) {
    591                 break;
    592             }
    593             // We've reached a new break position.  Save it.
    594             td.fActualBreakPositions.insertElementAt(p, 0, status);
    595             lastP = p;
    596             tag = bi.getRuleStatus();
    597             td.fActualTags.insertElementAt(tag, 0, status);
    598         }
    599     }
    600     // The loop normally exits by means of the break in the middle.
    601     // Make sure that the index was at the correct position for the break iterator to have
    602     //   returned DONE.
    603     if (i != 0) {
    604         errln("testPreceding():  iterator returned DONE prematurely.");
    605     }
    606 
    607     // Full check of all results.
    608     td.checkResults("testPreceding", this);
    609 }
    610 
    611 
    612 
    613 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    614     UErrorCode  status = U_ZERO_ERROR;
    615     int         i;
    616     int32_t     tag;
    617 
    618     logln("testIsBoundary():");
    619     bi.setText(td.fDataToBreak);
    620     td.clearResults();
    621 
    622     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    623         if (bi.isBoundary(i)) {
    624             td.fActualBreakPositions.addElement(i, status);  // Save result.
    625             tag = bi.getRuleStatus();
    626             td.fActualTags.addElement(tag, status);
    627         }
    628     }
    629     td.checkResults("testIsBoundary: ", this);
    630 }
    631 
    632 
    633 
    634 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    635 {
    636     iterator.setText(td.fDataToBreak);
    637 
    638     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    639     int32_t offset = iterator.first();
    640     int32_t testOffset;
    641     int32_t count = 0;
    642 
    643     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    644 
    645     if (*testIterator != iterator)
    646         errln("clone() or operator!= failed: two clones compared unequal");
    647 
    648     do {
    649         testOffset = testIterator->first();
    650         testOffset = testIterator->next(count);
    651         if (offset != testOffset)
    652             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    653 
    654         if (offset != RuleBasedBreakIterator::DONE) {
    655             count++;
    656             offset = iterator.next();
    657 
    658             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    659                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    660                 if (count > 10000 || offset == -1) {
    661                     errln("operator== failed too many times. Stopping test.");
    662                     if (offset == -1) {
    663                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    664                     }
    665                     return;
    666                 }
    667             }
    668         }
    669     } while (offset != RuleBasedBreakIterator::DONE);
    670 
    671     // now do it backwards...
    672     offset = iterator.last();
    673     count = 0;
    674 
    675     do {
    676         testOffset = testIterator->last();
    677         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    678         if (offset != testOffset)
    679             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    680 
    681         if (offset != RuleBasedBreakIterator::DONE) {
    682             count--;
    683             offset = iterator.previous();
    684         }
    685     } while (offset != RuleBasedBreakIterator::DONE);
    686 
    687     delete testIterator;
    688 }
    689 
    690 
    691 //---------------------------------------------
    692 //
    693 //     other tests
    694 //
    695 //---------------------------------------------
    696 void RBBITest::TestEmptyString()
    697 {
    698     UnicodeString text = "";
    699     UErrorCode status = U_ZERO_ERROR;
    700 
    701     BITestData x(status);
    702     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    703     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    704     if (U_FAILURE(status))
    705     {
    706         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    707         return;
    708     }
    709     generalIteratorTest(*bi, x);
    710     delete bi;
    711 }
    712 
    713 void RBBITest::TestGetAvailableLocales()
    714 {
    715     int32_t locCount = 0;
    716     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    717 
    718     if (locCount == 0)
    719         dataerrln("getAvailableLocales() returned an empty list!");
    720     // Just make sure that it's returning good memory.
    721     int32_t i;
    722     for (i = 0; i < locCount; ++i) {
    723         logln(locList[i].getName());
    724     }
    725 }
    726 
    727 //Testing the BreakIterator::getDisplayName() function
    728 void RBBITest::TestGetDisplayName()
    729 {
    730     UnicodeString   result;
    731 
    732     BreakIterator::getDisplayName(Locale::getUS(), result);
    733     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    734         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    735                 + result);
    736 
    737     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    738     if (result != "French (France)")
    739         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    740                 + result);
    741 }
    742 /**
    743  * Test End Behaviour
    744  * @bug 4068137
    745  */
    746 void RBBITest::TestEndBehaviour()
    747 {
    748     UErrorCode status = U_ZERO_ERROR;
    749     UnicodeString testString("boo.");
    750     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    751     if (U_FAILURE(status))
    752     {
    753         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    754         return;
    755     }
    756     wb->setText(testString);
    757 
    758     if (wb->first() != 0)
    759         errln("Didn't get break at beginning of string.");
    760     if (wb->next() != 3)
    761         errln("Didn't get break before period in \"boo.\"");
    762     if (wb->current() != 4 && wb->next() != 4)
    763         errln("Didn't get break at end of string.");
    764     delete wb;
    765 }
    766 /*
    767  * @bug 4153072
    768  */
    769 void RBBITest::TestBug4153072() {
    770     UErrorCode status = U_ZERO_ERROR;
    771     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    772     if (U_FAILURE(status))
    773     {
    774         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    775         return;
    776     }
    777     UnicodeString str("...Hello, World!...");
    778     int32_t begin = 3;
    779     int32_t end = str.length() - 3;
    780     UBool onBoundary;
    781 
    782     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    783     iter->adoptText(textIterator);
    784     int index;
    785     // Note: with the switch to UText, there is no way to restrict the
    786     //       iteration range to begin at an index other than zero.
    787     //       String character iterators created with a non-zero bound are
    788     //         treated by RBBI as being empty.
    789     for (index = -1; index < begin + 1; ++index) {
    790         onBoundary = iter->isBoundary(index);
    791         if (index == 0?  !onBoundary : onBoundary) {
    792             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    793                             " and begin index = " + begin);
    794         }
    795     }
    796     delete iter;
    797 }
    798 
    799 
    800 //
    801 // Test for problem reported by Ashok Matoria on 9 July 2007
    802 //    One.<kSoftHyphen><kSpace>Two.
    803 //
    804 //    Sentence break at start (0) and then on calling next() it breaks at
    805 //   'T' of "Two". Now, at this point if I do next() and
    806 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    807 //
    808 void RBBITest::TestBug5775() {
    809     UErrorCode status = U_ZERO_ERROR;
    810     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    811     TEST_ASSERT_SUCCESS(status);
    812     if (U_FAILURE(status)) {
    813         return;
    814     }
    815 // Check for status first for better handling of no data errors.
    816     TEST_ASSERT(bi != NULL);
    817     if (bi == NULL) {
    818         return;
    819     }
    820 
    821     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    822     //               01234      56789
    823     s = s.unescape();
    824     bi->setText(s);
    825     int pos = bi->next();
    826     TEST_ASSERT(pos == 6);
    827     pos = bi->next();
    828     TEST_ASSERT(pos == 10);
    829     pos = bi->previous();
    830     TEST_ASSERT(pos == 6);
    831     delete bi;
    832 }
    833 
    834 
    835 
    836 //------------------------------------------------------------------------------
    837 //
    838 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    839 //
    840 //------------------------------------------------------------------------------
    841 
    842 struct TestParams {
    843     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
    844                                            //   Changed out whenever test data changes break type.
    845 
    846     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
    847     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
    848     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
    849     UVector32       *srcCol;
    850 
    851     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
    852     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
    853     CharString       utf8String;           // UTF-8 form of text to break.
    854 
    855     TestParams(UErrorCode &status) : dataToBreak() {
    856         bi               = NULL;
    857         expectedBreaks   = new UVector32(status);
    858         srcLine          = new UVector32(status);
    859         srcCol           = new UVector32(status);
    860         textToBreak      = NULL;
    861         textMap          = new UVector32(status);
    862     }
    863 
    864     ~TestParams() {
    865         delete bi;
    866         delete expectedBreaks;
    867         delete srcLine;
    868         delete srcCol;
    869         utext_close(textToBreak);
    870         delete textMap;
    871     }
    872 
    873     int32_t getSrcLine(int32_t bp);
    874     int32_t getExpectedBreak(int32_t bp);
    875     int32_t getSrcCol(int32_t bp);
    876 
    877     void setUTF16(UErrorCode &status);
    878     void setUTF8(UErrorCode &status);
    879 };
    880 
    881 // Append a UnicodeString to a CharString with UTF-8 encoding.
    882 // Substitute any invalid chars.
    883 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
    884 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
    885     if (U_FAILURE(status)) {
    886         return;
    887     }
    888     int32_t utf8Length;
    889     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
    890                        src.getBuffer(), src.length(),   // UTF-16 data
    891                        0xfffd, NULL,                    // Substitution char, number of subs.
    892                        &status);
    893     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    894         return;
    895     }
    896     status = U_ZERO_ERROR;
    897     int32_t capacity;
    898     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
    899     u_strToUTF8WithSub(buffer, utf8Length, NULL,
    900                        src.getBuffer(), src.length(),
    901                        0xfffd, NULL, &status);
    902     dest.append(buffer, utf8Length, status);
    903 }
    904 
    905 
    906 void TestParams::setUTF16(UErrorCode &status) {
    907     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
    908     textMap->removeAllElements();
    909     for (int32_t i=0; i<dataToBreak.length(); i++) {
    910         if (i == dataToBreak.getChar32Start(i)) {
    911             textMap->addElement(i, status);
    912         } else {
    913             textMap->addElement(-1, status);
    914         }
    915     }
    916     textMap->addElement(dataToBreak.length(), status);
    917     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
    918 }
    919 
    920 
    921 void TestParams::setUTF8(UErrorCode &status) {
    922     if (U_FAILURE(status)) {
    923         return;
    924     }
    925     utf8String.clear();
    926     CharStringAppend(utf8String, dataToBreak, status);
    927     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
    928     if (U_FAILURE(status)) {
    929         return;
    930     }
    931 
    932     textMap->removeAllElements();
    933     int32_t utf16Index = 0;
    934     for (;;) {
    935         textMap->addElement(utf16Index, status);
    936         UChar32 c32 = utext_current32(textToBreak);
    937         if (c32 < 0) {
    938             break;
    939         }
    940         utf16Index += U16_LENGTH(c32);
    941         utext_next32(textToBreak);
    942         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
    943             textMap->addElement(-1, status);
    944         }
    945     }
    946     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
    947 }
    948 
    949 
    950 int32_t TestParams::getSrcLine(int bp) {
    951     if (bp >= textMap->size()) {
    952         bp = textMap->size() - 1;
    953     }
    954     int32_t i = 0;
    955     for(; bp >= 0 ; --bp) {
    956         // Move to a character boundary if we are not on one already.
    957         i = textMap->elementAti(bp);
    958         if (i >= 0) {
    959             break;
    960         }
    961     }
    962     return srcLine->elementAti(i);
    963 }
    964 
    965 
    966 int32_t TestParams::getExpectedBreak(int bp) {
    967     if (bp >= textMap->size()) {
    968         return 0;
    969     }
    970     int32_t i = textMap->elementAti(bp);
    971     int32_t retVal = 0;
    972     if (i >= 0) {
    973         retVal = expectedBreaks->elementAti(i);
    974     }
    975     return retVal;
    976 }
    977 
    978 
    979 int32_t TestParams::getSrcCol(int bp) {
    980     if (bp >= textMap->size()) {
    981         bp = textMap->size() - 1;
    982     }
    983     int32_t i = 0;
    984     for(; bp >= 0; --bp) {
    985         // Move bp to a character boundary if we are not on one already.
    986         i = textMap->elementAti(bp);
    987         if (i >= 0) {
    988             break;
    989         }
    990     }
    991     return srcCol->elementAti(i);
    992 }
    993 
    994 
    995 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    996     int32_t    bp;
    997     int32_t    prevBP;
    998     int32_t    i;
    999 
   1000     TEST_ASSERT_SUCCESS(status);
   1001     if (U_FAILURE(status)) {
   1002         return;
   1003     }
   1004 
   1005     if (t->bi == NULL) {
   1006         return;
   1007     }
   1008 
   1009     t->bi->setText(t->textToBreak, status);
   1010     //
   1011     //  Run the iterator forward
   1012     //
   1013     prevBP = -1;
   1014     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1015         if (prevBP ==  bp) {
   1016             // Fail for lack of forward progress.
   1017             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1018                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1019             break;
   1020         }
   1021 
   1022         // Check that there we didn't miss an expected break between the last one
   1023         //  and this one.
   1024         for (i=prevBP+1; i<bp; i++) {
   1025             if (t->getExpectedBreak(i) != 0) {
   1026                 int expected[] = {0, i};
   1027                 printStringBreaks(t->dataToBreak, expected, 2);
   1028                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1029                       i, t->getSrcLine(i), t->getSrcCol(i));
   1030             }
   1031         }
   1032 
   1033         // Check that the break we did find was expected
   1034         if (t->getExpectedBreak(bp) == 0) {
   1035             int expected[] = {0, bp};
   1036             printStringBreaks(t->textToBreak, expected, 2);
   1037             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1038                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1039         } else {
   1040             // The break was expected.
   1041             //   Check that the {nnn} tag value is correct.
   1042             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1043             if (expectedTagVal == -1) {
   1044                 expectedTagVal = 0;
   1045             }
   1046             int32_t line = t->getSrcLine(bp);
   1047             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1048             if (rs != expectedTagVal) {
   1049                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1050                       "          Actual, Expected status = %4d, %4d",
   1051                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1052             }
   1053         }
   1054 
   1055         prevBP = bp;
   1056     }
   1057 
   1058     // Verify that there were no missed expected breaks after the last one found
   1059     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
   1060         if (t->getExpectedBreak(i) != 0) {
   1061             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1062                       i, t->getSrcLine(i), t->getSrcCol(i));
   1063         }
   1064     }
   1065 
   1066     //
   1067     //  Run the iterator backwards, verify that the same breaks are found.
   1068     //
   1069     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
   1070     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1071         if (prevBP ==  bp) {
   1072             // Fail for lack of progress.
   1073             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1074                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1075             break;
   1076         }
   1077 
   1078         // Check that we didn't miss an expected break between the last one
   1079         //  and this one.  (UVector returns zeros for index out of bounds.)
   1080         for (i=prevBP-1; i>bp; i--) {
   1081             if (t->getExpectedBreak(i) != 0) {
   1082                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1083                       i, t->getSrcLine(i), t->getSrcCol(i));
   1084             }
   1085         }
   1086 
   1087         // Check that the break we did find was expected
   1088         if (t->getExpectedBreak(bp) == 0) {
   1089             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1090                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1091         } else {
   1092             // The break was expected.
   1093             //   Check that the {nnn} tag value is correct.
   1094             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1095             if (expectedTagVal == -1) {
   1096                 expectedTagVal = 0;
   1097             }
   1098             int line = t->getSrcLine(bp);
   1099             int32_t rs = t->bi->getRuleStatus();
   1100             if (rs != expectedTagVal) {
   1101                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1102                       "          Actual, Expected status = %4d, %4d",
   1103                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1104             }
   1105         }
   1106 
   1107         prevBP = bp;
   1108     }
   1109 
   1110     // Verify that there were no missed breaks prior to the last one found
   1111     for (i=prevBP-1; i>=0; i--) {
   1112         if (t->getExpectedBreak(i) != 0) {
   1113             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1114                       i, t->getSrcLine(i), t->getSrcCol(i));
   1115         }
   1116     }
   1117 
   1118     // Check isBoundary()
   1119     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1120         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
   1121         UBool boundaryFound    = t->bi->isBoundary(i);
   1122         if (boundaryExpected != boundaryFound) {
   1123             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
   1124                   "        Expected, Actual= %s, %s",
   1125                   i, t->getSrcLine(i), t->getSrcCol(i),
   1126                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
   1127         }
   1128     }
   1129 
   1130     // Check following()
   1131     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1132         int32_t actualBreak = t->bi->following(i);
   1133         int32_t expectedBreak = BreakIterator::DONE;
   1134         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
   1135             if (t->getExpectedBreak(j) != 0) {
   1136                 expectedBreak = j;
   1137                 break;
   1138             }
   1139         }
   1140         if (expectedBreak != actualBreak) {
   1141             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
   1142                   "        Expected, Actual= %d, %d",
   1143                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1144         }
   1145     }
   1146 
   1147     // Check preceding()
   1148     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
   1149         int32_t actualBreak = t->bi->preceding(i);
   1150         int32_t expectedBreak = BreakIterator::DONE;
   1151 
   1152         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
   1153         // preceding(trailing byte) will return the index of some preceding code point,
   1154         // not the lead byte of the current code point, even though that has a smaller index.
   1155         // Therefore, start looking at the expected break data not at i-1, but at
   1156         // the start of code point index - 1.
   1157         utext_setNativeIndex(t->textToBreak, i);
   1158         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
   1159         for (; j >= 0; j--) {
   1160             if (t->getExpectedBreak(j) != 0) {
   1161                 expectedBreak = j;
   1162                 break;
   1163             }
   1164         }
   1165         if (expectedBreak != actualBreak) {
   1166             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1167                   "        Expected, Actual= %d, %d",
   1168                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1169         }
   1170     }
   1171 }
   1172 
   1173 
   1174 void RBBITest::TestExtended() {
   1175 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1176     UErrorCode      status  = U_ZERO_ERROR;
   1177     Locale          locale("");
   1178 
   1179     UnicodeString       rules;
   1180     TestParams          tp(status);
   1181 
   1182     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
   1183     if (U_FAILURE(status)) {
   1184         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1185     }
   1186 
   1187 
   1188     //
   1189     //  Open and read the test data file.
   1190     //
   1191     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1192     char testFileName[1000];
   1193     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1194         errln("Can't open test data.  Path too long.");
   1195         return;
   1196     }
   1197     strcpy(testFileName, testDataDirectory);
   1198     strcat(testFileName, "rbbitst.txt");
   1199 
   1200     int    len;
   1201     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1202     if (U_FAILURE(status)) {
   1203         return; /* something went wrong, error already output */
   1204     }
   1205 
   1206 
   1207     bool skipTest = false; // Skip this test?
   1208 
   1209     //
   1210     //  Put the test data into a UnicodeString
   1211     //
   1212     UnicodeString testString(FALSE, testFile, len);
   1213 
   1214     enum EParseState{
   1215         PARSE_COMMENT,
   1216         PARSE_TAG,
   1217         PARSE_DATA,
   1218         PARSE_NUM
   1219     }
   1220     parseState = PARSE_TAG;
   1221 
   1222     EParseState savedState = PARSE_TAG;
   1223 
   1224     static const UChar CH_LF        = 0x0a;
   1225     static const UChar CH_CR        = 0x0d;
   1226     static const UChar CH_HASH      = 0x23;
   1227     /*static const UChar CH_PERIOD    = 0x2e;*/
   1228     static const UChar CH_LT        = 0x3c;
   1229     static const UChar CH_GT        = 0x3e;
   1230     static const UChar CH_BACKSLASH = 0x5c;
   1231     static const UChar CH_BULLET    = 0x2022;
   1232 
   1233     int32_t    lineNum  = 1;
   1234     int32_t    colStart = 0;
   1235     int32_t    column   = 0;
   1236     int32_t    charIdx  = 0;
   1237 
   1238     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1239 
   1240     for (charIdx = 0; charIdx < len; ) {
   1241         status = U_ZERO_ERROR;
   1242         UChar  c = testString.charAt(charIdx);
   1243         charIdx++;
   1244         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1245             // treat CRLF as a unit
   1246             c = CH_LF;
   1247             charIdx++;
   1248         }
   1249         if (c == CH_LF || c == CH_CR) {
   1250             lineNum++;
   1251             colStart = charIdx;
   1252         }
   1253         column = charIdx - colStart + 1;
   1254 
   1255         switch (parseState) {
   1256         case PARSE_COMMENT:
   1257             if (c == 0x0a || c == 0x0d) {
   1258                 parseState = savedState;
   1259             }
   1260             break;
   1261 
   1262         case PARSE_TAG:
   1263             {
   1264             if (c == CH_HASH) {
   1265                 parseState = PARSE_COMMENT;
   1266                 savedState = PARSE_TAG;
   1267                 break;
   1268             }
   1269             if (u_isUWhiteSpace(c)) {
   1270                 break;
   1271             }
   1272             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1273                 delete tp.bi;
   1274                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1275                 skipTest = false;
   1276                 charIdx += 5;
   1277                 break;
   1278             }
   1279             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1280                 delete tp.bi;
   1281                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1282                 skipTest = false;
   1283                 charIdx += 5;
   1284                 break;
   1285             }
   1286             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1287                 delete tp.bi;
   1288                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1289                 skipTest = false;
   1290                 charIdx += 5;
   1291                 break;
   1292             }
   1293             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1294                 delete tp.bi;
   1295                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1296                 skipTest = false;
   1297                 charIdx += 5;
   1298                 break;
   1299             }
   1300             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1301                 delete tp.bi;
   1302                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1303                 charIdx += 6;
   1304                 break;
   1305             }
   1306 
   1307             // <locale  loc_name>
   1308             localeMatcher.reset(testString);
   1309             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1310                 UnicodeString localeName = localeMatcher.group(1, status);
   1311                 char localeName8[100];
   1312                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1313                 locale = Locale::createFromName(localeName8);
   1314                 charIdx += localeMatcher.group(0, status).length() - 1;
   1315                 TEST_ASSERT_SUCCESS(status);
   1316                 break;
   1317             }
   1318             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1319                 parseState = PARSE_DATA;
   1320                 charIdx += 5;
   1321                 tp.dataToBreak = "";
   1322                 tp.expectedBreaks->removeAllElements();
   1323                 tp.srcCol ->removeAllElements();
   1324                 tp.srcLine->removeAllElements();
   1325                 break;
   1326             }
   1327 
   1328             errln("line %d: Tag expected in test file.", lineNum);
   1329             parseState = PARSE_COMMENT;
   1330             savedState = PARSE_DATA;
   1331             goto end_test; // Stop the test.
   1332             }
   1333             break;
   1334 
   1335         case PARSE_DATA:
   1336             if (c == CH_BULLET) {
   1337                 int32_t  breakIdx = tp.dataToBreak.length();
   1338                 tp.expectedBreaks->setSize(breakIdx+1);
   1339                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1340                 tp.srcLine->setSize(breakIdx+1);
   1341                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1342                 tp.srcCol ->setSize(breakIdx+1);
   1343                 tp.srcCol ->setElementAt(column, breakIdx);
   1344                 break;
   1345             }
   1346 
   1347             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1348                 // Add final entry to mappings from break location to source file position.
   1349                 //  Need one extra because last break position returned is after the
   1350                 //    last char in the data, not at the last char.
   1351                 tp.srcLine->addElement(lineNum, status);
   1352                 tp.srcCol ->addElement(column, status);
   1353 
   1354                 parseState = PARSE_TAG;
   1355                 charIdx += 6;
   1356 
   1357                 if (!skipTest) {
   1358                     // RUN THE TEST!
   1359                     status = U_ZERO_ERROR;
   1360                     tp.setUTF16(status);
   1361                     executeTest(&tp, status);
   1362                     TEST_ASSERT_SUCCESS(status);
   1363 
   1364                     // Run again, this time with UTF-8 text wrapped in a UText.
   1365                     status = U_ZERO_ERROR;
   1366                     tp.setUTF8(status);
   1367                     TEST_ASSERT_SUCCESS(status);
   1368                     executeTest(&tp, status);
   1369                 }
   1370                 break;
   1371             }
   1372 
   1373             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1374                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1375                 // Get the code point from the name and insert it into the test data.
   1376                 //   (Damn, no API takes names in Unicode  !!!
   1377                 //    we've got to take it back to char *)
   1378                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1379                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1380                 char charNameBuf[200];
   1381                 UChar32 theChar = -1;
   1382                 if (nameEndIdx != -1) {
   1383                     UErrorCode status = U_ZERO_ERROR;
   1384                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1385                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1386                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1387                     if (U_FAILURE(status)) {
   1388                         theChar = -1;
   1389                     }
   1390                 }
   1391                 if (theChar == -1) {
   1392                     errln("Error in named character in test file at line %d, col %d",
   1393                         lineNum, column);
   1394                 } else {
   1395                     // Named code point was recognized.  Insert it
   1396                     //   into the test data.
   1397                     tp.dataToBreak.append(theChar);
   1398                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1399                         tp.srcLine->addElement(lineNum, status);
   1400                         tp.srcCol ->addElement(column, status);
   1401                     }
   1402                 }
   1403                 if (nameEndIdx > charIdx) {
   1404                     charIdx = nameEndIdx+1;
   1405 
   1406                 }
   1407                 break;
   1408             }
   1409 
   1410 
   1411 
   1412 
   1413             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1414                 charIdx++;
   1415                 int32_t  breakIdx = tp.dataToBreak.length();
   1416                 tp.expectedBreaks->setSize(breakIdx+1);
   1417                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1418                 tp.srcLine->setSize(breakIdx+1);
   1419                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1420                 tp.srcCol ->setSize(breakIdx+1);
   1421                 tp.srcCol ->setElementAt(column, breakIdx);
   1422                 break;
   1423             }
   1424 
   1425             if (c == CH_LT) {
   1426                 tagValue   = 0;
   1427                 parseState = PARSE_NUM;
   1428                 break;
   1429             }
   1430 
   1431             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1432                 parseState = PARSE_COMMENT;
   1433                 savedState = PARSE_DATA;
   1434                 break;
   1435             }
   1436 
   1437             if (c == CH_BACKSLASH) {
   1438                 // Check for \ at end of line, a line continuation.
   1439                 //     Advance over (discard) the newline
   1440                 UChar32 cp = testString.char32At(charIdx);
   1441                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1442                     // We have a CR LF
   1443                     //  Need an extra increment of the input ptr to move over both of them
   1444                     charIdx++;
   1445                 }
   1446                 if (cp == CH_LF || cp == CH_CR) {
   1447                     lineNum++;
   1448                     colStart = charIdx;
   1449                     charIdx++;
   1450                     break;
   1451                 }
   1452 
   1453                 // Let unescape handle the back slash.
   1454                 cp = testString.unescapeAt(charIdx);
   1455                 if (cp != -1) {
   1456                     // Escape sequence was recognized.  Insert the char
   1457                     //   into the test data.
   1458                     tp.dataToBreak.append(cp);
   1459                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1460                         tp.srcLine->addElement(lineNum, status);
   1461                         tp.srcCol ->addElement(column, status);
   1462                     }
   1463                     break;
   1464                 }
   1465 
   1466 
   1467                 // Not a recognized backslash escape sequence.
   1468                 // Take the next char as a literal.
   1469                 //  TODO:  Should this be an error?
   1470                 c = testString.charAt(charIdx);
   1471                 charIdx = testString.moveIndex32(charIdx, 1);
   1472             }
   1473 
   1474             // Normal, non-escaped data char.
   1475             tp.dataToBreak.append(c);
   1476 
   1477             // Save the mapping from offset in the data to line/column numbers in
   1478             //   the original input file.  Will be used for better error messages only.
   1479             //   If there's an expected break before this char, the slot in the mapping
   1480             //     vector will already be set for this char; don't overwrite it.
   1481             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1482                 tp.srcLine->addElement(lineNum, status);
   1483                 tp.srcCol ->addElement(column, status);
   1484             }
   1485             break;
   1486 
   1487 
   1488         case PARSE_NUM:
   1489             // We are parsing an expected numeric tag value, like <1234>,
   1490             //   within a chunk of data.
   1491             if (u_isUWhiteSpace(c)) {
   1492                 break;
   1493             }
   1494 
   1495             if (c == CH_GT) {
   1496                 // Finished the number.  Add the info to the expected break data,
   1497                 //   and switch parse state back to doing plain data.
   1498                 parseState = PARSE_DATA;
   1499                 if (tagValue == 0) {
   1500                     tagValue = -1;
   1501                 }
   1502                 int32_t  breakIdx = tp.dataToBreak.length();
   1503                 tp.expectedBreaks->setSize(breakIdx+1);
   1504                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1505                 tp.srcLine->setSize(breakIdx+1);
   1506                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1507                 tp.srcCol ->setSize(breakIdx+1);
   1508                 tp.srcCol ->setElementAt(column, breakIdx);
   1509                 break;
   1510             }
   1511 
   1512             if (u_isdigit(c)) {
   1513                 tagValue = tagValue*10 + u_charDigitValue(c);
   1514                 break;
   1515             }
   1516 
   1517             errln("Syntax Error in test file at line %d, col %d",
   1518                 lineNum, column);
   1519             parseState = PARSE_COMMENT;
   1520             goto end_test; // Stop the test
   1521             break;
   1522         }
   1523 
   1524 
   1525         if (U_FAILURE(status)) {
   1526             dataerrln("ICU Error %s while parsing test file at line %d.",
   1527                 u_errorName(status), lineNum);
   1528             status = U_ZERO_ERROR;
   1529             goto end_test; // Stop the test
   1530         }
   1531 
   1532     }
   1533 
   1534 end_test:
   1535     delete [] testFile;
   1536 #endif
   1537 }
   1538 
   1539 
   1540 //-------------------------------------------------------------------------------
   1541 //
   1542 //  TestDictRules   create a break iterator from source rules that includes a
   1543 //                  dictionary range.   Regression for bug #7130.  Source rules
   1544 //                  do not declare a break iterator type (word, line, sentence, etc.
   1545 //                  but the dictionary code, without a type, would loop.
   1546 //
   1547 //-------------------------------------------------------------------------------
   1548 void RBBITest::TestDictRules() {
   1549     const char *rules =  "$dictionary = [a-z]; \n"
   1550                          "!!forward; \n"
   1551                          "$dictionary $dictionary; \n"
   1552                          "!!reverse; \n"
   1553                          "$dictionary $dictionary; \n";
   1554     const char *text = "aa";
   1555     UErrorCode status = U_ZERO_ERROR;
   1556     UParseError parseError;
   1557 
   1558     RuleBasedBreakIterator bi(rules, parseError, status);
   1559     if (U_SUCCESS(status)) {
   1560         UnicodeString utext = text;
   1561         bi.setText(utext);
   1562         int32_t position;
   1563         int32_t loops;
   1564         for (loops = 0; loops<10; loops++) {
   1565             position = bi.next();
   1566             if (position == RuleBasedBreakIterator::DONE) {
   1567                 break;
   1568             }
   1569         }
   1570         TEST_ASSERT(loops == 1);
   1571     } else {
   1572         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1573     }
   1574 }
   1575 
   1576 
   1577 
   1578 //-------------------------------------------------------------------------------
   1579 //
   1580 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1581 //    return the datain one big UChar * buffer, which the caller must delete.
   1582 //
   1583 //    parameters:
   1584 //          fileName:   the name of the file, with no directory part.  The test data directory
   1585 //                      is assumed.
   1586 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1587 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1588 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1589 //                      Pass NULL for the system default encoding.
   1590 //          status
   1591 //    returns:
   1592 //                      The file data, converted to UChar.
   1593 //                      The caller must delete this when done with
   1594 //                           delete [] theBuffer;
   1595 //
   1596 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1597 //           Move this function to some common place.
   1598 //
   1599 //--------------------------------------------------------------------------------
   1600 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1601     UChar       *retPtr  = NULL;
   1602     char        *fileBuf = NULL;
   1603     UConverter* conv     = NULL;
   1604     FILE        *f       = NULL;
   1605 
   1606     ulen = 0;
   1607     if (U_FAILURE(status)) {
   1608         return retPtr;
   1609     }
   1610 
   1611     //
   1612     //  Open the file.
   1613     //
   1614     f = fopen(fileName, "rb");
   1615     if (f == 0) {
   1616         dataerrln("Error opening test data file %s\n", fileName);
   1617         status = U_FILE_ACCESS_ERROR;
   1618         return NULL;
   1619     }
   1620     //
   1621     //  Read it in
   1622     //
   1623     int   fileSize;
   1624     int   amt_read;
   1625 
   1626     fseek( f, 0, SEEK_END);
   1627     fileSize = ftell(f);
   1628     fileBuf = new char[fileSize];
   1629     fseek(f, 0, SEEK_SET);
   1630     amt_read = fread(fileBuf, 1, fileSize, f);
   1631     if (amt_read != fileSize || fileSize <= 0) {
   1632         errln("Error reading test data file.");
   1633         goto cleanUpAndReturn;
   1634     }
   1635 
   1636     //
   1637     // Look for a Unicode Signature (BOM) on the data just read
   1638     //
   1639     int32_t        signatureLength;
   1640     const char *   fileBufC;
   1641     const char*    bomEncoding;
   1642 
   1643     fileBufC = fileBuf;
   1644     bomEncoding = ucnv_detectUnicodeSignature(
   1645         fileBuf, fileSize, &signatureLength, &status);
   1646     if(bomEncoding!=NULL ){
   1647         fileBufC  += signatureLength;
   1648         fileSize  -= signatureLength;
   1649         encoding = bomEncoding;
   1650     }
   1651 
   1652     //
   1653     // Open a converter to take the rule file to UTF-16
   1654     //
   1655     conv = ucnv_open(encoding, &status);
   1656     if (U_FAILURE(status)) {
   1657         goto cleanUpAndReturn;
   1658     }
   1659 
   1660     //
   1661     // Convert the rules to UChar.
   1662     //  Preflight first to determine required buffer size.
   1663     //
   1664     ulen = ucnv_toUChars(conv,
   1665         NULL,           //  dest,
   1666         0,              //  destCapacity,
   1667         fileBufC,
   1668         fileSize,
   1669         &status);
   1670     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1671         // Buffer Overflow is expected from the preflight operation.
   1672         status = U_ZERO_ERROR;
   1673 
   1674         retPtr = new UChar[ulen+1];
   1675         ucnv_toUChars(conv,
   1676             retPtr,       //  dest,
   1677             ulen+1,
   1678             fileBufC,
   1679             fileSize,
   1680             &status);
   1681     }
   1682 
   1683 cleanUpAndReturn:
   1684     fclose(f);
   1685     delete []fileBuf;
   1686     ucnv_close(conv);
   1687     if (U_FAILURE(status)) {
   1688         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1689         delete []retPtr;
   1690         retPtr = 0;
   1691         ulen   = 0;
   1692     };
   1693     return retPtr;
   1694 }
   1695 
   1696 
   1697 
   1698 //--------------------------------------------------------------------------------------------
   1699 //
   1700 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1701 //
   1702 //-------------------------------------------------------------------------------------------
   1703 void RBBITest::TestUnicodeFiles() {
   1704     RuleBasedBreakIterator  *bi;
   1705     UErrorCode               status = U_ZERO_ERROR;
   1706 
   1707     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1708     TEST_ASSERT_SUCCESS(status);
   1709     if (U_SUCCESS(status)) {
   1710         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1711     }
   1712     delete bi;
   1713 
   1714     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1715     TEST_ASSERT_SUCCESS(status);
   1716     if (U_SUCCESS(status)) {
   1717         runUnicodeTestData("WordBreakTest.txt", bi);
   1718     }
   1719     delete bi;
   1720 
   1721     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1722     TEST_ASSERT_SUCCESS(status);
   1723     if (U_SUCCESS(status)) {
   1724         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1725     }
   1726     delete bi;
   1727 
   1728     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1729     TEST_ASSERT_SUCCESS(status);
   1730     if (U_SUCCESS(status)) {
   1731         runUnicodeTestData("LineBreakTest.txt", bi);
   1732     }
   1733     delete bi;
   1734 }
   1735 
   1736 
   1737 // Check for test cases from the Unicode test data files that are known to fail
   1738 // and should be skipped because ICU is not yet able to fully implement the spec.
   1739 // See ticket #7270.
   1740 
   1741 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
   1742     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
   1743         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
   1744         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
   1745         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
   1746         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
   1747         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
   1748         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
   1749     };
   1750     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
   1751         return FALSE;
   1752     }
   1753 
   1754     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
   1755         if (testCase == UnicodeString(badTestCases[i])) {
   1756             return logKnownIssue("7270");
   1757         }
   1758     }
   1759     return FALSE;
   1760 }
   1761 
   1762 
   1763 //--------------------------------------------------------------------------------------------
   1764 //
   1765 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1766 //
   1767 //-------------------------------------------------------------------------------------------
   1768 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1769 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1770     UErrorCode  status = U_ZERO_ERROR;
   1771 
   1772     //
   1773     //  Open and read the test data file, put it into a UnicodeString.
   1774     //
   1775     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1776     char testFileName[1000];
   1777     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1778         dataerrln("Can't open test data.  Path too long.");
   1779         return;
   1780     }
   1781     strcpy(testFileName, testDataDirectory);
   1782     strcat(testFileName, fileName);
   1783 
   1784     logln("Opening data file %s\n", fileName);
   1785 
   1786     int    len;
   1787     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1788     if (status != U_FILE_ACCESS_ERROR) {
   1789         TEST_ASSERT_SUCCESS(status);
   1790         TEST_ASSERT(testFile != NULL);
   1791     }
   1792     if (U_FAILURE(status) || testFile == NULL) {
   1793         return; /* something went wrong, error already output */
   1794     }
   1795     UnicodeString testFileAsString(TRUE, testFile, len);
   1796 
   1797     //
   1798     //  Parse the test data file using a regular expression.
   1799     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1800     //     is identified by which group had a match.
   1801     //
   1802     //    Caputure Group #                  1          2            3            4           5
   1803     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1804     //
   1805     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1806     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1807     UnicodeString   testString;
   1808     UVector32       breakPositions(status);
   1809     int             lineNumber = 1;
   1810     TEST_ASSERT_SUCCESS(status);
   1811     if (U_FAILURE(status)) {
   1812         return;
   1813     }
   1814 
   1815     //
   1816     //  Scan through each test case, building up the string to be broken in testString,
   1817     //   and the positions that should be boundaries in the breakPositions vector.
   1818     //
   1819     int spin = 0;
   1820     while (tokenMatcher.find()) {
   1821       	if(tokenMatcher.hitEnd()) {
   1822           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1823              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1824              and caused an infinite loop here on EBCDIC systems!
   1825           */
   1826           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1827           //	   return;
   1828       	}
   1829         if (tokenMatcher.start(1, status) >= 0) {
   1830             // Scanned a divide sign, indicating a break position in the test data.
   1831             if (testString.length()>0) {
   1832                 breakPositions.addElement(testString.length(), status);
   1833             }
   1834         }
   1835         else if (tokenMatcher.start(2, status) >= 0) {
   1836             // Scanned an 'x', meaning no break at this position in the test data
   1837             //   Nothing to be done here.
   1838             }
   1839         else if (tokenMatcher.start(3, status) >= 0) {
   1840             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1841             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1842             int length = hexNumber.length();
   1843             if (length<=8) {
   1844                 char buf[10];
   1845                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1846                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1847                 if (c<=0x10ffff) {
   1848                     testString.append(c);
   1849                 } else {
   1850                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1851                        fileName, lineNumber);
   1852                 }
   1853             } else {
   1854                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1855                        fileName, lineNumber);
   1856              }
   1857         }
   1858         else if (tokenMatcher.start(4, status) >= 0) {
   1859             // Scanned to end of a line, possibly skipping over a comment in the process.
   1860             //   If the line from the file contained test data, run the test now.
   1861             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
   1862                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1863             }
   1864 
   1865             // Clear out this test case.
   1866             //    The string and breakPositions vector will be refilled as the next
   1867             //       test case is parsed.
   1868             testString.remove();
   1869             breakPositions.removeAllElements();
   1870             lineNumber++;
   1871         } else {
   1872             // Scanner catchall.  Something unrecognized appeared on the line.
   1873             char token[16];
   1874             UnicodeString uToken = tokenMatcher.group(0, status);
   1875             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1876             token[sizeof(token)-1] = 0;
   1877             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1878 
   1879             // Clean up, in preparation for continuing with the next line.
   1880             testString.remove();
   1881             breakPositions.removeAllElements();
   1882             lineNumber++;
   1883         }
   1884         TEST_ASSERT_SUCCESS(status);
   1885         if (U_FAILURE(status)) {
   1886             break;
   1887         }
   1888     }
   1889 
   1890     delete [] testFile;
   1891  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1892 }
   1893 
   1894 //--------------------------------------------------------------------------------------------
   1895 //
   1896 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1897 //                            test data files.  Do only a simple, forward-only check -
   1898 //                            this test is mostly to check that ICU and the Unicode
   1899 //                            data agree with each other.
   1900 //
   1901 //--------------------------------------------------------------------------------------------
   1902 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1903                          const UnicodeString &testString,   // Text data to be broken
   1904                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1905                          RuleBasedBreakIterator *bi) {
   1906     int32_t pos;                 // Break Position in the test string
   1907     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1908     int32_t expectedPos;         // Expected break position (index into test string)
   1909 
   1910     bi->setText(testString);
   1911     pos = bi->first();
   1912     pos = bi->next();
   1913 
   1914     while (pos != BreakIterator::DONE) {
   1915         if (expectedI >= breakPositions->size()) {
   1916             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1917                 testFileName, lineNumber, pos);
   1918             break;
   1919         }
   1920         expectedPos = breakPositions->elementAti(expectedI);
   1921         if (pos < expectedPos) {
   1922             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1923                 testFileName, lineNumber, pos);
   1924             break;
   1925         }
   1926         if (pos > expectedPos) {
   1927             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1928                 testFileName, lineNumber, expectedPos);
   1929             break;
   1930         }
   1931         pos = bi->next();
   1932         expectedI++;
   1933     }
   1934 
   1935     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1936         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1937             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1938     }
   1939 }
   1940 
   1941 
   1942 
   1943 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1944 //---------------------------------------------------------------------------------------
   1945 //
   1946 //   classs RBBIMonkeyKind
   1947 //
   1948 //      Monkey Test for Break Iteration
   1949 //      Abstract interface class.   Concrete derived classes independently
   1950 //      implement the break rules for different iterator types.
   1951 //
   1952 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   1953 //      testing, but works purely in terms of the interface defined here.
   1954 //
   1955 //---------------------------------------------------------------------------------------
   1956 class RBBIMonkeyKind {
   1957 public:
   1958     // Return a UVector of UnicodeSets, representing the character classes used
   1959     //   for this type of iterator.
   1960     virtual  UVector  *charClasses() = 0;
   1961 
   1962     // Set the test text on which subsequent calls to next() will operate
   1963     virtual  void      setText(const UnicodeString &s) = 0;
   1964 
   1965     // Find the next break postion, starting from the prev break position, or from zero.
   1966     // Return -1 after reaching end of string.
   1967     virtual  int32_t   next(int32_t i) = 0;
   1968 
   1969     virtual ~RBBIMonkeyKind();
   1970     UErrorCode       deferredStatus;
   1971 
   1972 
   1973 protected:
   1974     RBBIMonkeyKind();
   1975 
   1976 private:
   1977 };
   1978 
   1979 RBBIMonkeyKind::RBBIMonkeyKind() {
   1980     deferredStatus = U_ZERO_ERROR;
   1981 }
   1982 
   1983 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1984 }
   1985 
   1986 
   1987 //----------------------------------------------------------------------------------------
   1988 //
   1989 //   Random Numbers.  Similar to standard lib rand() and srand()
   1990 //                    Not using library to
   1991 //                      1.  Get same results on all platforms.
   1992 //                      2.  Get access to current seed, to more easily reproduce failures.
   1993 //
   1994 //---------------------------------------------------------------------------------------
   1995 static uint32_t m_seed = 1;
   1996 
   1997 static uint32_t m_rand()
   1998 {
   1999     m_seed = m_seed * 1103515245 + 12345;
   2000     return (uint32_t)(m_seed/65536) % 32768;
   2001 }
   2002 
   2003 
   2004 //------------------------------------------------------------------------------------------
   2005 //
   2006 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2007 //                             of RBBIMonkeyKind.
   2008 //
   2009 //------------------------------------------------------------------------------------------
   2010 class RBBICharMonkey: public RBBIMonkeyKind {
   2011 public:
   2012     RBBICharMonkey();
   2013     virtual          ~RBBICharMonkey();
   2014     virtual  UVector *charClasses();
   2015     virtual  void     setText(const UnicodeString &s);
   2016     virtual  int32_t  next(int32_t i);
   2017 private:
   2018     UVector   *fSets;
   2019 
   2020     UnicodeSet  *fCRLFSet;
   2021     UnicodeSet  *fControlSet;
   2022     UnicodeSet  *fExtendSet;
   2023     UnicodeSet  *fRegionalIndicatorSet;
   2024     UnicodeSet  *fPrependSet;
   2025     UnicodeSet  *fSpacingSet;
   2026     UnicodeSet  *fLSet;
   2027     UnicodeSet  *fVSet;
   2028     UnicodeSet  *fTSet;
   2029     UnicodeSet  *fLVSet;
   2030     UnicodeSet  *fLVTSet;
   2031     UnicodeSet  *fHangulSet;
   2032     UnicodeSet  *fAnySet;
   2033 
   2034     const UnicodeString *fText;
   2035 };
   2036 
   2037 
   2038 RBBICharMonkey::RBBICharMonkey() {
   2039     UErrorCode  status = U_ZERO_ERROR;
   2040 
   2041     fText = NULL;
   2042 
   2043     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2044     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2045     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2046     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   2047     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2048     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2049     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2050     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2051     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2052     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2053     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2054     fHangulSet  = new UnicodeSet();
   2055     fHangulSet->addAll(*fLSet);
   2056     fHangulSet->addAll(*fVSet);
   2057     fHangulSet->addAll(*fTSet);
   2058     fHangulSet->addAll(*fLVSet);
   2059     fHangulSet->addAll(*fLVTSet);
   2060     fAnySet     = new UnicodeSet(0, 0x10ffff);
   2061 
   2062     fSets       = new UVector(status);
   2063     fSets->addElement(fCRLFSet,    status);
   2064     fSets->addElement(fControlSet, status);
   2065     fSets->addElement(fExtendSet,  status);
   2066     fSets->addElement(fRegionalIndicatorSet, status);
   2067     if (!fPrependSet->isEmpty()) {
   2068         fSets->addElement(fPrependSet, status);
   2069     }
   2070     fSets->addElement(fSpacingSet, status);
   2071     fSets->addElement(fHangulSet,  status);
   2072     fSets->addElement(fAnySet,     status);
   2073     if (U_FAILURE(status)) {
   2074         deferredStatus = status;
   2075     }
   2076 }
   2077 
   2078 
   2079 void RBBICharMonkey::setText(const UnicodeString &s) {
   2080     fText = &s;
   2081 }
   2082 
   2083 
   2084 
   2085 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2086     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2087                               //   break position being tested.  The candidate break
   2088                               //   location is before p2.
   2089 
   2090     int     breakPos = -1;
   2091 
   2092     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2093 
   2094     if (U_FAILURE(deferredStatus)) {
   2095         return -1;
   2096     }
   2097 
   2098     // Previous break at end of string.  return DONE.
   2099     if (prevPos >= fText->length()) {
   2100         return -1;
   2101     }
   2102     p0 = p1 = p2 = p3 = prevPos;
   2103     c3 =  fText->char32At(prevPos);
   2104     c0 = c1 = c2 = 0;
   2105     (void)p0;   // suppress set but not used warning.
   2106     (void)c0;
   2107 
   2108     // Loop runs once per "significant" character position in the input text.
   2109     for (;;) {
   2110         // Move all of the positions forward in the input string.
   2111         p0 = p1;  c0 = c1;
   2112         p1 = p2;  c1 = c2;
   2113         p2 = p3;  c2 = c3;
   2114 
   2115         // Advancd p3 by one codepoint
   2116         p3 = fText->moveIndex32(p3, 1);
   2117         c3 = fText->char32At(p3);
   2118 
   2119         if (p1 == p2) {
   2120             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2121             continue;
   2122         }
   2123         if (p2 == fText->length()) {
   2124             // Reached end of string.  Always a break position.
   2125             break;
   2126         }
   2127 
   2128         // Rule  GB3   CR x LF
   2129         //     No Extend or Format characters may appear between the CR and LF,
   2130         //     which requires the additional check for p2 immediately following p1.
   2131         //
   2132         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2133             continue;
   2134         }
   2135 
   2136         // Rule (GB4).   ( Control | CR | LF ) <break>
   2137         if (fControlSet->contains(c1) ||
   2138             c1 == 0x0D ||
   2139             c1 == 0x0A)  {
   2140             break;
   2141         }
   2142 
   2143         // Rule (GB5)    <break>  ( Control | CR | LF )
   2144         //
   2145         if (fControlSet->contains(c2) ||
   2146             c2 == 0x0D ||
   2147             c2 == 0x0A)  {
   2148             break;
   2149         }
   2150 
   2151 
   2152         // Rule (GB6)  L x ( L | V | LV | LVT )
   2153         if (fLSet->contains(c1) &&
   2154                (fLSet->contains(c2)  ||
   2155                 fVSet->contains(c2)  ||
   2156                 fLVSet->contains(c2) ||
   2157                 fLVTSet->contains(c2))) {
   2158             continue;
   2159         }
   2160 
   2161         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2162         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2163             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2164             continue;
   2165         }
   2166 
   2167         // Rule (GB8)    ( LVT | T)  x T
   2168         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2169             fTSet->contains(c2))  {
   2170             continue;
   2171         }
   2172 
   2173         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
   2174         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2175             continue;
   2176         }
   2177 
   2178         // Rule (GB9)    Numeric x ALetter
   2179         if (fExtendSet->contains(c2))  {
   2180             continue;
   2181         }
   2182 
   2183         // Rule (GB9a)   x  SpacingMark
   2184         if (fSpacingSet->contains(c2)) {
   2185             continue;
   2186         }
   2187 
   2188         // Rule (GB9b)   Prepend x
   2189         if (fPrependSet->contains(c1)) {
   2190             continue;
   2191         }
   2192 
   2193         // Rule (GB10)  Any  <break>  Any
   2194         break;
   2195     }
   2196 
   2197     breakPos = p2;
   2198     return breakPos;
   2199 }
   2200 
   2201 
   2202 
   2203 UVector  *RBBICharMonkey::charClasses() {
   2204     return fSets;
   2205 }
   2206 
   2207 
   2208 RBBICharMonkey::~RBBICharMonkey() {
   2209     delete fSets;
   2210     delete fCRLFSet;
   2211     delete fControlSet;
   2212     delete fExtendSet;
   2213     delete fRegionalIndicatorSet;
   2214     delete fPrependSet;
   2215     delete fSpacingSet;
   2216     delete fLSet;
   2217     delete fVSet;
   2218     delete fTSet;
   2219     delete fLVSet;
   2220     delete fLVTSet;
   2221     delete fHangulSet;
   2222     delete fAnySet;
   2223 }
   2224 
   2225 //------------------------------------------------------------------------------------------
   2226 //
   2227 //   class RBBIWordMonkey      Word Break specific implementation
   2228 //                             of RBBIMonkeyKind.
   2229 //
   2230 //------------------------------------------------------------------------------------------
   2231 class RBBIWordMonkey: public RBBIMonkeyKind {
   2232 public:
   2233     RBBIWordMonkey();
   2234     virtual          ~RBBIWordMonkey();
   2235     virtual  UVector *charClasses();
   2236     virtual  void     setText(const UnicodeString &s);
   2237     virtual int32_t   next(int32_t i);
   2238 private:
   2239     UVector      *fSets;
   2240 
   2241     UnicodeSet  *fCRSet;
   2242     UnicodeSet  *fLFSet;
   2243     UnicodeSet  *fNewlineSet;
   2244     UnicodeSet  *fRegionalIndicatorSet;
   2245     UnicodeSet  *fKatakanaSet;
   2246     UnicodeSet  *fHebrew_LetterSet;
   2247     UnicodeSet  *fALetterSet;
   2248     // TODO(jungshik): Do we still need this change?
   2249     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   2250     UnicodeSet  *fSingle_QuoteSet;
   2251     UnicodeSet  *fDouble_QuoteSet;
   2252     UnicodeSet  *fMidNumLetSet;
   2253     UnicodeSet  *fMidLetterSet;
   2254     UnicodeSet  *fMidNumSet;
   2255     UnicodeSet  *fNumericSet;
   2256     UnicodeSet  *fFormatSet;
   2257     UnicodeSet  *fOtherSet;
   2258     UnicodeSet  *fExtendSet;
   2259     UnicodeSet  *fExtendNumLetSet;
   2260     UnicodeSet  *fDictionaryCjkSet;
   2261 
   2262     const UnicodeString  *fText;
   2263 };
   2264 
   2265 
   2266 RBBIWordMonkey::RBBIWordMonkey()
   2267 {
   2268     UErrorCode  status = U_ZERO_ERROR;
   2269 
   2270     fSets            = new UVector(status);
   2271 
   2272     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2273     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2274     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2275     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
   2276     // Exclude Hangul syllables from ALetterSet during testing.
   2277     // Leave CJK dictionary characters out from the monkey tests!
   2278 #if 0
   2279     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2280                                       "[\\p{Line_Break = Complex_Context}"
   2281                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2282                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2283                                       "]]",
   2284                                       status);
   2285 #endif
   2286     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2287     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2288     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
   2289     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2290     fALetterSet->removeAll(*fDictionaryCjkSet);
   2291     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
   2292     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
   2293     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2294     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2295     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2296     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
   2297     // we should figure out why
   2298     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2299     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2300     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2301     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2302 
   2303     fOtherSet        = new UnicodeSet();
   2304     if(U_FAILURE(status)) {
   2305       deferredStatus = status;
   2306       return;
   2307     }
   2308 
   2309     fOtherSet->complement();
   2310     fOtherSet->removeAll(*fCRSet);
   2311     fOtherSet->removeAll(*fLFSet);
   2312     fOtherSet->removeAll(*fNewlineSet);
   2313     fOtherSet->removeAll(*fKatakanaSet);
   2314     fOtherSet->removeAll(*fHebrew_LetterSet);
   2315     fOtherSet->removeAll(*fALetterSet);
   2316     fOtherSet->removeAll(*fSingle_QuoteSet);
   2317     fOtherSet->removeAll(*fDouble_QuoteSet);
   2318     fOtherSet->removeAll(*fMidLetterSet);
   2319     fOtherSet->removeAll(*fMidNumSet);
   2320     fOtherSet->removeAll(*fNumericSet);
   2321     fOtherSet->removeAll(*fExtendNumLetSet);
   2322     fOtherSet->removeAll(*fFormatSet);
   2323     fOtherSet->removeAll(*fExtendSet);
   2324     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2325     // Inhibit dictionary characters from being tested at all.
   2326     fOtherSet->removeAll(*fDictionaryCjkSet);
   2327     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2328 
   2329     fSets->addElement(fCRSet,                status);
   2330     fSets->addElement(fLFSet,                status);
   2331     fSets->addElement(fNewlineSet,           status);
   2332     fSets->addElement(fRegionalIndicatorSet, status);
   2333     fSets->addElement(fHebrew_LetterSet,     status);
   2334     fSets->addElement(fALetterSet,           status);
   2335     fSets->addElement(fSingle_QuoteSet,      status);
   2336     fSets->addElement(fDouble_QuoteSet,      status);
   2337     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
   2338     fSets->addElement(fMidLetterSet,         status);
   2339     fSets->addElement(fMidNumLetSet,         status);
   2340     fSets->addElement(fMidNumSet,            status);
   2341     fSets->addElement(fNumericSet,           status);
   2342     fSets->addElement(fFormatSet,            status);
   2343     fSets->addElement(fExtendSet,            status);
   2344     fSets->addElement(fOtherSet,             status);
   2345     fSets->addElement(fExtendNumLetSet,      status);
   2346 
   2347     if (U_FAILURE(status)) {
   2348         deferredStatus = status;
   2349     }
   2350 }
   2351 
   2352 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2353     fText       = &s;
   2354 }
   2355 
   2356 
   2357 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2358     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2359                               //   break position being tested.  The candidate break
   2360                               //   location is before p2.
   2361 
   2362     int     breakPos = -1;
   2363 
   2364     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2365 
   2366     if (U_FAILURE(deferredStatus)) {
   2367         return -1;
   2368     }
   2369 
   2370     // Prev break at end of string.  return DONE.
   2371     if (prevPos >= fText->length()) {
   2372         return -1;
   2373     }
   2374     p0 = p1 = p2 = p3 = prevPos;
   2375     c3 =  fText->char32At(prevPos);
   2376     c0 = c1 = c2 = 0;
   2377     (void)p0;       // Suppress set but not used warning.
   2378 
   2379     // Loop runs once per "significant" character position in the input text.
   2380     for (;;) {
   2381         // Move all of the positions forward in the input string.
   2382         p0 = p1;  c0 = c1;
   2383         p1 = p2;  c1 = c2;
   2384         p2 = p3;  c2 = c3;
   2385 
   2386         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2387         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2388         do {
   2389             p3 = fText->moveIndex32(p3, 1);
   2390             c3 = fText->char32At(p3);
   2391             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2392                break;
   2393             };
   2394         }
   2395         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2396 
   2397 
   2398         if (p1 == p2) {
   2399             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2400             continue;
   2401         }
   2402         if (p2 == fText->length()) {
   2403             // Reached end of string.  Always a break position.
   2404             break;
   2405         }
   2406 
   2407         // Rule  (3)   CR x LF
   2408         //     No Extend or Format characters may appear between the CR and LF,
   2409         //     which requires the additional check for p2 immediately following p1.
   2410         //
   2411         if (c1==0x0D && c2==0x0A) {
   2412             continue;
   2413         }
   2414 
   2415         // Rule (3a)  Break before and after newlines (including CR and LF)
   2416         //
   2417         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2418             break;
   2419         };
   2420         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2421             break;
   2422         };
   2423 
   2424         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
   2425         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2426             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2427             continue;
   2428         }
   2429 
   2430         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
   2431         //
   2432         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
   2433              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
   2434              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
   2435             continue;
   2436         }
   2437 
   2438         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
   2439         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
   2440             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
   2441             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
   2442             continue;
   2443         }
   2444 
   2445         // Rule (7a)     Hebrew_Letter x Single_Quote
   2446         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
   2447             continue;
   2448         }
   2449 
   2450         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
   2451         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
   2452             continue;
   2453         }
   2454 
   2455         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
   2456         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
   2457             continue;
   2458         }
   2459 
   2460         // Rule (8)    Numeric x Numeric
   2461         if (fNumericSet->contains(c1) &&
   2462             fNumericSet->contains(c2))  {
   2463             continue;
   2464         }
   2465 
   2466         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
   2467         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2468             fNumericSet->contains(c2))  {
   2469             continue;
   2470         }
   2471 
   2472         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
   2473         if (fNumericSet->contains(c1) &&
   2474             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2475             continue;
   2476         }
   2477 
   2478         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
   2479         if (fNumericSet->contains(c0) &&
   2480             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
   2481             fNumericSet->contains(c2)) {
   2482             continue;
   2483         }
   2484 
   2485         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
   2486         if (fNumericSet->contains(c1) &&
   2487             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
   2488             fNumericSet->contains(c3)) {
   2489             continue;
   2490         }
   2491 
   2492         // Rule (13)  Katakana x Katakana
   2493         if (fKatakanaSet->contains(c1) &&
   2494             fKatakanaSet->contains(c2))  {
   2495             continue;
   2496         }
   2497 
   2498         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
   2499         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
   2500              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2501              fExtendNumLetSet->contains(c2)) {
   2502                 continue;
   2503         }
   2504 
   2505         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
   2506         if (fExtendNumLetSet->contains(c1) &&
   2507                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
   2508                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
   2509             continue;
   2510         }
   2511 
   2512         // Rule 13c
   2513         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2514             continue;
   2515         }
   2516 
   2517         // Rule 14.  Break found here.
   2518         break;
   2519     }
   2520 
   2521     breakPos = p2;
   2522     return breakPos;
   2523 }
   2524 
   2525 
   2526 UVector  *RBBIWordMonkey::charClasses() {
   2527     return fSets;
   2528 }
   2529 
   2530 
   2531 RBBIWordMonkey::~RBBIWordMonkey() {
   2532     delete fSets;
   2533     delete fCRSet;
   2534     delete fLFSet;
   2535     delete fNewlineSet;
   2536     delete fKatakanaSet;
   2537     delete fHebrew_LetterSet;
   2538     delete fALetterSet;
   2539     delete fSingle_QuoteSet;
   2540     delete fDouble_QuoteSet;
   2541     delete fMidNumLetSet;
   2542     delete fMidLetterSet;
   2543     delete fMidNumSet;
   2544     delete fNumericSet;
   2545     delete fFormatSet;
   2546     delete fExtendSet;
   2547     delete fExtendNumLetSet;
   2548     delete fRegionalIndicatorSet;
   2549     delete fDictionaryCjkSet;
   2550     delete fOtherSet;
   2551 }
   2552 
   2553 
   2554 
   2555 
   2556 //------------------------------------------------------------------------------------------
   2557 //
   2558 //   class RBBISentMonkey      Sentence Break specific implementation
   2559 //                             of RBBIMonkeyKind.
   2560 //
   2561 //------------------------------------------------------------------------------------------
   2562 class RBBISentMonkey: public RBBIMonkeyKind {
   2563 public:
   2564     RBBISentMonkey();
   2565     virtual          ~RBBISentMonkey();
   2566     virtual  UVector *charClasses();
   2567     virtual  void     setText(const UnicodeString &s);
   2568     virtual int32_t   next(int32_t i);
   2569 private:
   2570     int               moveBack(int posFrom);
   2571     int               moveForward(int posFrom);
   2572     UChar32           cAt(int pos);
   2573 
   2574     UVector      *fSets;
   2575 
   2576     UnicodeSet  *fSepSet;
   2577     UnicodeSet  *fFormatSet;
   2578     UnicodeSet  *fSpSet;
   2579     UnicodeSet  *fLowerSet;
   2580     UnicodeSet  *fUpperSet;
   2581     UnicodeSet  *fOLetterSet;
   2582     UnicodeSet  *fNumericSet;
   2583     UnicodeSet  *fATermSet;
   2584     UnicodeSet  *fSContinueSet;
   2585     UnicodeSet  *fSTermSet;
   2586     UnicodeSet  *fCloseSet;
   2587     UnicodeSet  *fOtherSet;
   2588     UnicodeSet  *fExtendSet;
   2589 
   2590     const UnicodeString  *fText;
   2591 
   2592 };
   2593 
   2594 RBBISentMonkey::RBBISentMonkey()
   2595 {
   2596     UErrorCode  status = U_ZERO_ERROR;
   2597 
   2598     fSets            = new UVector(status);
   2599 
   2600     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2601     //                       set and made into character classes of their own.  For the monkey impl,
   2602     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2603     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2604     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2605     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2606     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2607     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2608     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2609     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2610     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2611     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2612     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2613     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2614     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2615     fOtherSet        = new UnicodeSet();
   2616 
   2617     if(U_FAILURE(status)) {
   2618       deferredStatus = status;
   2619       return;
   2620     }
   2621 
   2622     fOtherSet->complement();
   2623     fOtherSet->removeAll(*fSepSet);
   2624     fOtherSet->removeAll(*fFormatSet);
   2625     fOtherSet->removeAll(*fSpSet);
   2626     fOtherSet->removeAll(*fLowerSet);
   2627     fOtherSet->removeAll(*fUpperSet);
   2628     fOtherSet->removeAll(*fOLetterSet);
   2629     fOtherSet->removeAll(*fNumericSet);
   2630     fOtherSet->removeAll(*fATermSet);
   2631     fOtherSet->removeAll(*fSContinueSet);
   2632     fOtherSet->removeAll(*fSTermSet);
   2633     fOtherSet->removeAll(*fCloseSet);
   2634     fOtherSet->removeAll(*fExtendSet);
   2635 
   2636     fSets->addElement(fSepSet,       status);
   2637     fSets->addElement(fFormatSet,    status);
   2638     fSets->addElement(fSpSet,        status);
   2639     fSets->addElement(fLowerSet,     status);
   2640     fSets->addElement(fUpperSet,     status);
   2641     fSets->addElement(fOLetterSet,   status);
   2642     fSets->addElement(fNumericSet,   status);
   2643     fSets->addElement(fATermSet,     status);
   2644     fSets->addElement(fSContinueSet, status);
   2645     fSets->addElement(fSTermSet,     status);
   2646     fSets->addElement(fCloseSet,     status);
   2647     fSets->addElement(fOtherSet,     status);
   2648     fSets->addElement(fExtendSet,    status);
   2649 
   2650     if (U_FAILURE(status)) {
   2651         deferredStatus = status;
   2652     }
   2653 }
   2654 
   2655 
   2656 
   2657 void RBBISentMonkey::setText(const UnicodeString &s) {
   2658     fText       = &s;
   2659 }
   2660 
   2661 UVector  *RBBISentMonkey::charClasses() {
   2662     return fSets;
   2663 }
   2664 
   2665 
   2666 //  moveBack()   Find the "significant" code point preceding the index i.
   2667 //               Skips over ($Extend | $Format)* .
   2668 //
   2669 int RBBISentMonkey::moveBack(int i) {
   2670     if (i <= 0) {
   2671         return -1;
   2672     }
   2673     UChar32   c;
   2674     int32_t   j = i;
   2675     do {
   2676         j = fText->moveIndex32(j, -1);
   2677         c = fText->char32At(j);
   2678     }
   2679     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2680     return j;
   2681 
   2682  }
   2683 
   2684 
   2685 int RBBISentMonkey::moveForward(int i) {
   2686     if (i>=fText->length()) {
   2687         return fText->length();
   2688     }
   2689     UChar32   c;
   2690     int32_t   j = i;
   2691     do {
   2692         j = fText->moveIndex32(j, 1);
   2693         c = cAt(j);
   2694     }
   2695     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2696     return j;
   2697 }
   2698 
   2699 UChar32 RBBISentMonkey::cAt(int pos) {
   2700     if (pos<0 || pos>=fText->length()) {
   2701         return -1;
   2702     } else {
   2703         return fText->char32At(pos);
   2704     }
   2705 }
   2706 
   2707 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2708     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2709                               //   break position being tested.  The candidate break
   2710                               //   location is before p2.
   2711 
   2712     int     breakPos = -1;
   2713 
   2714     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2715     UChar32 c;
   2716 
   2717     if (U_FAILURE(deferredStatus)) {
   2718         return -1;
   2719     }
   2720 
   2721     // Prev break at end of string.  return DONE.
   2722     if (prevPos >= fText->length()) {
   2723         return -1;
   2724     }
   2725     p0 = p1 = p2 = p3 = prevPos;
   2726     c3 =  fText->char32At(prevPos);
   2727     c0 = c1 = c2 = 0;
   2728     (void)p0;     // Suppress set but not used warning.
   2729 
   2730     // Loop runs once per "significant" character position in the input text.
   2731     for (;;) {
   2732         // Move all of the positions forward in the input string.
   2733         p0 = p1;  c0 = c1;
   2734         p1 = p2;  c1 = c2;
   2735         p2 = p3;  c2 = c3;
   2736 
   2737         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2738         p3 = moveForward(p3);
   2739         c3 = cAt(p3);
   2740 
   2741         // Rule (3)  CR x LF
   2742         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2743             continue;
   2744         }
   2745 
   2746         // Rule (4).   Sep  <break>
   2747         if (fSepSet->contains(c1)) {
   2748             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2749             break;
   2750         }
   2751 
   2752         if (p2 >= fText->length()) {
   2753             // Reached end of string.  Always a break position.
   2754             break;
   2755         }
   2756 
   2757         if (p2 == prevPos) {
   2758             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2759             continue;
   2760         }
   2761 
   2762         // Rule (6).   ATerm x Numeric
   2763         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2764             continue;
   2765         }
   2766 
   2767         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
   2768         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
   2769                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2770             continue;
   2771         }
   2772 
   2773         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2774         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2775         //                  note to the Unicode 5.0 documents.
   2776         int p8 = p1;
   2777         while (fSpSet->contains(cAt(p8))) {
   2778             p8 = moveBack(p8);
   2779         }
   2780         while (fCloseSet->contains(cAt(p8))) {
   2781             p8 = moveBack(p8);
   2782         }
   2783         if (fATermSet->contains(cAt(p8))) {
   2784             p8=p2;
   2785             for (;;) {
   2786                 c = cAt(p8);
   2787                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2788                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2789                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2790                     break;
   2791                 }
   2792                 p8 = moveForward(p8);
   2793             }
   2794             if (fLowerSet->contains(cAt(p8))) {
   2795                 continue;
   2796             }
   2797         }
   2798 
   2799         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2800         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2801             p8 = p1;
   2802             while (fSpSet->contains(cAt(p8))) {
   2803                 p8 = moveBack(p8);
   2804             }
   2805             while (fCloseSet->contains(cAt(p8))) {
   2806                 p8 = moveBack(p8);
   2807             }
   2808             c = cAt(p8);
   2809             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2810                 continue;
   2811             }
   2812         }
   2813 
   2814         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2815         int p9 = p1;
   2816         while (fCloseSet->contains(cAt(p9))) {
   2817             p9 = moveBack(p9);
   2818         }
   2819         c = cAt(p9);
   2820         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2821             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2822                 continue;
   2823             }
   2824         }
   2825 
   2826         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2827         int p10 = p1;
   2828         while (fSpSet->contains(cAt(p10))) {
   2829             p10 = moveBack(p10);
   2830         }
   2831         while (fCloseSet->contains(cAt(p10))) {
   2832             p10 = moveBack(p10);
   2833         }
   2834         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2835             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2836                 continue;
   2837             }
   2838         }
   2839 
   2840         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2841         int p11 = p1;
   2842         if (fSepSet->contains(cAt(p11))) {
   2843             p11 = moveBack(p11);
   2844         }
   2845         while (fSpSet->contains(cAt(p11))) {
   2846             p11 = moveBack(p11);
   2847         }
   2848         while (fCloseSet->contains(cAt(p11))) {
   2849             p11 = moveBack(p11);
   2850         }
   2851         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2852             break;
   2853         }
   2854 
   2855         //  Rule (12)  Any x Any
   2856         continue;
   2857     }
   2858     breakPos = p2;
   2859     return breakPos;
   2860 }
   2861 
   2862 RBBISentMonkey::~RBBISentMonkey() {
   2863     delete fSets;
   2864     delete fSepSet;
   2865     delete fFormatSet;
   2866     delete fSpSet;
   2867     delete fLowerSet;
   2868     delete fUpperSet;
   2869     delete fOLetterSet;
   2870     delete fNumericSet;
   2871     delete fATermSet;
   2872     delete fSContinueSet;
   2873     delete fSTermSet;
   2874     delete fCloseSet;
   2875     delete fOtherSet;
   2876     delete fExtendSet;
   2877 }
   2878 
   2879 
   2880 
   2881 //-------------------------------------------------------------------------------------------
   2882 //
   2883 //  RBBILineMonkey
   2884 //
   2885 //-------------------------------------------------------------------------------------------
   2886 
   2887 class RBBILineMonkey: public RBBIMonkeyKind {
   2888 public:
   2889     RBBILineMonkey();
   2890     virtual          ~RBBILineMonkey();
   2891     virtual  UVector *charClasses();
   2892     virtual  void     setText(const UnicodeString &s);
   2893     virtual  int32_t  next(int32_t i);
   2894     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2895 private:
   2896     UVector      *fSets;
   2897 
   2898     UnicodeSet  *fBK;
   2899     UnicodeSet  *fCR;
   2900     UnicodeSet  *fLF;
   2901     UnicodeSet  *fCM;
   2902     UnicodeSet  *fNL;
   2903     UnicodeSet  *fSG;
   2904     UnicodeSet  *fWJ;
   2905     UnicodeSet  *fZW;
   2906     UnicodeSet  *fGL;
   2907     UnicodeSet  *fCB;
   2908     UnicodeSet  *fSP;
   2909     UnicodeSet  *fB2;
   2910     UnicodeSet  *fBA;
   2911     UnicodeSet  *fBB;
   2912     UnicodeSet  *fHY;
   2913     UnicodeSet  *fH2;
   2914     UnicodeSet  *fH3;
   2915     UnicodeSet  *fCL;
   2916     UnicodeSet  *fCP;
   2917     UnicodeSet  *fEX;
   2918     UnicodeSet  *fIN;
   2919     UnicodeSet  *fJL;
   2920     UnicodeSet  *fJV;
   2921     UnicodeSet  *fJT;
   2922     UnicodeSet  *fNS;
   2923     UnicodeSet  *fOP;
   2924     UnicodeSet  *fQU;
   2925     UnicodeSet  *fIS;
   2926     UnicodeSet  *fNU;
   2927     UnicodeSet  *fPO;
   2928     UnicodeSet  *fPR;
   2929     UnicodeSet  *fSY;
   2930     UnicodeSet  *fAI;
   2931     UnicodeSet  *fAL;
   2932     UnicodeSet  *fCJ;
   2933     UnicodeSet  *fHL;
   2934     UnicodeSet  *fID;
   2935     UnicodeSet  *fRI;
   2936     UnicodeSet  *fSA;
   2937     UnicodeSet  *fXX;
   2938 
   2939     BreakIterator        *fCharBI;
   2940     const UnicodeString  *fText;
   2941     RegexMatcher         *fNumberMatcher;
   2942 };
   2943 
   2944 
   2945 RBBILineMonkey::RBBILineMonkey()
   2946 {
   2947     UErrorCode  status = U_ZERO_ERROR;
   2948 
   2949     fSets  = new UVector(status);
   2950 
   2951     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   2952     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   2953     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   2954     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   2955     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   2956     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   2957     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   2958     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   2959     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   2960     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   2961     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   2962     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   2963     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   2964     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   2965     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   2966     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   2967     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   2968     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   2969     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   2970     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   2971     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   2972     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   2973     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   2974     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   2975     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   2976     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   2977     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   2978     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   2979     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   2980     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   2981     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   2982     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   2983     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   2984     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   2985     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   2986     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   2987     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   2988     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   2989     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   2990     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   2991 
   2992     if (U_FAILURE(status)) {
   2993         deferredStatus = status;
   2994         fCharBI = NULL;
   2995         fNumberMatcher = NULL;
   2996         return;
   2997     }
   2998 
   2999     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3000     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3001     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3002     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3003 
   3004     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   3005 
   3006     fSets->addElement(fBK, status);
   3007     fSets->addElement(fCR, status);
   3008     fSets->addElement(fLF, status);
   3009     fSets->addElement(fCM, status);
   3010     fSets->addElement(fNL, status);
   3011     fSets->addElement(fWJ, status);
   3012     fSets->addElement(fZW, status);
   3013     fSets->addElement(fGL, status);
   3014     fSets->addElement(fCB, status);
   3015     fSets->addElement(fSP, status);
   3016     fSets->addElement(fB2, status);
   3017     fSets->addElement(fBA, status);
   3018     fSets->addElement(fBB, status);
   3019     fSets->addElement(fHY, status);
   3020     fSets->addElement(fH2, status);
   3021     fSets->addElement(fH3, status);
   3022     fSets->addElement(fCL, status);
   3023     fSets->addElement(fCP, status);
   3024     fSets->addElement(fEX, status);
   3025     fSets->addElement(fIN, status);
   3026     fSets->addElement(fJL, status);
   3027     fSets->addElement(fJT, status);
   3028     fSets->addElement(fJV, status);
   3029     fSets->addElement(fNS, status);
   3030     fSets->addElement(fOP, status);
   3031     fSets->addElement(fQU, status);
   3032     fSets->addElement(fIS, status);
   3033     fSets->addElement(fNU, status);
   3034     fSets->addElement(fPO, status);
   3035     fSets->addElement(fPR, status);
   3036     fSets->addElement(fSY, status);
   3037     fSets->addElement(fAI, status);
   3038     fSets->addElement(fAL, status);
   3039     fSets->addElement(fHL, status);
   3040     fSets->addElement(fID, status);
   3041     fSets->addElement(fWJ, status);
   3042     fSets->addElement(fRI, status);
   3043     fSets->addElement(fSA, status);
   3044     fSets->addElement(fSG, status);
   3045 
   3046     const char *rules =
   3047             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3048             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3049             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3050             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3051             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3052             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3053 
   3054     fNumberMatcher = new RegexMatcher(
   3055         UnicodeString(rules, -1, US_INV), 0, status);
   3056 
   3057     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3058 
   3059     if (U_FAILURE(status)) {
   3060         deferredStatus = status;
   3061     }
   3062 }
   3063 
   3064 
   3065 void RBBILineMonkey::setText(const UnicodeString &s) {
   3066     fText       = &s;
   3067     fCharBI->setText(s);
   3068     fNumberMatcher->reset(s);
   3069 }
   3070 
   3071 //
   3072 //  rule9Adjust
   3073 //     Line Break TR rules 9 and 10 implementation.
   3074 //     This deals with combining marks and other sequences that
   3075 //     that must be treated as if they were something other than what they actually are.
   3076 //
   3077 //     This is factored out into a separate function because it must be applied twice for
   3078 //     each potential break, once to the chars before the position being checked, then
   3079 //     again to the text following the possible break.
   3080 //
   3081 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3082     if (pos == -1) {
   3083         // Invalid initial position.  Happens during the warmup iteration of the
   3084         //   main loop in next().
   3085         return;
   3086     }
   3087 
   3088     int32_t  nPos = *nextPos;
   3089 
   3090     // LB 9  Keep combining sequences together.
   3091     //  advance over any CM class chars.  Note that Line Break CM is different
   3092     //  from the normal Grapheme Extend property.
   3093     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3094           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3095         for (;;) {
   3096             *nextChar = fText->char32At(nPos);
   3097             if (!fCM->contains(*nextChar)) {
   3098                 break;
   3099             }
   3100             nPos = fText->moveIndex32(nPos, 1);
   3101         }
   3102     }
   3103 
   3104 
   3105     // LB 9 Treat X CM* as if it were x.
   3106     //       No explicit action required.
   3107 
   3108     // LB 10  Treat any remaining combining mark as AL
   3109     if (fCM->contains(*posChar)) {
   3110         *posChar = 0x41;   // thisChar = 'A';
   3111     }
   3112 
   3113     // Push the updated nextPos and nextChar back to our caller.
   3114     // This only makes a difference if posChar got bigger by consuming a
   3115     // combining sequence.
   3116     *nextPos  = nPos;
   3117     *nextChar = fText->char32At(nPos);
   3118 }
   3119 
   3120 
   3121 
   3122 int32_t RBBILineMonkey::next(int32_t startPos) {
   3123     UErrorCode status = U_ZERO_ERROR;
   3124     int32_t    pos;       //  Index of the char following a potential break position
   3125     UChar32    thisChar;  //  Character at above position "pos"
   3126 
   3127     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3128     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3129                           //   and thisChar may not be adjacent because combining
   3130                           //   characters between them will be ignored.
   3131 
   3132     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   3133     UChar32    prevCharX2;
   3134 
   3135     int32_t    nextPos;   //  Index of the next character following pos.
   3136                           //     Usually skips over combining marks.
   3137     int32_t    nextCPPos; //  Index of the code point following "pos."
   3138                           //     May point to a combining mark.
   3139     int32_t    tPos;      //  temp value.
   3140     UChar32    c;
   3141 
   3142     if (U_FAILURE(deferredStatus)) {
   3143         return -1;
   3144     }
   3145 
   3146     if (startPos >= fText->length()) {
   3147         return -1;
   3148     }
   3149 
   3150 
   3151     // Initial values for loop.  Loop will run the first time without finding breaks,
   3152     //                           while the invalid values shift out and the "this" and
   3153     //                           "prev" positions are filled in with good values.
   3154     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   3155     thisChar = prevChar  = prevCharX2 = 0;
   3156     nextPos  = nextCPPos = startPos;
   3157 
   3158 
   3159     // Loop runs once per position in the test text, until a break position
   3160     //  is found.
   3161     for (;;) {
   3162         prevPosX2 = prevPos;
   3163         prevCharX2 = prevChar;
   3164 
   3165         prevPos   = pos;
   3166         prevChar  = thisChar;
   3167 
   3168         pos       = nextPos;
   3169         thisChar  = fText->char32At(pos);
   3170 
   3171         nextCPPos = fText->moveIndex32(pos, 1);
   3172         nextPos   = nextCPPos;
   3173 
   3174         // Rule LB2 - Break at end of text.
   3175         if (pos >= fText->length()) {
   3176             break;
   3177         }
   3178 
   3179         // Rule LB 9 - adjust for combining sequences.
   3180         //             We do this one out-of-order because the adjustment does not change anything
   3181         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3182         //             be applied.
   3183         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3184         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3185         c = fText->char32At(nextPos);
   3186         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3187 
   3188         // If the loop is still warming up - if we haven't shifted the initial
   3189         //   -1 positions out of prevPos yet - loop back to advance the
   3190         //    position in the input without any further looking for breaks.
   3191         if (prevPos == -1) {
   3192             continue;
   3193         }
   3194 
   3195         // LB 4  Always break after hard line breaks,
   3196         if (fBK->contains(prevChar)) {
   3197             break;
   3198         }
   3199 
   3200         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3201         if (prevChar == 0x0d && thisChar == 0x0a) {
   3202             continue;
   3203         }
   3204         if (prevChar == 0x0d ||
   3205             prevChar == 0x0a ||
   3206             prevChar == 0x85)  {
   3207             break;
   3208         }
   3209 
   3210         // LB 6  Don't break before hard line breaks
   3211         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3212             fBK->contains(thisChar)) {
   3213                 continue;
   3214         }
   3215 
   3216 
   3217         // LB 7  Don't break before spaces or zero-width space.
   3218         if (fSP->contains(thisChar)) {
   3219             continue;
   3220         }
   3221 
   3222         if (fZW->contains(thisChar)) {
   3223             continue;
   3224         }
   3225 
   3226         // LB 8  Break after zero width space
   3227         if (fZW->contains(prevChar)) {
   3228             break;
   3229         }
   3230 
   3231         // LB 9, 10  Already done, at top of loop.
   3232         //
   3233 
   3234 
   3235         // LB 11  Do not break before or after WORD JOINER and related characters.
   3236         //    x  WJ
   3237         //    WJ  x
   3238         //
   3239         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3240             continue;
   3241         }
   3242 
   3243         // LB 12
   3244         //    GL  x
   3245         if (fGL->contains(prevChar)) {
   3246             continue;
   3247         }
   3248 
   3249         // LB 12a
   3250         //    [^SP BA HY] x GL
   3251         if (!(fSP->contains(prevChar) ||
   3252               fBA->contains(prevChar) ||
   3253               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3254             continue;
   3255         }
   3256 
   3257 
   3258 
   3259         // LB 13  Don't break before closings.
   3260         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3261         //        fall into LB 17 and the more general number regular expression.
   3262         //
   3263         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3264             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3265                                          fEX->contains(thisChar)  ||
   3266             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3267             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3268             continue;
   3269         }
   3270 
   3271         // LB 14 Don't break after OP SP*
   3272         //       Scan backwards, checking for this sequence.
   3273         //       The OP char could include combining marks, so we actually check for
   3274         //           OP CM* SP*
   3275         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3276         //       sequence into a ID char, so before scanning back through spaces,
   3277         //       verify that prevChar is indeed a space.  The prevChar variable
   3278         //       may differ from fText[prevPos]
   3279         tPos = prevPos;
   3280         if (fSP->contains(prevChar)) {
   3281             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3282                 tPos=fText->moveIndex32(tPos, -1);
   3283             }
   3284         }
   3285         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3286             tPos=fText->moveIndex32(tPos, -1);
   3287         }
   3288         if (fOP->contains(fText->char32At(tPos))) {
   3289             continue;
   3290         }
   3291 
   3292 
   3293         // LB 15    QU SP* x OP
   3294         if (fOP->contains(thisChar)) {
   3295             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3296             int tPos = prevPos;
   3297             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3298                 tPos = fText->moveIndex32(tPos, -1);
   3299             }
   3300             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3301                 tPos = fText->moveIndex32(tPos, -1);
   3302             }
   3303             if (fQU->contains(fText->char32At(tPos))) {
   3304                 continue;
   3305             }
   3306         }
   3307 
   3308 
   3309 
   3310         // LB 16   (CL | CP) SP* x NS
   3311         //    Scan backwards for SP* CM* (CL | CP)
   3312         if (fNS->contains(thisChar)) {
   3313             int tPos = prevPos;
   3314             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3315                 tPos = fText->moveIndex32(tPos, -1);
   3316             }
   3317             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3318                 tPos = fText->moveIndex32(tPos, -1);
   3319             }
   3320             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3321                 continue;
   3322             }
   3323         }
   3324 
   3325 
   3326         // LB 17        B2 SP* x B2
   3327         if (fB2->contains(thisChar)) {
   3328             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3329             tPos = prevPos;
   3330             if (fSP->contains(prevChar)) {
   3331                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3332                     tPos=fText->moveIndex32(tPos, -1);
   3333                 }
   3334             }
   3335             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3336                 tPos=fText->moveIndex32(tPos, -1);
   3337             }
   3338             if (fB2->contains(fText->char32At(tPos))) {
   3339                 continue;
   3340             }
   3341         }
   3342 
   3343 
   3344         // LB 18    break after space
   3345         if (fSP->contains(prevChar)) {
   3346             break;
   3347         }
   3348 
   3349         // LB 19
   3350         //    x   QU
   3351         //    QU  x
   3352         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3353             continue;
   3354         }
   3355 
   3356         // LB 20  Break around a CB
   3357         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3358             break;
   3359         }
   3360 
   3361         // LB 21
   3362         if (fBA->contains(thisChar) ||
   3363             fHY->contains(thisChar) ||
   3364             fNS->contains(thisChar) ||
   3365             fBB->contains(prevChar) )   {
   3366             continue;
   3367         }
   3368 
   3369         // LB 21a
   3370         //   HL (HY | BA) x
   3371         if (fHL->contains(prevCharX2) &&
   3372                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3373             continue;
   3374         }
   3375 
   3376         // LB 21b
   3377         //   SY x HL
   3378         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
   3379             continue;
   3380         }
   3381 
   3382         // LB 22
   3383         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3384             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
   3385             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3386             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3387             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3388             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3389             continue;
   3390         }
   3391 
   3392 
   3393         // LB 23    ID x PO
   3394         //          AL x NU
   3395         //          HL x NU
   3396         //          NU x AL
   3397         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3398             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3399             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
   3400             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
   3401             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
   3402             continue;
   3403         }
   3404 
   3405         // LB 24  Do not break between prefix and letters or ideographs.
   3406         //        PR x ID
   3407         //        PR x (AL | HL)
   3408         //        PO x (AL | HL)
   3409         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3410             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
   3411             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
   3412             continue;
   3413         }
   3414 
   3415 
   3416 
   3417         // LB 25    Numbers
   3418         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3419             if (U_FAILURE(status)) {
   3420                 break;
   3421             }
   3422             // Matched a number.  But could have been just a single digit, which would
   3423             //    not represent a "no break here" between prevChar and thisChar
   3424             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3425             if (numEndIdx > pos) {
   3426                 // Number match includes at least our two chars being checked
   3427                 if (numEndIdx > nextPos) {
   3428                     // Number match includes additional chars.  Update pos and nextPos
   3429                     //   so that next loop iteration will continue at the end of the number,
   3430                     //   checking for breaks between last char in number & whatever follows.
   3431                     pos = nextPos = numEndIdx;
   3432                     do {
   3433                         pos = fText->moveIndex32(pos, -1);
   3434                         thisChar = fText->char32At(pos);
   3435                     } while (fCM->contains(thisChar));
   3436                 }
   3437                 continue;
   3438             }
   3439         }
   3440 
   3441 
   3442         // LB 26 Do not break a Korean syllable.
   3443         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3444                                         fJV->contains(thisChar) ||
   3445                                         fH2->contains(thisChar) ||
   3446                                         fH3->contains(thisChar))) {
   3447                                             continue;
   3448                                         }
   3449 
   3450         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3451             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3452                 continue;
   3453         }
   3454 
   3455         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3456             fJT->contains(thisChar)) {
   3457                 continue;
   3458         }
   3459 
   3460         // LB 27 Treat a Korean Syllable Block the same as ID.
   3461         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3462             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3463             fIN->contains(thisChar)) {
   3464                 continue;
   3465             }
   3466         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3467             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3468             fPO->contains(thisChar)) {
   3469                 continue;
   3470             }
   3471         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3472             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3473                 continue;
   3474             }
   3475 
   3476 
   3477 
   3478         // LB 28  Do not break between alphabetics ("at").
   3479         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3480             continue;
   3481         }
   3482 
   3483         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3484         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3485             continue;
   3486         }
   3487 
   3488         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3489         //          (AL | NU) x OP
   3490         //          CP x (AL | NU)
   3491         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3492             continue;
   3493         }
   3494         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3495             continue;
   3496         }
   3497 
   3498         // LB30a  Do not break between regional indicators.
   3499         //        RI x RI
   3500         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3501             continue;
   3502         }
   3503 
   3504         // LB 31    Break everywhere else
   3505         break;
   3506 
   3507     }
   3508 
   3509     return pos;
   3510 }
   3511 
   3512 
   3513 UVector  *RBBILineMonkey::charClasses() {
   3514     return fSets;
   3515 }
   3516 
   3517 
   3518 RBBILineMonkey::~RBBILineMonkey() {
   3519     delete fSets;
   3520 
   3521     delete fBK;
   3522     delete fCR;
   3523     delete fLF;
   3524     delete fCM;
   3525     delete fNL;
   3526     delete fWJ;
   3527     delete fZW;
   3528     delete fGL;
   3529     delete fCB;
   3530     delete fSP;
   3531     delete fB2;
   3532     delete fBA;
   3533     delete fBB;
   3534     delete fHY;
   3535     delete fH2;
   3536     delete fH3;
   3537     delete fCL;
   3538     delete fCP;
   3539     delete fEX;
   3540     delete fIN;
   3541     delete fJL;
   3542     delete fJV;
   3543     delete fJT;
   3544     delete fNS;
   3545     delete fOP;
   3546     delete fQU;
   3547     delete fIS;
   3548     delete fNU;
   3549     delete fPO;
   3550     delete fPR;
   3551     delete fSY;
   3552     delete fAI;
   3553     delete fAL;
   3554     delete fCJ;
   3555     delete fHL;
   3556     delete fID;
   3557     delete fRI;
   3558     delete fSA;
   3559     delete fSG;
   3560     delete fXX;
   3561 
   3562     delete fCharBI;
   3563     delete fNumberMatcher;
   3564 }
   3565 
   3566 
   3567 //-------------------------------------------------------------------------------------------
   3568 //
   3569 //   TestMonkey
   3570 //
   3571 //     params
   3572 //       seed=nnnnn        Random number starting seed.
   3573 //                         Setting the seed allows errors to be reproduced.
   3574 //       loop=nnn          Looping count.  Controls running time.
   3575 //                         -1:  run forever.
   3576 //                          0 or greater:  run length.
   3577 //
   3578 //       type = char | word | line | sent | title
   3579 //
   3580 //-------------------------------------------------------------------------------------------
   3581 
   3582 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3583     int32_t val = defaultVal;
   3584     name.append(" *= *(-?\\d+)");
   3585     UErrorCode status = U_ZERO_ERROR;
   3586     RegexMatcher m(name, params, 0, status);
   3587     if (m.find()) {
   3588         // The param exists.  Convert the string to an int.
   3589         char valString[100];
   3590         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3591         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3592             paramLength = (int32_t)(sizeof(valString)-2);
   3593         }
   3594         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3595         val = strtol(valString,  NULL, 10);
   3596 
   3597         // Delete this parameter from the params string.
   3598         m.reset();
   3599         params = m.replaceFirst("", status);
   3600     }
   3601     U_ASSERT(U_SUCCESS(status));
   3602     return val;
   3603 }
   3604 #endif
   3605 
   3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3607 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3608                                     BreakIterator *bi,
   3609                                     int expected[],
   3610                                     int expectedcount)
   3611 {
   3612     int count = 0;
   3613     int i = 0;
   3614     int forward[50];
   3615     bi->setText(ustr);
   3616     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3617         forward[count] = i;
   3618         if (count < expectedcount && expected[count] != i) {
   3619             test->errln("break forward test failed: expected %d but got %d",
   3620                         expected[count], i);
   3621             break;
   3622         }
   3623         count ++;
   3624     }
   3625     if (count != expectedcount) {
   3626         printStringBreaks(ustr, expected, expectedcount);
   3627         test->errln("break forward test failed: missed %d match",
   3628                     expectedcount - count);
   3629         return;
   3630     }
   3631     // testing boundaries
   3632     for (i = 1; i < expectedcount; i ++) {
   3633         int j = expected[i - 1];
   3634         if (!bi->isBoundary(j)) {
   3635             printStringBreaks(ustr, expected, expectedcount);
   3636             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3637             return;
   3638         }
   3639         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3640             if (bi->isBoundary(j)) {
   3641                 printStringBreaks(ustr, expected, expectedcount);
   3642                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3643                 return;
   3644             }
   3645         }
   3646     }
   3647 
   3648     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3649         count --;
   3650         if (forward[count] != i) {
   3651             printStringBreaks(ustr, expected, expectedcount);
   3652             test->errln("happy break test previous() failed: expected %d but got %d",
   3653                         forward[count], i);
   3654             break;
   3655         }
   3656     }
   3657     if (count != 0) {
   3658         printStringBreaks(ustr, expected, expectedcount);
   3659         test->errln("break test previous() failed: missed a match");
   3660         return;
   3661     }
   3662 
   3663     // testing preceding
   3664     for (i = 0; i < expectedcount - 1; i ++) {
   3665         // int j = expected[i] + 1;
   3666         int j = ustr.moveIndex32(expected[i], 1);
   3667         for (; j <= expected[i + 1]; j ++) {
   3668             if (bi->preceding(j) != expected[i]) {
   3669                 printStringBreaks(ustr, expected, expectedcount);
   3670                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3671                 return;
   3672             }
   3673         }
   3674     }
   3675 }
   3676 #endif
   3677 
   3678 void RBBITest::TestWordBreaks(void)
   3679 {
   3680 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3681 
   3682     Locale        locale("en");
   3683     UErrorCode    status = U_ZERO_ERROR;
   3684     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3685     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3686     // Replaced any C+J characters in a row with a random sequence of characters
   3687     // of the same length to make our C+J segmentation not get in the way.
   3688     static const char *strlist[] =
   3689     {
   3690     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3691     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3692     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3693     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3694     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3695     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3696     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3697     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3698     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3699     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3700     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3701     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3702     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3703     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3704     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3705     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3706     "\\u0027\\u11af\\U000e0057\\u0602",
   3707     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3708     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3709     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3710     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3711     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3712     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3713     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3714     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3715     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3716     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3717     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3718     "\\ua183\\u102d\\u0bec\\u003a",
   3719     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3720     "\\u003a\\u0e57\\u0fad\\u002e",
   3721     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3722     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3723     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3724     "\\u003a\\u0664\\u00b7\\u1fba",
   3725     "\\u003b\\u0027\\u00b7\\u47a3",
   3726     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3727     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3728     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3729     };
   3730     int loop;
   3731     if (U_FAILURE(status)) {
   3732         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3733         return;
   3734     }
   3735     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3736         // printf("looping %d\n", loop);
   3737         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3738         // RBBICharMonkey monkey;
   3739         RBBIWordMonkey monkey;
   3740 
   3741         int expected[50];
   3742         int expectedcount = 0;
   3743 
   3744         monkey.setText(ustr);
   3745         int i;
   3746         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3747             expected[expectedcount ++] = i;
   3748         }
   3749 
   3750         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3751     }
   3752     delete bi;
   3753 #endif
   3754 }
   3755 
   3756 void RBBITest::TestWordBoundary(void)
   3757 {
   3758     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3759     Locale        locale("en");
   3760     UErrorCode    status = U_ZERO_ERROR;
   3761     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3762     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3763     UChar         str[50];
   3764     static const char *strlist[] =
   3765     {
   3766     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3767     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3768     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3769     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3770     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3771     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3772     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3773     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3774     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3775     "\\u0027\\u11af\\U000e0057\\u0602",
   3776     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3777     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3778     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3779     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3780     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3781     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3782     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3783     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3784     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3785     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3786     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3787     "\\ua183\\u102d\\u0bec\\u003a",
   3788     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3789     "\\u003a\\u0e57\\u0fad\\u002e",
   3790     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3791     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3792     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3793     "\\u003a\\u0664\\u00b7\\u1fba",
   3794     "\\u003b\\u0027\\u00b7\\u47a3",
   3795     };
   3796     int loop;
   3797     if (U_FAILURE(status)) {
   3798         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3799         return;
   3800     }
   3801     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3802         // printf("looping %d\n", loop);
   3803         u_unescape(strlist[loop], str, 20);
   3804         UnicodeString ustr(str);
   3805         int forward[50];
   3806         int count = 0;
   3807 
   3808         bi->setText(ustr);
   3809         int prev = 0;
   3810         int i;
   3811         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3812             forward[count ++] = i;
   3813             if (i > prev) {
   3814                 int j;
   3815                 for (j = prev + 1; j < i; j ++) {
   3816                     if (bi->isBoundary(j)) {
   3817                         printStringBreaks(ustr, forward, count);
   3818                         errln("happy boundary test failed: expected %d not a boundary",
   3819                                j);
   3820                         return;
   3821                     }
   3822                 }
   3823             }
   3824             if (!bi->isBoundary(i)) {
   3825                 printStringBreaks(ustr, forward, count);
   3826                 errln("happy boundary test failed: expected %d a boundary",
   3827                        i);
   3828                 return;
   3829             }
   3830             prev = i;
   3831         }
   3832     }
   3833     delete bi;
   3834 }
   3835 
   3836 void RBBITest::TestLineBreaks(void)
   3837 {
   3838 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3839     Locale        locale("en");
   3840     UErrorCode    status = U_ZERO_ERROR;
   3841     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3842     const int32_t  STRSIZE = 50;
   3843     UChar         str[STRSIZE];
   3844     static const char *strlist[] =
   3845     {
   3846      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3847      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3848              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3849      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3850              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3851      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3852      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3853      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3854      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3855      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3856      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   3857      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3858      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3859      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3860      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3861      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3862      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3863      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3864      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3865      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3866      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   3867      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   3868      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   3869      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   3870      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   3871      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   3872      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   3873      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   3874      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   3875      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   3876      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   3877      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   3878      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   3879      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   3880      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   3881      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   3882      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   3883      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   3884      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   3885      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   3886      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   3887      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   3888          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   3889          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   3890          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   3891      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   3892          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   3893     };
   3894     int loop;
   3895     TEST_ASSERT_SUCCESS(status);
   3896     if (U_FAILURE(status)) {
   3897         return;
   3898     }
   3899     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3900         // printf("looping %d\n", loop);
   3901         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   3902         if (t >= STRSIZE) {
   3903             TEST_ASSERT(FALSE);
   3904             continue;
   3905         }
   3906 
   3907 
   3908         UnicodeString ustr(str);
   3909         RBBILineMonkey monkey;
   3910         if (U_FAILURE(monkey.deferredStatus)) {
   3911             continue;
   3912         }
   3913 
   3914         const int EXPECTEDSIZE = 50;
   3915         int expected[EXPECTEDSIZE];
   3916         int expectedcount = 0;
   3917 
   3918         monkey.setText(ustr);
   3919         int i;
   3920         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3921             if (expectedcount >= EXPECTEDSIZE) {
   3922                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3923                 return;
   3924             }
   3925             expected[expectedcount ++] = i;
   3926         }
   3927 
   3928         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3929     }
   3930     delete bi;
   3931 #endif
   3932 }
   3933 
   3934 void RBBITest::TestSentBreaks(void)
   3935 {
   3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3937     Locale        locale("en");
   3938     UErrorCode    status = U_ZERO_ERROR;
   3939     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   3940     UChar         str[200];
   3941     static const char *strlist[] =
   3942     {
   3943      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   3944      "This\n",
   3945      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   3946      "\"Sentence ending with a quote.\" Bye.",
   3947      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   3948      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   3949      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   3950      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   3951      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   3952      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   3953      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   3954              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   3955              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   3956              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   3957      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   3958              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   3959              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   3960              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   3961              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   3962              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   3963     };
   3964     int loop;
   3965     if (U_FAILURE(status)) {
   3966         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3967         return;
   3968     }
   3969     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3970         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3971         UnicodeString ustr(str);
   3972 
   3973         RBBISentMonkey monkey;
   3974         if (U_FAILURE(monkey.deferredStatus)) {
   3975             continue;
   3976         }
   3977 
   3978         const int EXPECTEDSIZE = 50;
   3979         int expected[EXPECTEDSIZE];
   3980         int expectedcount = 0;
   3981 
   3982         monkey.setText(ustr);
   3983         int i;
   3984         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3985             if (expectedcount >= EXPECTEDSIZE) {
   3986                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3987                 return;
   3988             }
   3989             expected[expectedcount ++] = i;
   3990         }
   3991 
   3992         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3993     }
   3994     delete bi;
   3995 #endif
   3996 }
   3997 
   3998 void RBBITest::TestMonkey(char *params) {
   3999 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4000 
   4001     UErrorCode     status    = U_ZERO_ERROR;
   4002     int32_t        loopCount = 500;
   4003     int32_t        seed      = 1;
   4004     UnicodeString  breakType = "all";
   4005     Locale         locale("en");
   4006     UBool          useUText  = FALSE;
   4007 
   4008     if (quick == FALSE) {
   4009         loopCount = 10000;
   4010     }
   4011 
   4012     if (params) {
   4013         UnicodeString p(params);
   4014         loopCount = getIntParam("loop", p, loopCount);
   4015         seed      = getIntParam("seed", p, seed);
   4016 
   4017         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4018         if (m.find()) {
   4019             breakType = m.group(1, status);
   4020             m.reset();
   4021             p = m.replaceFirst("", status);
   4022         }
   4023 
   4024         RegexMatcher u(" *utext", p, 0, status);
   4025         if (u.find()) {
   4026             useUText = TRUE;
   4027             u.reset();
   4028             p = u.replaceFirst("", status);
   4029         }
   4030 
   4031 
   4032         // m.reset(p);
   4033         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4034             // Each option is stripped out of the option string as it is processed.
   4035             // All options have been checked.  The option string should have been completely emptied..
   4036             char buf[100];
   4037             p.extract(buf, sizeof(buf), NULL, status);
   4038             buf[sizeof(buf)-1] = 0;
   4039             errln("Unrecognized or extra parameter:  %s\n", buf);
   4040             return;
   4041         }
   4042 
   4043     }
   4044 
   4045     if (breakType == "char" || breakType == "all") {
   4046         RBBICharMonkey  m;
   4047         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4048         if (U_SUCCESS(status)) {
   4049             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4050             if (breakType == "all" && useUText==FALSE) {
   4051                 // Also run a quick test with UText when "all" is specified
   4052                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4053             }
   4054         }
   4055         else {
   4056             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4057         }
   4058         delete bi;
   4059     }
   4060 
   4061     if (breakType == "word" || breakType == "all") {
   4062         logln("Word Break Monkey Test");
   4063         RBBIWordMonkey  m;
   4064         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4065         if (U_SUCCESS(status)) {
   4066             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4067         }
   4068         else {
   4069             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4070         }
   4071         delete bi;
   4072     }
   4073 
   4074     if (breakType == "line" || breakType == "all") {
   4075         logln("Line Break Monkey Test");
   4076         RBBILineMonkey  m;
   4077         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4078         if (loopCount >= 10) {
   4079             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4080         }
   4081         if (U_SUCCESS(status)) {
   4082             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4083         }
   4084         else {
   4085             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4086         }
   4087         delete bi;
   4088     }
   4089 
   4090     if (breakType == "sent" || breakType == "all"  ) {
   4091         logln("Sentence Break Monkey Test");
   4092         RBBISentMonkey  m;
   4093         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4094         if (loopCount >= 10) {
   4095             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4096         }
   4097         if (U_SUCCESS(status)) {
   4098             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4099         }
   4100         else {
   4101             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4102         }
   4103         delete bi;
   4104     }
   4105 
   4106 #endif
   4107 }
   4108 
   4109 //
   4110 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4111 //    Parameters:
   4112 //       bi      - the break iterator to use
   4113 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4114 //       name    - Name of test (char, word, etc.) for use in error messages
   4115 //       seed    - Seed for starting random number generator (parameter from user)
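   4116 //       numIterations - number of random test strings to check; -1 means run forever
   4117 //       useUText - if TRUE, set the iterator's text through a UText instead of a UnicodeString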
   4118 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4119                          int32_t numIterations, UBool useUText) {
   4120 
   4121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4122 
   4123     const int32_t    TESTSTRINGLEN = 500;
   4124     UnicodeString    testText;
   4125     int32_t          numCharClasses;
   4126     UVector          *chClasses;
   4127     int              expected[TESTSTRINGLEN*2 + 1];
   4128     int              expectedCount = 0;
   4129     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4130     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4131     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4132     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4133     char             followingBreaks[TESTSTRINGLEN*2+1];
   4134     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4135     int              i;
   4136     int              loopCount = 0;
   4137 
   4138     m_seed = seed;
   4139 
   4140     numCharClasses = mk.charClasses()->size();
   4141     chClasses      = mk.charClasses();
   4142 
   4143     // Check for errors that occured during the construction of the MonkeyKind object.
   4144     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4145     //  and is not visible outside of RBBITest :-(
   4146     if (U_FAILURE(mk.deferredStatus)) {
   4147         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4148         return;
   4149     }
   4150 
   4151     // Verify that the character classes all have at least one member.
   4152     for (i=0; i<numCharClasses; i++) {
   4153         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4154         if (s == NULL || s->size() == 0) {
   4155             errln("Character Class #%d is null or of zero size.", i);
   4156             return;
   4157         }
   4158     }
   4159 
   4160     while (loopCount < numIterations || numIterations == -1) {
   4161         if (numIterations == -1 && loopCount % 10 == 0) {
   4162             // If test is running in an infinite loop, display a periodic tic so
   4163             //   we can tell that it is making progress.
   4164             fprintf(stderr, ".");
   4165         }
   4166         // Save current random number seed, so that we can recreate the random numbers
   4167         //   for this loop iteration in event of an error.
   4168         seed = m_seed;
   4169 
   4170         // Populate a test string with data.
   4171         testText.truncate(0);
   4172         for (i=0; i<TESTSTRINGLEN; i++) {
   4173             int32_t  aClassNum = m_rand() % numCharClasses;
   4174             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4175             int32_t   charIdx = m_rand() % classSet->size();
   4176             UChar32   c = classSet->charAt(charIdx);
   4177             if (c < 0) {   // TODO:  deal with sets containing strings.
   4178                 errln("c < 0");
   4179                 break;
   4180             }
   4181             testText.append(c);
   4182         }
   4183 
   4184         // Calculate the expected results for this test string.
   4185         mk.setText(testText);
   4186         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4187         expectedBreaks[0] = 1;
   4188         int32_t breakPos = 0;
   4189         expectedCount = 0;
   4190         for (;;) {
   4191             breakPos = mk.next(breakPos);
   4192             if (breakPos == -1) {
   4193                 break;
   4194             }
   4195             if (breakPos > testText.length()) {
   4196                 errln("breakPos > testText.length()");
   4197             }
   4198             expectedBreaks[breakPos] = 1;
   4199             U_ASSERT(expectedCount<testText.length());
   4200             expected[expectedCount ++] = breakPos;
   4201             (void)expected;   // Set but not used warning.
   4202                               // TODO (andy): check it out.
   4203         }
   4204 
   4205         // Find the break positions using forward iteration
   4206         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4207         if (useUText) {
   4208             UErrorCode status = U_ZERO_ERROR;
   4209             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4210             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4211             bi->setText(testUText, status);
   4212             TEST_ASSERT_SUCCESS(status);
   4213             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4214                                       //  This UText can be closed immediately, so long as the
   4215                                       //  testText string continues to exist.
   4216         } else {
   4217             bi->setText(testText);
   4218         }
   4219 
   4220         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4221             if (i < 0 || i > testText.length()) {
   4222                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4223                 break;
   4224             }
   4225             forwardBreaks[i] = 1;
   4226         }
   4227 
   4228         // Find the break positions using reverse iteration
   4229         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4230         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4231             if (i < 0 || i > testText.length()) {
   4232                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4233                 break;
   4234             }
   4235             reverseBreaks[i] = 1;
   4236         }
   4237 
   4238         // Find the break positions using isBoundary() tests.
   4239         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4240         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4241         for (i=0; i<=testText.length(); i++) {
   4242             isBoundaryBreaks[i] = bi->isBoundary(i);
   4243         }
   4244 
   4245 
   4246         // Find the break positions using the following() function.
   4247         // printf(".");
   4248         memset(followingBreaks, 0, sizeof(followingBreaks));
   4249         int32_t   lastBreakPos = 0;
   4250         followingBreaks[0] = 1;
   4251         for (i=0; i<testText.length(); i++) {
   4252             breakPos = bi->following(i);
   4253             if (breakPos <= i ||
   4254                 breakPos < lastBreakPos ||
   4255                 breakPos > testText.length() ||
   4256                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4257                 errln("%s break monkey test: "
   4258                     "Out of range value returned by BreakIterator::following().\n"
   4259                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4260                          name, seed, i, breakPos, lastBreakPos);
   4261                 break;
   4262             }
   4263             followingBreaks[breakPos] = 1;
   4264             lastBreakPos = breakPos;
   4265         }
   4266 
   4267         // Find the break positions using the preceding() function.
   4268         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4269         lastBreakPos = testText.length();
   4270         precedingBreaks[testText.length()] = 1;
   4271         for (i=testText.length(); i>0; i--) {
   4272             breakPos = bi->preceding(i);
   4273             if (breakPos >= i ||
   4274                 breakPos > lastBreakPos ||
   4275                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4276                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4277                 errln("%s break monkey test: "
   4278                     "Out of range value returned by BreakIterator::preceding().\n"
   4279                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4280                     name,  i, breakPos, lastBreakPos);
   4281                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4282                     precedingBreaks[i] = 2;   // Forces an error.
   4283                 }
   4284             } else {
   4285                 if (breakPos >= 0) {
   4286                     precedingBreaks[breakPos] = 1;
   4287                 }
   4288                 lastBreakPos = breakPos;
   4289             }
   4290         }
   4291 
   4292         // Compare the expected and actual results.
   4293         for (i=0; i<=testText.length(); i++) {
   4294             const char *errorType = NULL;
   4295             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4296                 errorType = "next()";
   4297             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4298                 errorType = "previous()";
   4299             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4300                 errorType = "isBoundary()";
   4301             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4302                 errorType = "following()";
   4303             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4304                 errorType = "preceding()";
   4305             }
   4306 
   4307 
   4308             if (errorType != NULL) {
   4309                 // Format a range of the test text that includes the failure as
   4310                 //  a data item that can be included in the rbbi test data file.
   4311 
   4312                 // Start of the range is the last point where expected and actual results
   4313                 //   both agreed that there was a break position.
   4314                 int startContext = i;
   4315                 int32_t count = 0;
   4316                 for (;;) {
   4317                     if (startContext==0) { break; }
   4318                     startContext --;
   4319                     if (expectedBreaks[startContext] != 0) {
   4320                         if (count == 2) break;
   4321                         count ++;
   4322                     }
   4323                 }
   4324 
   4325                 // End of range is two expected breaks past the start position.
   4326                 int endContext = i + 1;
   4327                 int ci;
   4328                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4329                     for (;;) {
   4330                         if (endContext >= testText.length()) {break;}
   4331                         if (expectedBreaks[endContext-1] != 0) {
   4332                             if (count == 0) break;
   4333                             count --;
   4334                         }
   4335                         endContext ++;
   4336                     }
   4337                 }
   4338 
   4339                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4340                 UnicodeString errorText = "<data>";
   4341                 /***if (strcmp(errorType, "next()") == 0) {
   4342                     startContext = 0;
   4343                     endContext = testText.length();
   4344 
   4345                     printStringBreaks(testText, expected, expectedCount);
   4346                 }***/
   4347 
   4348                 for (ci=startContext; ci<endContext;) {
   4349                     UnicodeString hexChars("0123456789abcdef");
   4350                     UChar32  c;
   4351                     int      bn;
   4352                     c = testText.char32At(ci);
   4353                     if (ci == i) {
   4354                         // This is the location of the error.
   4355                         errorText.append("<?>");
   4356                     } else if (expectedBreaks[ci] != 0) {
   4357                         // This a non-error expected break position.
   4358                         errorText.append("\\");
   4359                     }
   4360                     if (c < 0x10000) {
   4361                         errorText.append("\\u");
   4362                         for (bn=12; bn>=0; bn-=4) {
   4363                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4364                         }
   4365                     } else {
   4366                         errorText.append("\\U");
   4367                         for (bn=28; bn>=0; bn-=4) {
   4368                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4369                         }
   4370                     }
   4371                     ci = testText.moveIndex32(ci, 1);
   4372                 }
   4373                 errorText.append("\\");
   4374                 errorText.append("</data>\n");
   4375 
   4376                 // Output the error
   4377                 char  charErrorTxt[500];
   4378                 UErrorCode status = U_ZERO_ERROR;
   4379                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4380                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4381                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4382 
   4383                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4384                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4385                     errorType, seed, i, charErrorTxt);
   4386                 break;
   4387             }
   4388         }
   4389 
   4390         loopCount++;
   4391     }
   4392 #endif
   4393 }
   4394 
   4395 
   4396 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4397 //             This test checks the initial patch,
   4398 //             which is to just keep it from crashing.  Correct word boundaries
   4399 //             await a proper fix to the dictionary code.
   4400 //
   4401 void RBBITest::TestBug5532(void)  {
   4402    // Text includes a mixture of Thai and Latin.
   4403    const unsigned char utf8Data[] = {
   4404            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4405            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4406            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4407            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4408            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4409            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4410            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4411            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4412            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4413            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4414            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4415 
   4416     UErrorCode status = U_ZERO_ERROR;
   4417     UText utext=UTEXT_INITIALIZER;
   4418     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4419     TEST_ASSERT_SUCCESS(status);
   4420 
   4421     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4422     TEST_ASSERT_SUCCESS(status);
   4423     if (U_SUCCESS(status)) {
   4424         bi->setText(&utext, status);
   4425         TEST_ASSERT_SUCCESS(status);
   4426 
   4427         int32_t breakCount = 0;
   4428         int32_t previousBreak = -1;
   4429         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4430             // For now, just make sure that the break iterator doesn't hang.
   4431             TEST_ASSERT(previousBreak < bi->current());
   4432             previousBreak = bi->current();
   4433         }
   4434         TEST_ASSERT(breakCount > 0);
   4435     }
   4436     delete bi;
   4437     utext_close(&utext);
   4438 }
   4439 
   4440 
   4441 void RBBITest::TestBug9983(void)  {
   4442     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4443                                        "\\uFF65"  //   Other
   4444                                        "\\u309C"  //   Katakana
   4445                                        "\\uFF9F"  //   Extend
   4446                                        "\\uFF65"  //   Other
   4447                                        "\\u0020"  //   Other
   4448                                        "\\u0000").unescape();
   4449 
   4450     UErrorCode status = U_ZERO_ERROR;
   4451     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4452         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4453     TEST_ASSERT_SUCCESS(status);
   4454     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
   4455         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
   4456     TEST_ASSERT_SUCCESS(status);
   4457     if (U_FAILURE(status)) {
   4458         return;
   4459     }
   4460     int32_t offset, rstatus, iterationCount;
   4461 
   4462     brkiter->setText(text);
   4463     brkiter->last();
   4464     iterationCount = 0;
   4465     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4466         iterationCount++;
   4467         rstatus = brkiter->getRuleStatus();
   4468         (void)rstatus;     // Suppress set but not used warning.
   4469         if (iterationCount >= 10) {
   4470            break;
   4471         }
   4472     }
   4473     TEST_ASSERT(iterationCount == 6);
   4474 
   4475     brkiterPOSIX->setText(text);
   4476     brkiterPOSIX->last();
   4477     iterationCount = 0;
   4478     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
   4479         iterationCount++;
   4480         rstatus = brkiterPOSIX->getRuleStatus();
   4481         (void)rstatus;     // Suppress set but not used warning.
   4482         if (iterationCount >= 10) {
   4483            break;
   4484         }
   4485     }
   4486     TEST_ASSERT(iterationCount == 6);
   4487 }
   4488 
   4489 
   4490 //
   4491 //  TestDebug    -  A place-holder test for debugging purposes.
   4492 //                  For putting in fragments of other tests that can be invoked
   4493 //                  for tracing  without a lot of unwanted extra stuff happening.
   4494 //
   4495 void RBBITest::TestDebug(void) {
   4496 #if 0
   4497     UErrorCode   status = U_ZERO_ERROR;
   4498     int pos = 0;
   4499     int ruleStatus = 0;
   4500 
   4501     RuleBasedBreakIterator* bi =
   4502        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4503        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4504        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4505     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4506     // UnicodeString s("Aaa.  Bcd");
   4507     s = s.unescape();
   4508     bi->setText(s);
   4509     UBool r = bi->isBoundary(8);
   4510     printf("%s", r?"true":"false");
   4511     return;
   4512     pos = bi->last();
   4513     do {
   4514         // ruleStatus = bi->getRuleStatus();
   4515         printf("%d\t%d\n", pos, ruleStatus);
   4516         pos = bi->previous();
   4517     } while (pos != BreakIterator::DONE);
   4518 #endif
   4519 }
   4520 
   4521 void RBBITest::TestProperties() {
   4522     UErrorCode errorCode = U_ZERO_ERROR;
   4523     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4524     if (!prependSet.isEmpty()) {
   4525         errln(
   4526             "[:GCB=Prepend:] is not empty any more. "
   4527             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4528             "change this test to the opposite condition.");
   4529     }
   4530 }
   4531 
   4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4533