Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 #include "unicode/regex.h"
     28 #endif
     29 #include "unicode/ustring.h"
     30 #include "unicode/utext.h"
     31 #include "intltest.h"
     32 #include "rbbitst.h"
     33 #include <string.h>
     34 #include "charstr.h"
     35 #include "uvector.h"
     36 #include "uvectr32.h"
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include "unicode/numfmt.h"
     40 #include "unicode/uscript.h"
     41 #include "cmemory.h"
     42 
     43 #define TEST_ASSERT(x) {if (!(x)) { \
     44     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     45 
     46 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     47     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     48 
     49 
     50 //---------------------------------------------
     51 // runIndexedTest
     52 //---------------------------------------------
     53 
     54 
     55 //  Note:  Before adding new tests to this file, check whether the desired test data can
     56 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
     57 //         it's much less work than writing a new test, diagnostic output in the event of failures
     58 //         is good, and the test data file is shared with ICU4J, so eventually the test
     59 //         will run there as well, without additional effort.
     60 
     61 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     62 {
     63     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     64 
     65     switch (index) {
     66 #if !UCONFIG_NO_FILE_IO
     67         case 0: name = "TestBug4153072";
     68             if(exec) TestBug4153072();                         break;
     69 #else
     70         case 0: name = "skip";
     71             break;
     72 #endif
     73 
     74         case 1: name = "skip";
     75             break;
     76         case 2: name = "TestStatusReturn";
     77             if(exec) TestStatusReturn();                       break;
     78 
     79 #if !UCONFIG_NO_FILE_IO
     80         case 3: name = "TestUnicodeFiles";
     81             if(exec) TestUnicodeFiles();                       break;
     82         case 4: name = "TestEmptyString";
     83             if(exec) TestEmptyString();                        break;
     84 #else
     85         case 3: case 4: name = "skip";
     86             break;
     87 #endif
     88 
     89         case 5: name = "TestGetAvailableLocales";
     90             if(exec) TestGetAvailableLocales();                break;
     91 
     92         case 6: name = "TestGetDisplayName";
     93             if(exec) TestGetDisplayName();                     break;
     94 
     95 #if !UCONFIG_NO_FILE_IO
     96         case 7: name = "TestEndBehaviour";
     97             if(exec) TestEndBehaviour();                       break;
     98         case 8: case 9: case 10: name = "skip";
     99              break;
    100         case 11: name = "TestWordBreaks";
    101              if(exec) TestWordBreaks();                        break;
    102         case 12: name = "TestWordBoundary";
    103              if(exec) TestWordBoundary();                      break;
    104         case 13: name = "TestLineBreaks";
    105              if(exec) TestLineBreaks();                        break;
    106         case 14: name = "TestSentBreaks";
    107              if(exec) TestSentBreaks();                        break;
    108         case 15: name = "TestExtended";
    109              if(exec) TestExtended();                          break;
    110 #else
    111         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    112              break;
    113 #endif
    114 
    115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    116         case 16:
    117             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
    118 #else
    119         case 16:
    120              name = "skip";                                    break;
    121 #endif
    122 
    123 #if !UCONFIG_NO_FILE_IO
    124         case 17: name = "TestBug3818";
    125             if(exec) TestBug3818();                            break;
    126 #else
    127         case 17: name = "skip";
    128             break;
    129 #endif
    130 
    131         case 18: name = "skip";
    132             break;
    133         case 19: name = "TestDebug";
    134             if(exec) TestDebug();                              break;
    135         case 20: name = "skip";
    136             break;
    137 
    138 #if !UCONFIG_NO_FILE_IO
    139         case 21: name = "TestBug5775";
    140             if (exec) TestBug5775();                           break;
    141 #else
    142         case 21: name = "skip";
    143             break;
    144 #endif
    145 
    146         case 22: name = "TestBug9983";
    147             if (exec) TestBug9983();                           break;
    148         case 23: name = "TestDictRules";
    149             if (exec) TestDictRules();                         break;
    150         case 24: name = "TestBug5532";
    151             if (exec) TestBug5532();                           break;
    152         default: name = ""; break; //needed to end loop
    153     }
    154 }
    155 
    156 
    157 //---------------------------------------------------------------------------
    158 //
    159 //   class BITestData   Holds a set of Break iterator test data and results
    160 //                      Includes
    161 //                         - the string data to be broken
    162 //                         - a vector of the expected break positions.
    163 //                         - a vector of source line numbers for the data,
    164 //                               (to help see where errors occured.)
    165 //                         - The expected break tag values.
    166 //                         - Vectors of actual break positions and tag values.
    167 //                         - Functions for comparing actual with expected and
    168 //                            reporting errors.
    169 //
    170 //----------------------------------------------------------------------------
    171 class BITestData {
    172 public:
    173     UnicodeString    fDataToBreak;
    174     UVector          fExpectedBreakPositions;
    175     UVector          fExpectedTags;
    176     UVector          fLineNum;
    177     UVector          fActualBreakPositions;   // Test Results.
    178     UVector          fActualTags;
    179 
    180     BITestData(UErrorCode &status);
    181     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    182     void             checkResults(const char *heading, RBBITest *test);
    183     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    184     void             clearResults();
    185 };
    186 
    187 //
    188 // Constructor.
    189 //
    190 BITestData::BITestData(UErrorCode &status)
    191 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    192   fActualTags(status)
    193 {
    194 }
    195 
    196 //
    197 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    198 //                 The macro form collects the line number, which is helpful
    199 //                 when tracking down failures.
    200 //
    201 //                 A null data item is inserted at the start of each test's data
    202 //                  to put the starting zero into the data list.  The position saved for
    203 //                  each non-null item is its ending position.
    204 //
    205 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    206 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    207     if (U_FAILURE(status)) {return;}
    208     if (data != NULL) {
    209         fDataToBreak.append(CharsToUnicodeString(data));
    210     }
    211     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    212     fExpectedTags.addElement(tag, status);
    213     fLineNum.addElement(lineNum, status);
    214 }
    215 
    216 
    217 //
    218 //  checkResults.   Compare the actual and expected break positions, report any differences.
    219 //
    220 void BITestData::checkResults(const char *heading, RBBITest *test) {
    221     int32_t   expectedIndex = 0;
    222     int32_t   actualIndex = 0;
    223 
    224     for (;;) {
    225         // If we've run through both the expected and actual results vectors, we're done.
    226         //   break out of the loop.
    227         if (expectedIndex >= fExpectedBreakPositions.size() &&
    228             actualIndex   >= fActualBreakPositions.size()) {
    229             break;
    230         }
    231 
    232 
    233         if (expectedIndex >= fExpectedBreakPositions.size()) {
    234             err(heading, test, expectedIndex-1, actualIndex);
    235             actualIndex++;
    236             continue;
    237         }
    238 
    239         if (actualIndex >= fActualBreakPositions.size()) {
    240             err(heading, test, expectedIndex, actualIndex-1);
    241             expectedIndex++;
    242             continue;
    243         }
    244 
    245         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    246             err(heading, test, expectedIndex, actualIndex);
    247             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    248             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    249                 actualIndex++;
    250             } else {
    251                 expectedIndex++;
    252             }
    253             continue;
    254         }
    255 
    256         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    257             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    258                 heading, fLineNum.elementAt(expectedIndex),
    259                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    260         }
    261 
    262         actualIndex++;
    263         expectedIndex++;
    264     }
    265 }
    266 
    267 //
    268 //  err   -  An error was found.  Report it, along with information about where the
    269 //                                incorrectly broken test data appeared in the source file.
    270 //
    271 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    272 {
    273     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    274     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    275     int32_t   o        = 0;
    276     int32_t   line     = fLineNum.elementAti(expectedIdx);
    277     if (expectedIdx > 0) {
    278         // The line numbers are off by one because a premature break occurs somewhere
    279         //    within the previous item, rather than at the start of the current (expected) item.
    280         //    We want to report the offset of the unexpected break from the start of
    281         //      this previous item.
    282         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    283     }
    284     if (actual < expected) {
    285         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    286     } else {
    287         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    288     }
    289 }
    290 
    291 
    292 void BITestData::clearResults() {
    293     fActualBreakPositions.removeAllElements();
    294     fActualTags.removeAllElements();
    295 }
    296 
    297 
    298 //--------------------------------------------------------------------------------------
    299 //
    300 //    RBBITest    constructor and destructor
    301 //
    302 //--------------------------------------------------------------------------------------
    303 
    304 RBBITest::RBBITest() {
    305 }
    306 
    307 
    308 RBBITest::~RBBITest() {
    309 }
    310 
    311 //-----------------------------------------------------------------------------------
    312 //
    313 //   Test for status {tag} return value from break rules.
    314 //        TODO:  a more thorough test.
    315 //
    316 //-----------------------------------------------------------------------------------
    317 void RBBITest::TestStatusReturn() {
    318      UnicodeString rulesString1("$Letters = [:L:];\n"
    319                                   "$Numbers = [:N:];\n"
    320                                   "$Letters+{1};\n"
    321                                   "$Numbers+{2};\n"
    322                                   "Help\\ {4}/me\\!;\n"
    323                                   "[^$Letters $Numbers];\n"
    324                                   "!.*;\n", -1, US_INV);
    325      UnicodeString testString1  = "abc123..abc Help me Help me!";
    326                                 // 01234567890123456789012345678
    327      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    328      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    329 
    330      UErrorCode status=U_ZERO_ERROR;
    331      UParseError    parseError;
    332 
    333      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    334      if(U_FAILURE(status)) {
    335          dataerrln("FAIL : in construction - %s", u_errorName(status));
    336      } else {
    337          int32_t  pos;
    338          int32_t  i = 0;
    339          bi->setText(testString1);
    340          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    341              if (pos != bounds1[i]) {
    342                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    343                  break;
    344              }
    345 
    346              int tag = bi->getRuleStatus();
    347              if (tag != brkStatus[i]) {
    348                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    349                  break;
    350              }
    351              i++;
    352          }
    353      }
    354      delete bi;
    355 }
    356 
    357 
    358 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
    359     UErrorCode status = U_ZERO_ERROR;
    360     char name[100];
    361     printf("code    alpha extend alphanum type word sent line name\n");
    362     int nextExpectedIndex = 0;
    363     utext_setNativeIndex(tstr, 0);
    364     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
    365         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
    366             printf("------------------------------------------------ %d\n", j);
    367             ++nextExpectedIndex;
    368         }
    369 
    370         UChar32 c = utext_next32(tstr);
    371         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    372         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    373                            u_isUAlphabetic(c),
    374                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    375                            u_isalnum(c),
    376                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    377                                                   u_charType(c),
    378                                                   U_SHORT_PROPERTY_NAME),
    379                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    380                                                   u_getIntPropertyValue(c,
    381                                                           UCHAR_WORD_BREAK),
    382                                                   U_SHORT_PROPERTY_NAME),
    383                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    384                                    u_getIntPropertyValue(c,
    385                                            UCHAR_SENTENCE_BREAK),
    386                                    U_SHORT_PROPERTY_NAME),
    387                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    388                                    u_getIntPropertyValue(c,
    389                                            UCHAR_LINE_BREAK),
    390                                    U_SHORT_PROPERTY_NAME),
    391                            name);
    392     }
    393 }
    394 
    395 
    396 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
    397    UErrorCode status = U_ZERO_ERROR;
    398    UText *tstr = NULL;
    399    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
    400    if (U_FAILURE(status)) {
    401        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
    402        return;
    403     }
    404    printStringBreaks(tstr, expected, expectedCount);
    405    utext_close(tstr);
    406 }
    407 
    408 
    409 void RBBITest::TestBug3818() {
    410     UErrorCode  status = U_ZERO_ERROR;
    411 
    412     // Four Thai words...
    413     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    414                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    415     UnicodeString  thaiStr(thaiWordData);
    416 
    417     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
    418     if (U_FAILURE(status) || bi == NULL) {
    419         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    420         return;
    421     }
    422     bi->setText(thaiStr);
    423 
    424     int32_t  startOfSecondWord = bi->following(1);
    425     if (startOfSecondWord != 4) {
    426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    427             __FILE__, __LINE__, startOfSecondWord);
    428     }
    429     startOfSecondWord = bi->following(0);
    430     if (startOfSecondWord != 4) {
    431         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    432             __FILE__, __LINE__, startOfSecondWord);
    433     }
    434     delete bi;
    435 }
    436 
    437 //----------------------------------------------------------------------------
    438 //
    439 // generalIteratorTest      Given a break iterator and a set of test data,
    440 //                          Run the tests and report the results.
    441 //
    442 //----------------------------------------------------------------------------
    443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    444 {
    445 
    446     bi.setText(td.fDataToBreak);
    447 
    448     testFirstAndNext(bi, td);
    449 
    450     testLastAndPrevious(bi, td);
    451 
    452     testFollowing(bi, td);
    453     testPreceding(bi, td);
    454     testIsBoundary(bi, td);
    455     doMultipleSelectionTest(bi, td);
    456 }
    457 
    458 
    459 //
    460 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    461 //                       kind of loop.
    462 //
    463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    464 {
    465     UErrorCode  status = U_ZERO_ERROR;
    466     int32_t     p;
    467     int32_t     lastP = -1;
    468     int32_t     tag;
    469 
    470     logln("Test first and next");
    471     bi.setText(td.fDataToBreak);
    472     td.clearResults();
    473 
    474     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    475         td.fActualBreakPositions.addElement(p, status);  // Save result.
    476         tag = bi.getRuleStatus();
    477         td.fActualTags.addElement(tag, status);
    478         if (p <= lastP) {
    479             // If the iterator is not making forward progress, stop.
    480             //  No need to raise an error here, it'll be detected in the normal check of results.
    481             break;
    482         }
    483         lastP = p;
    484     }
    485     td.checkResults("testFirstAndNext", this);
    486 }
    487 
    488 
    489 //
    490 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    491 //
    492 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    493 {
    494     UErrorCode  status = U_ZERO_ERROR;
    495     int32_t     p;
    496     int32_t     lastP  = 0x7ffffffe;
    497     int32_t     tag;
    498 
    499     logln("Test last and previous");
    500     bi.setText(td.fDataToBreak);
    501     td.clearResults();
    502 
    503     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    504         // Save break position.  Insert it at start of vector of results, shoving
    505         //    already-saved results further towards the end.
    506         td.fActualBreakPositions.insertElementAt(p, 0, status);
    507         // bi.previous();   // TODO:  Why does this fix things up????
    508         // bi.next();
    509         tag = bi.getRuleStatus();
    510         td.fActualTags.insertElementAt(tag, 0, status);
    511         if (p >= lastP) {
    512             // If the iterator is not making progress, stop.
    513             //  No need to raise an error here, it'll be detected in the normal check of results.
    514             break;
    515         }
    516         lastP = p;
    517     }
    518     td.checkResults("testLastAndPrevious", this);
    519 }
    520 
    521 
    522 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    523 {
    524     UErrorCode  status = U_ZERO_ERROR;
    525     int32_t     p;
    526     int32_t     tag;
    527     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    528                                  //   cannot be -1; that is returned for DONE.
    529     int         i;
    530 
    531     logln("testFollowing():");
    532     bi.setText(td.fDataToBreak);
    533     td.clearResults();
    534 
    535     // Save the starting point, since we won't get that out of following.
    536     p = bi.first();
    537     td.fActualBreakPositions.addElement(p, status);  // Save result.
    538     tag = bi.getRuleStatus();
    539     td.fActualTags.addElement(tag, status);
    540 
    541     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    542         p = bi.following(i);
    543         if (p != lastP) {
    544             if (p == RuleBasedBreakIterator::DONE) {
    545                 break;
    546             }
    547             // We've reached a new break position.  Save it.
    548             td.fActualBreakPositions.addElement(p, status);  // Save result.
    549             tag = bi.getRuleStatus();
    550             td.fActualTags.addElement(tag, status);
    551             lastP = p;
    552         }
    553     }
    554     // The loop normally exits by means of the break in the middle.
    555     // Make sure that the index was at the correct position for the break iterator to have
    556     //   returned DONE.
    557     if (i != td.fDataToBreak.length()) {
    558         errln("testFollowing():  iterator returned DONE prematurely.");
    559     }
    560 
    561     // Full check of all results.
    562     td.checkResults("testFollowing", this);
    563 }
    564 
    565 
    566 
    567 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    568     UErrorCode  status = U_ZERO_ERROR;
    569     int32_t     p;
    570     int32_t     tag;
    571     int32_t     lastP  = 0x7ffffffe;
    572     int         i;
    573 
    574     logln("testPreceding():");
    575     bi.setText(td.fDataToBreak);
    576     td.clearResults();
    577 
    578     p = bi.last();
    579     td.fActualBreakPositions.addElement(p, status);
    580     tag = bi.getRuleStatus();
    581     td.fActualTags.addElement(tag, status);
    582 
    583     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    584         p = bi.preceding(i);
    585         if (p != lastP) {
    586             if (p == RuleBasedBreakIterator::DONE) {
    587                 break;
    588             }
    589             // We've reached a new break position.  Save it.
    590             td.fActualBreakPositions.insertElementAt(p, 0, status);
    591             lastP = p;
    592             tag = bi.getRuleStatus();
    593             td.fActualTags.insertElementAt(tag, 0, status);
    594         }
    595     }
    596     // The loop normally exits by means of the break in the middle.
    597     // Make sure that the index was at the correct position for the break iterator to have
    598     //   returned DONE.
    599     if (i != 0) {
    600         errln("testPreceding():  iterator returned DONE prematurely.");
    601     }
    602 
    603     // Full check of all results.
    604     td.checkResults("testPreceding", this);
    605 }
    606 
    607 
    608 
    609 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
    610     UErrorCode  status = U_ZERO_ERROR;
    611     int         i;
    612     int32_t     tag;
    613 
    614     logln("testIsBoundary():");
    615     bi.setText(td.fDataToBreak);
    616     td.clearResults();
    617 
    618     for (i = 0; i <= td.fDataToBreak.length(); i++) {
    619         if (bi.isBoundary(i)) {
    620             td.fActualBreakPositions.addElement(i, status);  // Save result.
    621             tag = bi.getRuleStatus();
    622             td.fActualTags.addElement(tag, status);
    623         }
    624     }
    625     td.checkResults("testIsBoundary: ", this);
    626 }
    627 
    628 
    629 
    630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
    631 {
    632     iterator.setText(td.fDataToBreak);
    633 
    634     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    635     int32_t offset = iterator.first();
    636     int32_t testOffset;
    637     int32_t count = 0;
    638 
    639     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
    640 
    641     if (*testIterator != iterator)
    642         errln("clone() or operator!= failed: two clones compared unequal");
    643 
    644     do {
    645         testOffset = testIterator->first();
    646         testOffset = testIterator->next(count);
    647         if (offset != testOffset)
    648             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    649 
    650         if (offset != RuleBasedBreakIterator::DONE) {
    651             count++;
    652             offset = iterator.next();
    653 
    654             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
    655                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
    656                 if (count > 10000 || offset == -1) {
    657                     errln("operator== failed too many times. Stopping test.");
    658                     if (offset == -1) {
    659                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
    660                     }
    661                     return;
    662                 }
    663             }
    664         }
    665     } while (offset != RuleBasedBreakIterator::DONE);
    666 
    667     // now do it backwards...
    668     offset = iterator.last();
    669     count = 0;
    670 
    671     do {
    672         testOffset = testIterator->last();
    673         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
    674         if (offset != testOffset)
    675             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
    676 
    677         if (offset != RuleBasedBreakIterator::DONE) {
    678             count--;
    679             offset = iterator.previous();
    680         }
    681     } while (offset != RuleBasedBreakIterator::DONE);
    682 
    683     delete testIterator;
    684 }
    685 
    686 
    687 //---------------------------------------------
    688 //
    689 //     other tests
    690 //
    691 //---------------------------------------------
    692 void RBBITest::TestEmptyString()
    693 {
    694     UnicodeString text = "";
    695     UErrorCode status = U_ZERO_ERROR;
    696 
    697     BITestData x(status);
    698     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
    699     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    700     if (U_FAILURE(status))
    701     {
    702         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
    703         return;
    704     }
    705     generalIteratorTest(*bi, x);
    706     delete bi;
    707 }
    708 
    709 void RBBITest::TestGetAvailableLocales()
    710 {
    711     int32_t locCount = 0;
    712     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
    713 
    714     if (locCount == 0)
    715         dataerrln("getAvailableLocales() returned an empty list!");
    716     // Just make sure that it's returning good memory.
    717     int32_t i;
    718     for (i = 0; i < locCount; ++i) {
    719         logln(locList[i].getName());
    720     }
    721 }
    722 
    723 //Testing the BreakIterator::getDisplayName() function
    724 void RBBITest::TestGetDisplayName()
    725 {
    726     UnicodeString   result;
    727 
    728     BreakIterator::getDisplayName(Locale::getUS(), result);
    729     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
    730         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
    731                 + result);
    732 
    733     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
    734     if (result != "French (France)")
    735         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
    736                 + result);
    737 }
    738 /**
    739  * Test End Behaviour
    740  * @bug 4068137
    741  */
    742 void RBBITest::TestEndBehaviour()
    743 {
    744     UErrorCode status = U_ZERO_ERROR;
    745     UnicodeString testString("boo.");
    746     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
    747     if (U_FAILURE(status))
    748     {
    749         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
    750         return;
    751     }
    752     wb->setText(testString);
    753 
    754     if (wb->first() != 0)
    755         errln("Didn't get break at beginning of string.");
    756     if (wb->next() != 3)
    757         errln("Didn't get break before period in \"boo.\"");
    758     if (wb->current() != 4 && wb->next() != 4)
    759         errln("Didn't get break at end of string.");
    760     delete wb;
    761 }
    762 /*
    763  * @bug 4153072
    764  */
    765 void RBBITest::TestBug4153072() {
    766     UErrorCode status = U_ZERO_ERROR;
    767     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
    768     if (U_FAILURE(status))
    769     {
    770         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
    771         return;
    772     }
    773     UnicodeString str("...Hello, World!...");
    774     int32_t begin = 3;
    775     int32_t end = str.length() - 3;
    776     UBool onBoundary;
    777 
    778     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    779     iter->adoptText(textIterator);
    780     int index;
    781     // Note: with the switch to UText, there is no way to restrict the
    782     //       iteration range to begin at an index other than zero.
    783     //       String character iterators created with a non-zero bound are
    784     //         treated by RBBI as being empty.
    785     for (index = -1; index < begin + 1; ++index) {
    786         onBoundary = iter->isBoundary(index);
    787         if (index == 0?  !onBoundary : onBoundary) {
    788             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
    789                             " and begin index = " + begin);
    790         }
    791     }
    792     delete iter;
    793 }
    794 
    795 
    796 //
    797 // Test for problem reported by Ashok Matoria on 9 July 2007
    798 //    One.<kSoftHyphen><kSpace>Two.
    799 //
    800 //    Sentence break at start (0) and then on calling next() it breaks at
    801 //   'T' of "Two". Now, at this point if I do next() and
    802 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
    803 //
    804 void RBBITest::TestBug5775() {
    805     UErrorCode status = U_ZERO_ERROR;
    806     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
    807     TEST_ASSERT_SUCCESS(status);
    808     if (U_FAILURE(status)) {
    809         return;
    810     }
    811 // Check for status first for better handling of no data errors.
    812     TEST_ASSERT(bi != NULL);
    813     if (bi == NULL) {
    814         return;
    815     }
    816 
    817     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
    818     //               01234      56789
    819     s = s.unescape();
    820     bi->setText(s);
    821     int pos = bi->next();
    822     TEST_ASSERT(pos == 6);
    823     pos = bi->next();
    824     TEST_ASSERT(pos == 10);
    825     pos = bi->previous();
    826     TEST_ASSERT(pos == 6);
    827     delete bi;
    828 }
    829 
    830 
    831 
    832 //------------------------------------------------------------------------------
    833 //
    834 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
    835 //
    836 //------------------------------------------------------------------------------
    837 
    838 struct TestParams {
    839     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
    840                                            //   Changed out whenever test data changes break type.
    841 
    842     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
    843     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
    844     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
    845     UVector32       *srcCol;
    846 
    847     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
    848     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
    849     CharString       utf8String;           // UTF-8 form of text to break.
    850 
    851     TestParams(UErrorCode &status) : dataToBreak() {
    852         bi               = NULL;
    853         expectedBreaks   = new UVector32(status);
    854         srcLine          = new UVector32(status);
    855         srcCol           = new UVector32(status);
    856         textToBreak      = NULL;
    857         textMap          = new UVector32(status);
    858     }
    859 
    860     ~TestParams() {
    861         delete bi;
    862         delete expectedBreaks;
    863         delete srcLine;
    864         delete srcCol;
    865         utext_close(textToBreak);
    866         delete textMap;
    867     }
    868 
    869     int32_t getSrcLine(int32_t bp);
    870     int32_t getExpectedBreak(int32_t bp);
    871     int32_t getSrcCol(int32_t bp);
    872 
    873     void setUTF16(UErrorCode &status);
    874     void setUTF8(UErrorCode &status);
    875 };
    876 
    877 // Append a UnicodeString to a CharString with UTF-8 encoding.
    878 // Substitute any invalid chars.
    879 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
    880 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
    881     if (U_FAILURE(status)) {
    882         return;
    883     }
    884     int32_t utf8Length;
    885     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
    886                        src.getBuffer(), src.length(),   // UTF-16 data
    887                        0xfffd, NULL,                    // Substitution char, number of subs.
    888                        &status);
    889     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    890         return;
    891     }
    892     status = U_ZERO_ERROR;
    893     int32_t capacity;
    894     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
    895     u_strToUTF8WithSub(buffer, utf8Length, NULL,
    896                        src.getBuffer(), src.length(),
    897                        0xfffd, NULL, &status);
    898     dest.append(buffer, utf8Length, status);
    899 }
    900 
    901 
    902 void TestParams::setUTF16(UErrorCode &status) {
    903     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
    904     textMap->removeAllElements();
    905     for (int32_t i=0; i<dataToBreak.length(); i++) {
    906         if (i == dataToBreak.getChar32Start(i)) {
    907             textMap->addElement(i, status);
    908         } else {
    909             textMap->addElement(-1, status);
    910         }
    911     }
    912     textMap->addElement(dataToBreak.length(), status);
    913     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
    914 }
    915 
    916 
    917 void TestParams::setUTF8(UErrorCode &status) {
    918     if (U_FAILURE(status)) {
    919         return;
    920     }
    921     utf8String.clear();
    922     CharStringAppend(utf8String, dataToBreak, status);
    923     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
    924     if (U_FAILURE(status)) {
    925         return;
    926     }
    927 
    928     textMap->removeAllElements();
    929     int32_t utf16Index = 0;
    930     for (;;) {
    931         textMap->addElement(utf16Index, status);
    932         UChar32 c32 = utext_current32(textToBreak);
    933         if (c32 < 0) {
    934             break;
    935         }
    936         utf16Index += U16_LENGTH(c32);
    937         utext_next32(textToBreak);
    938         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
    939             textMap->addElement(-1, status);
    940         }
    941     }
    942     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
    943 }
    944 
    945 
    946 int32_t TestParams::getSrcLine(int bp) {
    947     if (bp >= textMap->size()) {
    948         bp = textMap->size() - 1;
    949     }
    950     int32_t i = 0;
    951     for(; bp >= 0 ; --bp) {
    952         // Move to a character boundary if we are not on one already.
    953         i = textMap->elementAti(bp);
    954         if (i >= 0) {
    955             break;
    956         }
    957     }
    958     return srcLine->elementAti(i);
    959 }
    960 
    961 
    962 int32_t TestParams::getExpectedBreak(int bp) {
    963     if (bp >= textMap->size()) {
    964         return 0;
    965     }
    966     int32_t i = textMap->elementAti(bp);
    967     int32_t retVal = 0;
    968     if (i >= 0) {
    969         retVal = expectedBreaks->elementAti(i);
    970     }
    971     return retVal;
    972 }
    973 
    974 
    975 int32_t TestParams::getSrcCol(int bp) {
    976     if (bp >= textMap->size()) {
    977         bp = textMap->size() - 1;
    978     }
    979     int32_t i = 0;
    980     for(; bp >= 0; --bp) {
    981         // Move bp to a character boundary if we are not on one already.
    982         i = textMap->elementAti(bp);
    983         if (i >= 0) {
    984             break;
    985         }
    986     }
    987     return srcCol->elementAti(i);
    988 }
    989 
    990 
    991 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    992     int32_t    bp;
    993     int32_t    prevBP;
    994     int32_t    i;
    995 
    996     TEST_ASSERT_SUCCESS(status);
    997     if (U_FAILURE(status)) {
    998         return;
    999     }
   1000 
   1001     if (t->bi == NULL) {
   1002         return;
   1003     }
   1004 
   1005     t->bi->setText(t->textToBreak, status);
   1006     //
   1007     //  Run the iterator forward
   1008     //
   1009     prevBP = -1;
   1010     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1011         if (prevBP ==  bp) {
   1012             // Fail for lack of forward progress.
   1013             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1014                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1015             break;
   1016         }
   1017 
   1018         // Check that there we didn't miss an expected break between the last one
   1019         //  and this one.
   1020         for (i=prevBP+1; i<bp; i++) {
   1021             if (t->getExpectedBreak(i) != 0) {
   1022                 int expected[] = {0, i};
   1023                 printStringBreaks(t->dataToBreak, expected, 2);
   1024                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1025                       i, t->getSrcLine(i), t->getSrcCol(i));
   1026             }
   1027         }
   1028 
   1029         // Check that the break we did find was expected
   1030         if (t->getExpectedBreak(bp) == 0) {
   1031             int expected[] = {0, bp};
   1032             printStringBreaks(t->textToBreak, expected, 2);
   1033             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1034                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1035         } else {
   1036             // The break was expected.
   1037             //   Check that the {nnn} tag value is correct.
   1038             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1039             if (expectedTagVal == -1) {
   1040                 expectedTagVal = 0;
   1041             }
   1042             int32_t line = t->getSrcLine(bp);
   1043             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1044             if (rs != expectedTagVal) {
   1045                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1046                       "          Actual, Expected status = %4d, %4d",
   1047                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1048             }
   1049         }
   1050 
   1051         prevBP = bp;
   1052     }
   1053 
   1054     // Verify that there were no missed expected breaks after the last one found
   1055     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
   1056         if (t->getExpectedBreak(i) != 0) {
   1057             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1058                       i, t->getSrcLine(i), t->getSrcCol(i));
   1059         }
   1060     }
   1061 
   1062     //
   1063     //  Run the iterator backwards, verify that the same breaks are found.
   1064     //
   1065     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
   1066     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1067         if (prevBP ==  bp) {
   1068             // Fail for lack of progress.
   1069             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1070                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1071             break;
   1072         }
   1073 
   1074         // Check that we didn't miss an expected break between the last one
   1075         //  and this one.  (UVector returns zeros for index out of bounds.)
   1076         for (i=prevBP-1; i>bp; i--) {
   1077             if (t->getExpectedBreak(i) != 0) {
   1078                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1079                       i, t->getSrcLine(i), t->getSrcCol(i));
   1080             }
   1081         }
   1082 
   1083         // Check that the break we did find was expected
   1084         if (t->getExpectedBreak(bp) == 0) {
   1085             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1086                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
   1087         } else {
   1088             // The break was expected.
   1089             //   Check that the {nnn} tag value is correct.
   1090             int32_t expectedTagVal = t->getExpectedBreak(bp);
   1091             if (expectedTagVal == -1) {
   1092                 expectedTagVal = 0;
   1093             }
   1094             int line = t->getSrcLine(bp);
   1095             int32_t rs = t->bi->getRuleStatus();
   1096             if (rs != expectedTagVal) {
   1097                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1098                       "          Actual, Expected status = %4d, %4d",
   1099                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
   1100             }
   1101         }
   1102 
   1103         prevBP = bp;
   1104     }
   1105 
   1106     // Verify that there were no missed breaks prior to the last one found
   1107     for (i=prevBP-1; i>=0; i--) {
   1108         if (t->getExpectedBreak(i) != 0) {
   1109             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1110                       i, t->getSrcLine(i), t->getSrcCol(i));
   1111         }
   1112     }
   1113 
   1114     // Check isBoundary()
   1115     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1116         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
   1117         UBool boundaryFound    = t->bi->isBoundary(i);
   1118         if (boundaryExpected != boundaryFound) {
   1119             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
   1120                   "        Expected, Actual= %s, %s",
   1121                   i, t->getSrcLine(i), t->getSrcCol(i),
   1122                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
   1123         }
   1124     }
   1125 
   1126     // Check following()
   1127     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
   1128         int32_t actualBreak = t->bi->following(i);
   1129         int32_t expectedBreak = BreakIterator::DONE;
   1130         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
   1131             if (t->getExpectedBreak(j) != 0) {
   1132                 expectedBreak = j;
   1133                 break;
   1134             }
   1135         }
   1136         if (expectedBreak != actualBreak) {
   1137             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
   1138                   "        Expected, Actual= %d, %d",
   1139                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1140         }
   1141     }
   1142 
   1143     // Check preceding()
   1144     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
   1145         int32_t actualBreak = t->bi->preceding(i);
   1146         int32_t expectedBreak = BreakIterator::DONE;
   1147 
   1148         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
   1149         // preceding(trailing byte) will return the index of some preceding code point,
   1150         // not the lead byte of the current code point, even though that has a smaller index.
   1151         // Therefore, start looking at the expected break data not at i-1, but at
   1152         // the start of code point index - 1.
   1153         utext_setNativeIndex(t->textToBreak, i);
   1154         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
   1155         for (; j >= 0; j--) {
   1156             if (t->getExpectedBreak(j) != 0) {
   1157                 expectedBreak = j;
   1158                 break;
   1159             }
   1160         }
   1161         if (expectedBreak != actualBreak) {
   1162             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
   1163                   "        Expected, Actual= %d, %d",
   1164                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
   1165         }
   1166     }
   1167 }
   1168 
   1169 
   1170 void RBBITest::TestExtended() {
   1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1172     UErrorCode      status  = U_ZERO_ERROR;
   1173     Locale          locale("");
   1174 
   1175     UnicodeString       rules;
   1176     TestParams          tp(status);
   1177 
   1178     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
   1179     if (U_FAILURE(status)) {
   1180         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1181     }
   1182 
   1183 
   1184     //
   1185     //  Open and read the test data file.
   1186     //
   1187     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1188     char testFileName[1000];
   1189     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1190         errln("Can't open test data.  Path too long.");
   1191         return;
   1192     }
   1193     strcpy(testFileName, testDataDirectory);
   1194     strcat(testFileName, "rbbitst.txt");
   1195 
   1196     int    len;
   1197     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1198     if (U_FAILURE(status)) {
   1199         return; /* something went wrong, error already output */
   1200     }
   1201 
   1202 
   1203 
   1204 
   1205     //
   1206     //  Put the test data into a UnicodeString
   1207     //
   1208     UnicodeString testString(FALSE, testFile, len);
   1209 
   1210     enum EParseState{
   1211         PARSE_COMMENT,
   1212         PARSE_TAG,
   1213         PARSE_DATA,
   1214         PARSE_NUM
   1215     }
   1216     parseState = PARSE_TAG;
   1217 
   1218     EParseState savedState = PARSE_TAG;
   1219 
   1220     static const UChar CH_LF        = 0x0a;
   1221     static const UChar CH_CR        = 0x0d;
   1222     static const UChar CH_HASH      = 0x23;
   1223     /*static const UChar CH_PERIOD    = 0x2e;*/
   1224     static const UChar CH_LT        = 0x3c;
   1225     static const UChar CH_GT        = 0x3e;
   1226     static const UChar CH_BACKSLASH = 0x5c;
   1227     static const UChar CH_BULLET    = 0x2022;
   1228 
   1229     int32_t    lineNum  = 1;
   1230     int32_t    colStart = 0;
   1231     int32_t    column   = 0;
   1232     int32_t    charIdx  = 0;
   1233 
   1234     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1235 
   1236     for (charIdx = 0; charIdx < len; ) {
   1237         status = U_ZERO_ERROR;
   1238         UChar  c = testString.charAt(charIdx);
   1239         charIdx++;
   1240         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1241             // treat CRLF as a unit
   1242             c = CH_LF;
   1243             charIdx++;
   1244         }
   1245         if (c == CH_LF || c == CH_CR) {
   1246             lineNum++;
   1247             colStart = charIdx;
   1248         }
   1249         column = charIdx - colStart + 1;
   1250 
   1251         switch (parseState) {
   1252         case PARSE_COMMENT:
   1253             if (c == 0x0a || c == 0x0d) {
   1254                 parseState = savedState;
   1255             }
   1256             break;
   1257 
   1258         case PARSE_TAG:
   1259             {
   1260             if (c == CH_HASH) {
   1261                 parseState = PARSE_COMMENT;
   1262                 savedState = PARSE_TAG;
   1263                 break;
   1264             }
   1265             if (u_isUWhiteSpace(c)) {
   1266                 break;
   1267             }
   1268             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1269                 delete tp.bi;
   1270                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1271                 charIdx += 5;
   1272                 break;
   1273             }
   1274             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1275                 delete tp.bi;
   1276                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1277                 charIdx += 5;
   1278                 break;
   1279             }
   1280             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1281                 delete tp.bi;
   1282                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1283                 charIdx += 5;
   1284                 break;
   1285             }
   1286             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1287                 delete tp.bi;
   1288                 tp.bi = NULL;
   1289                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1290                 charIdx += 5;
   1291                 break;
   1292             }
   1293             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1294                 delete tp.bi;
   1295                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1296                 charIdx += 6;
   1297                 break;
   1298             }
   1299 
   1300             // <locale  loc_name>
   1301             localeMatcher.reset(testString);
   1302             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1303                 UnicodeString localeName = localeMatcher.group(1, status);
   1304                 char localeName8[100];
   1305                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1306                 locale = Locale::createFromName(localeName8);
   1307                 charIdx += localeMatcher.group(0, status).length() - 1;
   1308                 TEST_ASSERT_SUCCESS(status);
   1309                 break;
   1310             }
   1311             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1312                 parseState = PARSE_DATA;
   1313                 charIdx += 5;
   1314                 tp.dataToBreak = "";
   1315                 tp.expectedBreaks->removeAllElements();
   1316                 tp.srcCol ->removeAllElements();
   1317                 tp.srcLine->removeAllElements();
   1318                 break;
   1319             }
   1320 
   1321             errln("line %d: Tag expected in test file.", lineNum);
   1322             parseState = PARSE_COMMENT;
   1323             savedState = PARSE_DATA;
   1324             goto end_test; // Stop the test.
   1325             }
   1326             break;
   1327 
   1328         case PARSE_DATA:
   1329             if (c == CH_BULLET) {
   1330                 int32_t  breakIdx = tp.dataToBreak.length();
   1331                 tp.expectedBreaks->setSize(breakIdx+1);
   1332                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1333                 tp.srcLine->setSize(breakIdx+1);
   1334                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1335                 tp.srcCol ->setSize(breakIdx+1);
   1336                 tp.srcCol ->setElementAt(column, breakIdx);
   1337                 break;
   1338             }
   1339 
   1340             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1341                 // Add final entry to mappings from break location to source file position.
   1342                 //  Need one extra because last break position returned is after the
   1343                 //    last char in the data, not at the last char.
   1344                 tp.srcLine->addElement(lineNum, status);
   1345                 tp.srcCol ->addElement(column, status);
   1346 
   1347                 parseState = PARSE_TAG;
   1348                 charIdx += 6;
   1349 
   1350                 // RUN THE TEST!
   1351                 status = U_ZERO_ERROR;
   1352                 tp.setUTF16(status);
   1353                 executeTest(&tp, status);
   1354                 TEST_ASSERT_SUCCESS(status);
   1355 
   1356                 // Run again, this time with UTF-8 text wrapped in a UText.
   1357                 status = U_ZERO_ERROR;
   1358                 tp.setUTF8(status);
   1359                 TEST_ASSERT_SUCCESS(status);
   1360                 executeTest(&tp, status);
   1361                 break;
   1362             }
   1363 
   1364             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1365                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1366                 // Get the code point from the name and insert it into the test data.
   1367                 //   (Damn, no API takes names in Unicode  !!!
   1368                 //    we've got to take it back to char *)
   1369                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1370                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1371                 char charNameBuf[200];
   1372                 UChar32 theChar = -1;
   1373                 if (nameEndIdx != -1) {
   1374                     UErrorCode status = U_ZERO_ERROR;
   1375                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1376                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1377                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1378                     if (U_FAILURE(status)) {
   1379                         theChar = -1;
   1380                     }
   1381                 }
   1382                 if (theChar == -1) {
   1383                     errln("Error in named character in test file at line %d, col %d",
   1384                         lineNum, column);
   1385                 } else {
   1386                     // Named code point was recognized.  Insert it
   1387                     //   into the test data.
   1388                     tp.dataToBreak.append(theChar);
   1389                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1390                         tp.srcLine->addElement(lineNum, status);
   1391                         tp.srcCol ->addElement(column, status);
   1392                     }
   1393                 }
   1394                 if (nameEndIdx > charIdx) {
   1395                     charIdx = nameEndIdx+1;
   1396 
   1397                 }
   1398                 break;
   1399             }
   1400 
   1401 
   1402 
   1403 
   1404             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1405                 charIdx++;
   1406                 int32_t  breakIdx = tp.dataToBreak.length();
   1407                 tp.expectedBreaks->setSize(breakIdx+1);
   1408                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1409                 tp.srcLine->setSize(breakIdx+1);
   1410                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1411                 tp.srcCol ->setSize(breakIdx+1);
   1412                 tp.srcCol ->setElementAt(column, breakIdx);
   1413                 break;
   1414             }
   1415 
   1416             if (c == CH_LT) {
   1417                 tagValue   = 0;
   1418                 parseState = PARSE_NUM;
   1419                 break;
   1420             }
   1421 
   1422             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1423                 parseState = PARSE_COMMENT;
   1424                 savedState = PARSE_DATA;
   1425                 break;
   1426             }
   1427 
   1428             if (c == CH_BACKSLASH) {
   1429                 // Check for \ at end of line, a line continuation.
   1430                 //     Advance over (discard) the newline
   1431                 UChar32 cp = testString.char32At(charIdx);
   1432                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1433                     // We have a CR LF
   1434                     //  Need an extra increment of the input ptr to move over both of them
   1435                     charIdx++;
   1436                 }
   1437                 if (cp == CH_LF || cp == CH_CR) {
   1438                     lineNum++;
   1439                     colStart = charIdx;
   1440                     charIdx++;
   1441                     break;
   1442                 }
   1443 
   1444                 // Let unescape handle the back slash.
   1445                 cp = testString.unescapeAt(charIdx);
   1446                 if (cp != -1) {
   1447                     // Escape sequence was recognized.  Insert the char
   1448                     //   into the test data.
   1449                     tp.dataToBreak.append(cp);
   1450                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1451                         tp.srcLine->addElement(lineNum, status);
   1452                         tp.srcCol ->addElement(column, status);
   1453                     }
   1454                     break;
   1455                 }
   1456 
   1457 
   1458                 // Not a recognized backslash escape sequence.
   1459                 // Take the next char as a literal.
   1460                 //  TODO:  Should this be an error?
   1461                 c = testString.charAt(charIdx);
   1462                 charIdx = testString.moveIndex32(charIdx, 1);
   1463             }
   1464 
   1465             // Normal, non-escaped data char.
   1466             tp.dataToBreak.append(c);
   1467 
   1468             // Save the mapping from offset in the data to line/column numbers in
   1469             //   the original input file.  Will be used for better error messages only.
   1470             //   If there's an expected break before this char, the slot in the mapping
   1471             //     vector will already be set for this char; don't overwrite it.
   1472             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1473                 tp.srcLine->addElement(lineNum, status);
   1474                 tp.srcCol ->addElement(column, status);
   1475             }
   1476             break;
   1477 
   1478 
   1479         case PARSE_NUM:
   1480             // We are parsing an expected numeric tag value, like <1234>,
   1481             //   within a chunk of data.
   1482             if (u_isUWhiteSpace(c)) {
   1483                 break;
   1484             }
   1485 
   1486             if (c == CH_GT) {
   1487                 // Finished the number.  Add the info to the expected break data,
   1488                 //   and switch parse state back to doing plain data.
   1489                 parseState = PARSE_DATA;
   1490                 if (tagValue == 0) {
   1491                     tagValue = -1;
   1492                 }
   1493                 int32_t  breakIdx = tp.dataToBreak.length();
   1494                 tp.expectedBreaks->setSize(breakIdx+1);
   1495                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1496                 tp.srcLine->setSize(breakIdx+1);
   1497                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1498                 tp.srcCol ->setSize(breakIdx+1);
   1499                 tp.srcCol ->setElementAt(column, breakIdx);
   1500                 break;
   1501             }
   1502 
   1503             if (u_isdigit(c)) {
   1504                 tagValue = tagValue*10 + u_charDigitValue(c);
   1505                 break;
   1506             }
   1507 
   1508             errln("Syntax Error in test file at line %d, col %d",
   1509                 lineNum, column);
   1510             parseState = PARSE_COMMENT;
   1511             goto end_test; // Stop the test
   1512             break;
   1513         }
   1514 
   1515 
   1516         if (U_FAILURE(status)) {
   1517             dataerrln("ICU Error %s while parsing test file at line %d.",
   1518                 u_errorName(status), lineNum);
   1519             status = U_ZERO_ERROR;
   1520             goto end_test; // Stop the test
   1521         }
   1522 
   1523     }
   1524 
   1525 end_test:
   1526     delete [] testFile;
   1527 #endif
   1528 }
   1529 
   1530 
   1531 //-------------------------------------------------------------------------------
   1532 //
   1533 //  TestDictRules   create a break iterator from source rules that includes a
   1534 //                  dictionary range.   Regression for bug #7130.  Source rules
   1535 //                  do not declare a break iterator type (word, line, sentence, etc.),
   1536 //                  but the dictionary code, without a type, would loop.
   1537 //
   1538 //-------------------------------------------------------------------------------
   1539 void RBBITest::TestDictRules() {
   1540     const char *rules =  "$dictionary = [a-z]; \n"
   1541                          "!!forward; \n"
   1542                          "$dictionary $dictionary; \n"
   1543                          "!!reverse; \n"
   1544                          "$dictionary $dictionary; \n";
   1545     const char *text = "aa";
   1546     UErrorCode status = U_ZERO_ERROR;
   1547     UParseError parseError;
   1548 
   1549     RuleBasedBreakIterator bi(rules, parseError, status);
   1550     if (U_SUCCESS(status)) {
   1551         UnicodeString utext = text;
   1552         bi.setText(utext);
   1553         int32_t position;
   1554         int32_t loops;
   1555         for (loops = 0; loops<10; loops++) {
   1556             position = bi.next();
   1557             if (position == RuleBasedBreakIterator::DONE) {
   1558                 break;
   1559             }
   1560         }
   1561         TEST_ASSERT(loops == 1);
   1562     } else {
   1563         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1564     }
   1565 }
   1566 
   1567 
   1568 
   1569 //-------------------------------------------------------------------------------
   1570 //
   1571 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1572 //    return the data in one big UChar * buffer, which the caller must delete.
   1573 //
   1574 //    parameters:
   1575 //          fileName:   the name of the file, with no directory part.  The test data directory
   1576 //                      is assumed.
   1577 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1578 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1579 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1580 //                      Pass NULL for the system default encoding.
   1581 //          status
   1582 //    returns:
   1583 //                      The file data, converted to UChar.
   1584 //                      The caller must delete this when done with
   1585 //                           delete [] theBuffer;
   1586 //
   1587 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1588 //           Move this function to some common place.
   1589 //
   1590 //--------------------------------------------------------------------------------
   1591 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   1592     UChar       *retPtr  = NULL;
   1593     char        *fileBuf = NULL;
   1594     UConverter* conv     = NULL;
   1595     FILE        *f       = NULL;
   1596 
   1597     ulen = 0;
   1598     if (U_FAILURE(status)) {
   1599         return retPtr;
   1600     }
   1601 
   1602     //
   1603     //  Open the file.
   1604     //
   1605     f = fopen(fileName, "rb");
   1606     if (f == 0) {
   1607         dataerrln("Error opening test data file %s\n", fileName);
   1608         status = U_FILE_ACCESS_ERROR;
   1609         return NULL;
   1610     }
   1611     //
   1612     //  Read it in
   1613     //
   1614     int   fileSize;
   1615     int   amt_read;
   1616 
   1617     fseek( f, 0, SEEK_END);
   1618     fileSize = ftell(f);
   1619     fileBuf = new char[fileSize];
   1620     fseek(f, 0, SEEK_SET);
   1621     amt_read = fread(fileBuf, 1, fileSize, f);
   1622     if (amt_read != fileSize || fileSize <= 0) {
   1623         errln("Error reading test data file.");
   1624         goto cleanUpAndReturn;
   1625     }
   1626 
   1627     //
   1628     // Look for a Unicode Signature (BOM) on the data just read
   1629     //
   1630     int32_t        signatureLength;
   1631     const char *   fileBufC;
   1632     const char*    bomEncoding;
   1633 
   1634     fileBufC = fileBuf;
   1635     bomEncoding = ucnv_detectUnicodeSignature(
   1636         fileBuf, fileSize, &signatureLength, &status);
   1637     if(bomEncoding!=NULL ){
   1638         fileBufC  += signatureLength;
   1639         fileSize  -= signatureLength;
   1640         encoding = bomEncoding;
   1641     }
   1642 
   1643     //
   1644     // Open a converter to take the rule file to UTF-16
   1645     //
   1646     conv = ucnv_open(encoding, &status);
   1647     if (U_FAILURE(status)) {
   1648         goto cleanUpAndReturn;
   1649     }
   1650 
   1651     //
   1652     // Convert the rules to UChar.
   1653     //  Preflight first to determine required buffer size.
   1654     //
   1655     ulen = ucnv_toUChars(conv,
   1656         NULL,           //  dest,
   1657         0,              //  destCapacity,
   1658         fileBufC,
   1659         fileSize,
   1660         &status);
   1661     if (status == U_BUFFER_OVERFLOW_ERROR) {
   1662         // Buffer Overflow is expected from the preflight operation.
   1663         status = U_ZERO_ERROR;
   1664 
   1665         retPtr = new UChar[ulen+1];
   1666         ucnv_toUChars(conv,
   1667             retPtr,       //  dest,
   1668             ulen+1,
   1669             fileBufC,
   1670             fileSize,
   1671             &status);
   1672     }
   1673 
   1674 cleanUpAndReturn:
   1675     fclose(f);
   1676     delete []fileBuf;
   1677     ucnv_close(conv);
   1678     if (U_FAILURE(status)) {
   1679         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1680         delete []retPtr;
   1681         retPtr = 0;
   1682         ulen   = 0;
   1683     };
   1684     return retPtr;
   1685 }
   1686 
   1687 
   1688 
   1689 //--------------------------------------------------------------------------------------------
   1690 //
   1691 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   1692 //
   1693 //-------------------------------------------------------------------------------------------
   1694 void RBBITest::TestUnicodeFiles() {
   1695     RuleBasedBreakIterator  *bi;
   1696     UErrorCode               status = U_ZERO_ERROR;
   1697 
   1698     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   1699     TEST_ASSERT_SUCCESS(status);
   1700     if (U_SUCCESS(status)) {
   1701         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   1702     }
   1703     delete bi;
   1704 
   1705     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1706     TEST_ASSERT_SUCCESS(status);
   1707     if (U_SUCCESS(status)) {
   1708         runUnicodeTestData("WordBreakTest.txt", bi);
   1709     }
   1710     delete bi;
   1711 
   1712     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1713     TEST_ASSERT_SUCCESS(status);
   1714     if (U_SUCCESS(status)) {
   1715         runUnicodeTestData("SentenceBreakTest.txt", bi);
   1716     }
   1717     delete bi;
   1718 
   1719     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   1720     TEST_ASSERT_SUCCESS(status);
   1721     if (U_SUCCESS(status)) {
   1722         runUnicodeTestData("LineBreakTest.txt", bi);
   1723     }
   1724     delete bi;
   1725 }
   1726 
   1727 
   1728 // Check for test cases from the Unicode test data files that are known to fail
   1729 // and should be skipped because ICU is not yet able to fully implement the spec.
   1730 // See ticket #7270.
   1731 
   1732 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
   1733     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
   1734         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
   1735         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
   1736         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
   1737         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
   1738         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
   1739         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
   1740     };
   1741     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
   1742         return FALSE;
   1743     }
   1744 
   1745     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
   1746         if (testCase == UnicodeString(badTestCases[i])) {
   1747             return logKnownIssue("7270");
   1748         }
   1749     }
   1750     return FALSE;
   1751 }
   1752 
   1753 
   1754 //--------------------------------------------------------------------------------------------
   1755 //
   1756 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   1757 //
   1758 //-------------------------------------------------------------------------------------------
   1759 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1761     UErrorCode  status = U_ZERO_ERROR;
   1762 
   1763     //
   1764     //  Open and read the test data file, put it into a UnicodeString.
   1765     //
   1766     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1767     char testFileName[1000];
   1768     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1769         dataerrln("Can't open test data.  Path too long.");
   1770         return;
   1771     }
   1772     strcpy(testFileName, testDataDirectory);
   1773     strcat(testFileName, fileName);
   1774 
   1775     logln("Opening data file %s\n", fileName);
   1776 
   1777     int    len;
   1778     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1779     if (status != U_FILE_ACCESS_ERROR) {
   1780         TEST_ASSERT_SUCCESS(status);
   1781         TEST_ASSERT(testFile != NULL);
   1782     }
   1783     if (U_FAILURE(status) || testFile == NULL) {
   1784         return; /* something went wrong, error already output */
   1785     }
   1786     UnicodeString testFileAsString(TRUE, testFile, len);
   1787 
   1788     //
   1789     //  Parse the test data file using a regular expression.
   1790     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   1791     //     is identified by which group had a match.
   1792     //
   1793     //    Caputure Group #                  1          2            3            4           5
   1794     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   1795     //
   1796     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   1797     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   1798     UnicodeString   testString;
   1799     UVector32       breakPositions(status);
   1800     int             lineNumber = 1;
   1801     TEST_ASSERT_SUCCESS(status);
   1802     if (U_FAILURE(status)) {
   1803         return;
   1804     }
   1805 
   1806     //
   1807     //  Scan through each test case, building up the string to be broken in testString,
   1808     //   and the positions that should be boundaries in the breakPositions vector.
   1809     //
   1810     int spin = 0;
   1811     while (tokenMatcher.find()) {
   1812       	if(tokenMatcher.hitEnd()) {
   1813           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   1814              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   1815              and caused an infinite loop here on EBCDIC systems!
   1816           */
   1817           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   1818           //	   return;
   1819       	}
   1820         if (tokenMatcher.start(1, status) >= 0) {
   1821             // Scanned a divide sign, indicating a break position in the test data.
   1822             if (testString.length()>0) {
   1823                 breakPositions.addElement(testString.length(), status);
   1824             }
   1825         }
   1826         else if (tokenMatcher.start(2, status) >= 0) {
   1827             // Scanned an 'x', meaning no break at this position in the test data
   1828             //   Nothing to be done here.
   1829             }
   1830         else if (tokenMatcher.start(3, status) >= 0) {
   1831             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   1832             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   1833             int length = hexNumber.length();
   1834             if (length<=8) {
   1835                 char buf[10];
   1836                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   1837                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   1838                 if (c<=0x10ffff) {
   1839                     testString.append(c);
   1840                 } else {
   1841                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   1842                        fileName, lineNumber);
   1843                 }
   1844             } else {
   1845                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   1846                        fileName, lineNumber);
   1847              }
   1848         }
   1849         else if (tokenMatcher.start(4, status) >= 0) {
   1850             // Scanned to end of a line, possibly skipping over a comment in the process.
   1851             //   If the line from the file contained test data, run the test now.
   1852             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
   1853                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   1854             }
   1855 
   1856             // Clear out this test case.
   1857             //    The string and breakPositions vector will be refilled as the next
   1858             //       test case is parsed.
   1859             testString.remove();
   1860             breakPositions.removeAllElements();
   1861             lineNumber++;
   1862         } else {
   1863             // Scanner catchall.  Something unrecognized appeared on the line.
   1864             char token[16];
   1865             UnicodeString uToken = tokenMatcher.group(0, status);
   1866             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   1867             token[sizeof(token)-1] = 0;
   1868             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   1869 
   1870             // Clean up, in preparation for continuing with the next line.
   1871             testString.remove();
   1872             breakPositions.removeAllElements();
   1873             lineNumber++;
   1874         }
   1875         TEST_ASSERT_SUCCESS(status);
   1876         if (U_FAILURE(status)) {
   1877             break;
   1878         }
   1879     }
   1880 
   1881     delete [] testFile;
   1882  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1883 }
   1884 
   1885 //--------------------------------------------------------------------------------------------
   1886 //
   1887 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   1888 //                            test data files.  Do only a simple, forward-only check -
   1889 //                            this test is mostly to check that ICU and the Unicode
   1890 //                            data agree with each other.
   1891 //
   1892 //--------------------------------------------------------------------------------------------
   1893 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   1894                          const UnicodeString &testString,   // Text data to be broken
   1895                          UVector32 *breakPositions,         // Positions where breaks should be found.
   1896                          RuleBasedBreakIterator *bi) {
   1897     int32_t pos;                 // Break Position in the test string
   1898     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   1899     int32_t expectedPos;         // Expected break position (index into test string)
   1900 
   1901     bi->setText(testString);
   1902     pos = bi->first();
   1903     pos = bi->next();
   1904 
   1905     while (pos != BreakIterator::DONE) {
   1906         if (expectedI >= breakPositions->size()) {
   1907             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1908                 testFileName, lineNumber, pos);
   1909             break;
   1910         }
   1911         expectedPos = breakPositions->elementAti(expectedI);
   1912         if (pos < expectedPos) {
   1913             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   1914                 testFileName, lineNumber, pos);
   1915             break;
   1916         }
   1917         if (pos > expectedPos) {
   1918             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1919                 testFileName, lineNumber, expectedPos);
   1920             break;
   1921         }
   1922         pos = bi->next();
   1923         expectedI++;
   1924     }
   1925 
   1926     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   1927         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   1928             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   1929     }
   1930 }
   1931 
   1932 
   1933 
   1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1935 //---------------------------------------------------------------------------------------
   1936 //
   1937 //   class RBBIMonkeyKind
   1938 //
   1939 //      Monkey Test for Break Iteration
   1940 //      Abstract interface class.   Concrete derived classes independently
   1941 //      implement the break rules for different iterator types.
   1942 //
   1943 //      The Monkey Test itself doesn't know which type of break iterator it is
   1944 //      testing, but works purely in terms of the interface defined here.
   1945 //
   1946 //---------------------------------------------------------------------------------------
   1947 class RBBIMonkeyKind {
   1948 public:
   1949     // Return a UVector of UnicodeSets, representing the character classes used
   1950     //   for this type of iterator.
   1951     virtual  UVector  *charClasses() = 0;
   1952 
   1953     // Set the test text on which subsequent calls to next() will operate
   1954     virtual  void      setText(const UnicodeString &s) = 0;
   1955 
   1956     // Find the next break postion, starting from the prev break position, or from zero.
   1957     // Return -1 after reaching end of string.
   1958     virtual  int32_t   next(int32_t i) = 0;
   1959 
   1960     virtual ~RBBIMonkeyKind();
   1961     UErrorCode       deferredStatus;
   1962 
   1963 
   1964 protected:
   1965     RBBIMonkeyKind();
   1966 
   1967 private:
   1968 };
   1969 
   1970 RBBIMonkeyKind::RBBIMonkeyKind() {
   1971     deferredStatus = U_ZERO_ERROR;
   1972 }
   1973 
   1974 RBBIMonkeyKind::~RBBIMonkeyKind() {
   1975 }
   1976 
   1977 
   1978 //----------------------------------------------------------------------------------------
   1979 //
   1980 //   Random Numbers.  Similar to standard lib rand() and srand()
   1981 //                    Not using library to
   1982 //                      1.  Get same results on all platforms.
   1983 //                      2.  Get access to current seed, to more easily reproduce failures.
   1984 //
   1985 //---------------------------------------------------------------------------------------
   1986 static uint32_t m_seed = 1;
   1987 
   1988 static uint32_t m_rand()
   1989 {
   1990     m_seed = m_seed * 1103515245 + 12345;
   1991     return (uint32_t)(m_seed/65536) % 32768;
   1992 }
   1993 
   1994 
   1995 //------------------------------------------------------------------------------------------
   1996 //
   1997 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   1998 //                             of RBBIMonkeyKind.
   1999 //
   2000 //------------------------------------------------------------------------------------------
   2001 class RBBICharMonkey: public RBBIMonkeyKind {
   2002 public:
   2003     RBBICharMonkey();
   2004     virtual          ~RBBICharMonkey();
   2005     virtual  UVector *charClasses();
   2006     virtual  void     setText(const UnicodeString &s);
   2007     virtual  int32_t  next(int32_t i);
   2008 private:
   2009     UVector   *fSets;
   2010 
   2011     UnicodeSet  *fCRLFSet;
   2012     UnicodeSet  *fControlSet;
   2013     UnicodeSet  *fExtendSet;
   2014     UnicodeSet  *fRegionalIndicatorSet;
   2015     UnicodeSet  *fPrependSet;
   2016     UnicodeSet  *fSpacingSet;
   2017     UnicodeSet  *fLSet;
   2018     UnicodeSet  *fVSet;
   2019     UnicodeSet  *fTSet;
   2020     UnicodeSet  *fLVSet;
   2021     UnicodeSet  *fLVTSet;
   2022     UnicodeSet  *fHangulSet;
   2023     UnicodeSet  *fAnySet;
   2024 
   2025     const UnicodeString *fText;
   2026 };
   2027 
   2028 
   2029 RBBICharMonkey::RBBICharMonkey() {
   2030     UErrorCode  status = U_ZERO_ERROR;
   2031 
   2032     fText = NULL;
   2033 
   2034     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2035     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2036     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2037     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
   2038     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2039     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2040     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2041     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2042     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2043     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2044     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2045     fHangulSet  = new UnicodeSet();
   2046     fHangulSet->addAll(*fLSet);
   2047     fHangulSet->addAll(*fVSet);
   2048     fHangulSet->addAll(*fTSet);
   2049     fHangulSet->addAll(*fLVSet);
   2050     fHangulSet->addAll(*fLVTSet);
   2051     fAnySet     = new UnicodeSet(0, 0x10ffff);
   2052 
   2053     fSets       = new UVector(status);
   2054     fSets->addElement(fCRLFSet,    status);
   2055     fSets->addElement(fControlSet, status);
   2056     fSets->addElement(fExtendSet,  status);
   2057     fSets->addElement(fRegionalIndicatorSet, status);
   2058     if (!fPrependSet->isEmpty()) {
   2059         fSets->addElement(fPrependSet, status);
   2060     }
   2061     fSets->addElement(fSpacingSet, status);
   2062     fSets->addElement(fHangulSet,  status);
   2063     fSets->addElement(fAnySet,     status);
   2064     if (U_FAILURE(status)) {
   2065         deferredStatus = status;
   2066     }
   2067 }
   2068 
   2069 
   2070 void RBBICharMonkey::setText(const UnicodeString &s) {
   2071     fText = &s;
   2072 }
   2073 
   2074 
   2075 
   2076 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2077     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2078                               //   break position being tested.  The candidate break
   2079                               //   location is before p2.
   2080 
   2081     int     breakPos = -1;
   2082 
   2083     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2084 
   2085     if (U_FAILURE(deferredStatus)) {
   2086         return -1;
   2087     }
   2088 
   2089     // Previous break at end of string.  return DONE.
   2090     if (prevPos >= fText->length()) {
   2091         return -1;
   2092     }
   2093     p0 = p1 = p2 = p3 = prevPos;
   2094     c3 =  fText->char32At(prevPos);
   2095     c0 = c1 = c2 = 0;
   2096     (void)p0;   // suppress set but not used warning.
   2097     (void)c0;
   2098 
   2099     // Loop runs once per "significant" character position in the input text.
   2100     for (;;) {
   2101         // Move all of the positions forward in the input string.
   2102         p0 = p1;  c0 = c1;
   2103         p1 = p2;  c1 = c2;
   2104         p2 = p3;  c2 = c3;
   2105 
   2106         // Advancd p3 by one codepoint
   2107         p3 = fText->moveIndex32(p3, 1);
   2108         c3 = fText->char32At(p3);
   2109 
   2110         if (p1 == p2) {
   2111             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2112             continue;
   2113         }
   2114         if (p2 == fText->length()) {
   2115             // Reached end of string.  Always a break position.
   2116             break;
   2117         }
   2118 
   2119         // Rule  GB3   CR x LF
   2120         //     No Extend or Format characters may appear between the CR and LF,
   2121         //     which requires the additional check for p2 immediately following p1.
   2122         //
   2123         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2124             continue;
   2125         }
   2126 
   2127         // Rule (GB4).   ( Control | CR | LF ) <break>
   2128         if (fControlSet->contains(c1) ||
   2129             c1 == 0x0D ||
   2130             c1 == 0x0A)  {
   2131             break;
   2132         }
   2133 
   2134         // Rule (GB5)    <break>  ( Control | CR | LF )
   2135         //
   2136         if (fControlSet->contains(c2) ||
   2137             c2 == 0x0D ||
   2138             c2 == 0x0A)  {
   2139             break;
   2140         }
   2141 
   2142 
   2143         // Rule (GB6)  L x ( L | V | LV | LVT )
   2144         if (fLSet->contains(c1) &&
   2145                (fLSet->contains(c2)  ||
   2146                 fVSet->contains(c2)  ||
   2147                 fLVSet->contains(c2) ||
   2148                 fLVTSet->contains(c2))) {
   2149             continue;
   2150         }
   2151 
   2152         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2153         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2154             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2155             continue;
   2156         }
   2157 
   2158         // Rule (GB8)    ( LVT | T)  x T
   2159         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2160             fTSet->contains(c2))  {
   2161             continue;
   2162         }
   2163 
   2164         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
   2165         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2166             continue;
   2167         }
   2168 
   2169         // Rule (GB9)    Numeric x ALetter
   2170         if (fExtendSet->contains(c2))  {
   2171             continue;
   2172         }
   2173 
   2174         // Rule (GB9a)   x  SpacingMark
   2175         if (fSpacingSet->contains(c2)) {
   2176             continue;
   2177         }
   2178 
   2179         // Rule (GB9b)   Prepend x
   2180         if (fPrependSet->contains(c1)) {
   2181             continue;
   2182         }
   2183 
   2184         // Rule (GB10)  Any  <break>  Any
   2185         break;
   2186     }
   2187 
   2188     breakPos = p2;
   2189     return breakPos;
   2190 }
   2191 
   2192 
   2193 
   2194 UVector  *RBBICharMonkey::charClasses() {
   2195     return fSets;
   2196 }
   2197 
   2198 
   2199 RBBICharMonkey::~RBBICharMonkey() {
   2200     delete fSets;
   2201     delete fCRLFSet;
   2202     delete fControlSet;
   2203     delete fExtendSet;
   2204     delete fRegionalIndicatorSet;
   2205     delete fPrependSet;
   2206     delete fSpacingSet;
   2207     delete fLSet;
   2208     delete fVSet;
   2209     delete fTSet;
   2210     delete fLVSet;
   2211     delete fLVTSet;
   2212     delete fHangulSet;
   2213     delete fAnySet;
   2214 }
   2215 
   2216 //------------------------------------------------------------------------------------------
   2217 //
   2218 //   class RBBIWordMonkey      Word Break specific implementation
   2219 //                             of RBBIMonkeyKind.
   2220 //
   2221 //------------------------------------------------------------------------------------------
   2222 class RBBIWordMonkey: public RBBIMonkeyKind {
   2223 public:
   2224     RBBIWordMonkey();
   2225     virtual          ~RBBIWordMonkey();
   2226     virtual  UVector *charClasses();
   2227     virtual  void     setText(const UnicodeString &s);
   2228     virtual int32_t   next(int32_t i);
   2229 private:
   2230     UVector      *fSets;
   2231 
   2232     UnicodeSet  *fCRSet;
   2233     UnicodeSet  *fLFSet;
   2234     UnicodeSet  *fNewlineSet;
   2235     UnicodeSet  *fRegionalIndicatorSet;
   2236     UnicodeSet  *fKatakanaSet;
   2237     UnicodeSet  *fHebrew_LetterSet;
   2238     UnicodeSet  *fALetterSet;
   2239     // TODO(jungshik): Do we still need this change?
   2240     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   2241     UnicodeSet  *fSingle_QuoteSet;
   2242     UnicodeSet  *fDouble_QuoteSet;
   2243     UnicodeSet  *fMidNumLetSet;
   2244     UnicodeSet  *fMidLetterSet;
   2245     UnicodeSet  *fMidNumSet;
   2246     UnicodeSet  *fNumericSet;
   2247     UnicodeSet  *fFormatSet;
   2248     UnicodeSet  *fOtherSet;
   2249     UnicodeSet  *fExtendSet;
   2250     UnicodeSet  *fExtendNumLetSet;
   2251     UnicodeSet  *fDictionaryCjkSet;
   2252 
   2253     const UnicodeString  *fText;
   2254 };
   2255 
   2256 
   2257 RBBIWordMonkey::RBBIWordMonkey()
   2258 {
   2259     UErrorCode  status = U_ZERO_ERROR;
   2260 
   2261     fSets            = new UVector(status);
   2262 
   2263     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2264     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2265     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2266     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
   2267     // Exclude Hangul syllables from ALetterSet during testing.
   2268     // Leave CJK dictionary characters out from the monkey tests!
   2269 #if 0
   2270     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2271                                       "[\\p{Line_Break = Complex_Context}"
   2272                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2273                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2274                                       "]]",
   2275                                       status);
   2276 #endif
   2277     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
   2278     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2279     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
   2280     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   2281     fALetterSet->removeAll(*fDictionaryCjkSet);
   2282     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
   2283     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
   2284     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2285     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2286     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2287     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
   2288     // we should figure out why
   2289     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2290     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2291     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2292     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2293 
   2294     fOtherSet        = new UnicodeSet();
   2295     if(U_FAILURE(status)) {
   2296       deferredStatus = status;
   2297       return;
   2298     }
   2299 
   2300     fOtherSet->complement();
   2301     fOtherSet->removeAll(*fCRSet);
   2302     fOtherSet->removeAll(*fLFSet);
   2303     fOtherSet->removeAll(*fNewlineSet);
   2304     fOtherSet->removeAll(*fKatakanaSet);
   2305     fOtherSet->removeAll(*fHebrew_LetterSet);
   2306     fOtherSet->removeAll(*fALetterSet);
   2307     fOtherSet->removeAll(*fSingle_QuoteSet);
   2308     fOtherSet->removeAll(*fDouble_QuoteSet);
   2309     fOtherSet->removeAll(*fMidLetterSet);
   2310     fOtherSet->removeAll(*fMidNumSet);
   2311     fOtherSet->removeAll(*fNumericSet);
   2312     fOtherSet->removeAll(*fExtendNumLetSet);
   2313     fOtherSet->removeAll(*fFormatSet);
   2314     fOtherSet->removeAll(*fExtendSet);
   2315     fOtherSet->removeAll(*fRegionalIndicatorSet);
   2316     // Inhibit dictionary characters from being tested at all.
   2317     fOtherSet->removeAll(*fDictionaryCjkSet);
   2318     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2319 
   2320     fSets->addElement(fCRSet,                status);
   2321     fSets->addElement(fLFSet,                status);
   2322     fSets->addElement(fNewlineSet,           status);
   2323     fSets->addElement(fRegionalIndicatorSet, status);
   2324     fSets->addElement(fHebrew_LetterSet,     status);
   2325     fSets->addElement(fALetterSet,           status);
   2326     fSets->addElement(fSingle_QuoteSet,      status);
   2327     fSets->addElement(fDouble_QuoteSet,      status);
   2328     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
   2329     fSets->addElement(fMidLetterSet,         status);
   2330     fSets->addElement(fMidNumLetSet,         status);
   2331     fSets->addElement(fMidNumSet,            status);
   2332     fSets->addElement(fNumericSet,           status);
   2333     fSets->addElement(fFormatSet,            status);
   2334     fSets->addElement(fExtendSet,            status);
   2335     fSets->addElement(fOtherSet,             status);
   2336     fSets->addElement(fExtendNumLetSet,      status);
   2337 
   2338     if (U_FAILURE(status)) {
   2339         deferredStatus = status;
   2340     }
   2341 }
   2342 
   2343 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2344     fText       = &s;
   2345 }
   2346 
   2347 
   2348 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2349     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2350                               //   break position being tested.  The candidate break
   2351                               //   location is before p2.
   2352 
   2353     int     breakPos = -1;
   2354 
   2355     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2356 
   2357     if (U_FAILURE(deferredStatus)) {
   2358         return -1;
   2359     }
   2360 
   2361     // Prev break at end of string.  return DONE.
   2362     if (prevPos >= fText->length()) {
   2363         return -1;
   2364     }
   2365     p0 = p1 = p2 = p3 = prevPos;
   2366     c3 =  fText->char32At(prevPos);
   2367     c0 = c1 = c2 = 0;
   2368     (void)p0;       // Suppress set but not used warning.
   2369 
   2370     // Loop runs once per "significant" character position in the input text.
   2371     for (;;) {
   2372         // Move all of the positions forward in the input string.
   2373         p0 = p1;  c0 = c1;
   2374         p1 = p2;  c1 = c2;
   2375         p2 = p3;  c2 = c3;
   2376 
   2377         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2378         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2379         do {
   2380             p3 = fText->moveIndex32(p3, 1);
   2381             c3 = fText->char32At(p3);
   2382             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2383                break;
   2384             };
   2385         }
   2386         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2387 
   2388 
   2389         if (p1 == p2) {
   2390             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2391             continue;
   2392         }
   2393         if (p2 == fText->length()) {
   2394             // Reached end of string.  Always a break position.
   2395             break;
   2396         }
   2397 
   2398         // Rule  (3)   CR x LF
   2399         //     No Extend or Format characters may appear between the CR and LF,
   2400         //     which requires the additional check for p2 immediately following p1.
   2401         //
   2402         if (c1==0x0D && c2==0x0A) {
   2403             continue;
   2404         }
   2405 
   2406         // Rule (3a)  Break before and after newlines (including CR and LF)
   2407         //
   2408         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2409             break;
   2410         };
   2411         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2412             break;
   2413         };
   2414 
   2415         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
   2416         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2417             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2418             continue;
   2419         }
   2420 
   2421         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
   2422         //
   2423         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
   2424              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
   2425              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
   2426             continue;
   2427         }
   2428 
   2429         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
   2430         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
   2431             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
   2432             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
   2433             continue;
   2434         }
   2435 
   2436         // Rule (7a)     Hebrew_Letter x Single_Quote
   2437         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
   2438             continue;
   2439         }
   2440 
   2441         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
   2442         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
   2443             continue;
   2444         }
   2445 
   2446         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
   2447         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
   2448             continue;
   2449         }
   2450 
   2451         // Rule (8)    Numeric x Numeric
   2452         if (fNumericSet->contains(c1) &&
   2453             fNumericSet->contains(c2))  {
   2454             continue;
   2455         }
   2456 
   2457         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
   2458         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
   2459             fNumericSet->contains(c2))  {
   2460             continue;
   2461         }
   2462 
   2463         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
   2464         if (fNumericSet->contains(c1) &&
   2465             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
   2466             continue;
   2467         }
   2468 
   2469         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
   2470         if (fNumericSet->contains(c0) &&
   2471             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
   2472             fNumericSet->contains(c2)) {
   2473             continue;
   2474         }
   2475 
   2476         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
   2477         if (fNumericSet->contains(c1) &&
   2478             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
   2479             fNumericSet->contains(c3)) {
   2480             continue;
   2481         }
   2482 
   2483         // Rule (13)  Katakana x Katakana
   2484         if (fKatakanaSet->contains(c1) &&
   2485             fKatakanaSet->contains(c2))  {
   2486             continue;
   2487         }
   2488 
   2489         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
   2490         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
   2491              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2492              fExtendNumLetSet->contains(c2)) {
   2493                 continue;
   2494         }
   2495 
   2496         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
   2497         if (fExtendNumLetSet->contains(c1) &&
   2498                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
   2499                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
   2500             continue;
   2501         }
   2502 
   2503         // Rule 13c
   2504         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
   2505             continue;
   2506         }
   2507 
   2508         // Rule 14.  Break found here.
   2509         break;
   2510     }
   2511 
   2512     breakPos = p2;
   2513     return breakPos;
   2514 }
   2515 
   2516 
   2517 UVector  *RBBIWordMonkey::charClasses() {
   2518     return fSets;
   2519 }
   2520 
   2521 
   2522 RBBIWordMonkey::~RBBIWordMonkey() {
   2523     delete fSets;
   2524     delete fCRSet;
   2525     delete fLFSet;
   2526     delete fNewlineSet;
   2527     delete fKatakanaSet;
   2528     delete fHebrew_LetterSet;
   2529     delete fALetterSet;
   2530     delete fSingle_QuoteSet;
   2531     delete fDouble_QuoteSet;
   2532     delete fMidNumLetSet;
   2533     delete fMidLetterSet;
   2534     delete fMidNumSet;
   2535     delete fNumericSet;
   2536     delete fFormatSet;
   2537     delete fExtendSet;
   2538     delete fExtendNumLetSet;
   2539     delete fRegionalIndicatorSet;
   2540     delete fDictionaryCjkSet;
   2541     delete fOtherSet;
   2542 }
   2543 
   2544 
   2545 
   2546 
   2547 //------------------------------------------------------------------------------------------
   2548 //
   2549 //   class RBBISentMonkey      Sentence Break specific implementation
   2550 //                             of RBBIMonkeyKind.
   2551 //
   2552 //------------------------------------------------------------------------------------------
   2553 class RBBISentMonkey: public RBBIMonkeyKind {
   2554 public:
   2555     RBBISentMonkey();
   2556     virtual          ~RBBISentMonkey();
   2557     virtual  UVector *charClasses();
   2558     virtual  void     setText(const UnicodeString &s);
   2559     virtual int32_t   next(int32_t i);
   2560 private:
   2561     int               moveBack(int posFrom);
   2562     int               moveForward(int posFrom);
   2563     UChar32           cAt(int pos);
   2564 
   2565     UVector      *fSets;
   2566 
   2567     UnicodeSet  *fSepSet;
   2568     UnicodeSet  *fFormatSet;
   2569     UnicodeSet  *fSpSet;
   2570     UnicodeSet  *fLowerSet;
   2571     UnicodeSet  *fUpperSet;
   2572     UnicodeSet  *fOLetterSet;
   2573     UnicodeSet  *fNumericSet;
   2574     UnicodeSet  *fATermSet;
   2575     UnicodeSet  *fSContinueSet;
   2576     UnicodeSet  *fSTermSet;
   2577     UnicodeSet  *fCloseSet;
   2578     UnicodeSet  *fOtherSet;
   2579     UnicodeSet  *fExtendSet;
   2580 
   2581     const UnicodeString  *fText;
   2582 
   2583 };
   2584 
   2585 RBBISentMonkey::RBBISentMonkey()
   2586 {
   2587     UErrorCode  status = U_ZERO_ERROR;
   2588 
   2589     fSets            = new UVector(status);
   2590 
   2591     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2592     //                       set and made into character classes of their own.  For the monkey impl,
   2593     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2594     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2595     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2596     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2597     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2598     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2599     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2600     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2601     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2602     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2603     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2604     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2605     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2606     fOtherSet        = new UnicodeSet();
   2607 
   2608     if(U_FAILURE(status)) {
   2609       deferredStatus = status;
   2610       return;
   2611     }
   2612 
   2613     fOtherSet->complement();
   2614     fOtherSet->removeAll(*fSepSet);
   2615     fOtherSet->removeAll(*fFormatSet);
   2616     fOtherSet->removeAll(*fSpSet);
   2617     fOtherSet->removeAll(*fLowerSet);
   2618     fOtherSet->removeAll(*fUpperSet);
   2619     fOtherSet->removeAll(*fOLetterSet);
   2620     fOtherSet->removeAll(*fNumericSet);
   2621     fOtherSet->removeAll(*fATermSet);
   2622     fOtherSet->removeAll(*fSContinueSet);
   2623     fOtherSet->removeAll(*fSTermSet);
   2624     fOtherSet->removeAll(*fCloseSet);
   2625     fOtherSet->removeAll(*fExtendSet);
   2626 
   2627     fSets->addElement(fSepSet,       status);
   2628     fSets->addElement(fFormatSet,    status);
   2629     fSets->addElement(fSpSet,        status);
   2630     fSets->addElement(fLowerSet,     status);
   2631     fSets->addElement(fUpperSet,     status);
   2632     fSets->addElement(fOLetterSet,   status);
   2633     fSets->addElement(fNumericSet,   status);
   2634     fSets->addElement(fATermSet,     status);
   2635     fSets->addElement(fSContinueSet, status);
   2636     fSets->addElement(fSTermSet,     status);
   2637     fSets->addElement(fCloseSet,     status);
   2638     fSets->addElement(fOtherSet,     status);
   2639     fSets->addElement(fExtendSet,    status);
   2640 
   2641     if (U_FAILURE(status)) {
   2642         deferredStatus = status;
   2643     }
   2644 }
   2645 
   2646 
   2647 
   2648 void RBBISentMonkey::setText(const UnicodeString &s) {
   2649     fText       = &s;
   2650 }
   2651 
   2652 UVector  *RBBISentMonkey::charClasses() {
   2653     return fSets;
   2654 }
   2655 
   2656 
   2657 //  moveBack()   Find the "significant" code point preceding the index i.
   2658 //               Skips over ($Extend | $Format)* .
   2659 //
   2660 int RBBISentMonkey::moveBack(int i) {
   2661     if (i <= 0) {
   2662         return -1;
   2663     }
   2664     UChar32   c;
   2665     int32_t   j = i;
   2666     do {
   2667         j = fText->moveIndex32(j, -1);
   2668         c = fText->char32At(j);
   2669     }
   2670     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2671     return j;
   2672 
   2673  }
   2674 
   2675 
   2676 int RBBISentMonkey::moveForward(int i) {
   2677     if (i>=fText->length()) {
   2678         return fText->length();
   2679     }
   2680     UChar32   c;
   2681     int32_t   j = i;
   2682     do {
   2683         j = fText->moveIndex32(j, 1);
   2684         c = cAt(j);
   2685     }
   2686     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2687     return j;
   2688 }
   2689 
   2690 UChar32 RBBISentMonkey::cAt(int pos) {
   2691     if (pos<0 || pos>=fText->length()) {
   2692         return -1;
   2693     } else {
   2694         return fText->char32At(pos);
   2695     }
   2696 }
   2697 
   2698 int32_t RBBISentMonkey::next(int32_t prevPos) {
   2699     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2700                               //   break position being tested.  The candidate break
   2701                               //   location is before p2.
   2702 
   2703     int     breakPos = -1;
   2704 
   2705     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2706     UChar32 c;
   2707 
   2708     if (U_FAILURE(deferredStatus)) {
   2709         return -1;
   2710     }
   2711 
   2712     // Prev break at end of string.  return DONE.
   2713     if (prevPos >= fText->length()) {
   2714         return -1;
   2715     }
   2716     p0 = p1 = p2 = p3 = prevPos;
   2717     c3 =  fText->char32At(prevPos);
   2718     c0 = c1 = c2 = 0;
   2719     (void)p0;     // Suppress set but not used warning.
   2720 
   2721     // Loop runs once per "significant" character position in the input text.
   2722     for (;;) {
   2723         // Move all of the positions forward in the input string.
   2724         p0 = p1;  c0 = c1;
   2725         p1 = p2;  c1 = c2;
   2726         p2 = p3;  c2 = c3;
   2727 
   2728         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2729         p3 = moveForward(p3);
   2730         c3 = cAt(p3);
   2731 
   2732         // Rule (3)  CR x LF
   2733         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   2734             continue;
   2735         }
   2736 
   2737         // Rule (4).   Sep  <break>
   2738         if (fSepSet->contains(c1)) {
   2739             p2 = p1+1;   // Separators don't combine with Extend or Format.
   2740             break;
   2741         }
   2742 
   2743         if (p2 >= fText->length()) {
   2744             // Reached end of string.  Always a break position.
   2745             break;
   2746         }
   2747 
   2748         if (p2 == prevPos) {
   2749             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2750             continue;
   2751         }
   2752 
   2753         // Rule (6).   ATerm x Numeric
   2754         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   2755             continue;
   2756         }
   2757 
   2758         // Rule (7).  Upper ATerm  x  Uppper
   2759         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   2760             continue;
   2761         }
   2762 
   2763         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   2764         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   2765         //                  note to the Unicode 5.0 documents.
   2766         int p8 = p1;
   2767         while (fSpSet->contains(cAt(p8))) {
   2768             p8 = moveBack(p8);
   2769         }
   2770         while (fCloseSet->contains(cAt(p8))) {
   2771             p8 = moveBack(p8);
   2772         }
   2773         if (fATermSet->contains(cAt(p8))) {
   2774             p8=p2;
   2775             for (;;) {
   2776                 c = cAt(p8);
   2777                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   2778                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   2779                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   2780                     break;
   2781                 }
   2782                 p8 = moveForward(p8);
   2783             }
   2784             if (fLowerSet->contains(cAt(p8))) {
   2785                 continue;
   2786             }
   2787         }
   2788 
   2789         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   2790         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   2791             p8 = p1;
   2792             while (fSpSet->contains(cAt(p8))) {
   2793                 p8 = moveBack(p8);
   2794             }
   2795             while (fCloseSet->contains(cAt(p8))) {
   2796                 p8 = moveBack(p8);
   2797             }
   2798             c = cAt(p8);
   2799             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   2800                 continue;
   2801             }
   2802         }
   2803 
   2804         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   2805         int p9 = p1;
   2806         while (fCloseSet->contains(cAt(p9))) {
   2807             p9 = moveBack(p9);
   2808         }
   2809         c = cAt(p9);
   2810         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   2811             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2812                 continue;
   2813             }
   2814         }
   2815 
   2816         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   2817         int p10 = p1;
   2818         while (fSpSet->contains(cAt(p10))) {
   2819             p10 = moveBack(p10);
   2820         }
   2821         while (fCloseSet->contains(cAt(p10))) {
   2822             p10 = moveBack(p10);
   2823         }
   2824         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   2825             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   2826                 continue;
   2827             }
   2828         }
   2829 
   2830         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   2831         int p11 = p1;
   2832         if (fSepSet->contains(cAt(p11))) {
   2833             p11 = moveBack(p11);
   2834         }
   2835         while (fSpSet->contains(cAt(p11))) {
   2836             p11 = moveBack(p11);
   2837         }
   2838         while (fCloseSet->contains(cAt(p11))) {
   2839             p11 = moveBack(p11);
   2840         }
   2841         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   2842             break;
   2843         }
   2844 
   2845         //  Rule (12)  Any x Any
   2846         continue;
   2847     }
   2848     breakPos = p2;
   2849     return breakPos;
   2850 }
   2851 
   2852 RBBISentMonkey::~RBBISentMonkey() {
   2853     delete fSets;
   2854     delete fSepSet;
   2855     delete fFormatSet;
   2856     delete fSpSet;
   2857     delete fLowerSet;
   2858     delete fUpperSet;
   2859     delete fOLetterSet;
   2860     delete fNumericSet;
   2861     delete fATermSet;
   2862     delete fSContinueSet;
   2863     delete fSTermSet;
   2864     delete fCloseSet;
   2865     delete fOtherSet;
   2866     delete fExtendSet;
   2867 }
   2868 
   2869 
   2870 
   2871 //-------------------------------------------------------------------------------------------
   2872 //
   2873 //  RBBILineMonkey
   2874 //
   2875 //-------------------------------------------------------------------------------------------
   2876 
   2877 class RBBILineMonkey: public RBBIMonkeyKind {
   2878 public:
   2879     RBBILineMonkey();
   2880     virtual          ~RBBILineMonkey();
   2881     virtual  UVector *charClasses();
   2882     virtual  void     setText(const UnicodeString &s);
   2883     virtual  int32_t  next(int32_t i);
   2884     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   2885 private:
   2886     UVector      *fSets;
   2887 
   2888     UnicodeSet  *fBK;
   2889     UnicodeSet  *fCR;
   2890     UnicodeSet  *fLF;
   2891     UnicodeSet  *fCM;
   2892     UnicodeSet  *fNL;
   2893     UnicodeSet  *fSG;
   2894     UnicodeSet  *fWJ;
   2895     UnicodeSet  *fZW;
   2896     UnicodeSet  *fGL;
   2897     UnicodeSet  *fCB;
   2898     UnicodeSet  *fSP;
   2899     UnicodeSet  *fB2;
   2900     UnicodeSet  *fBA;
   2901     UnicodeSet  *fBB;
   2902     UnicodeSet  *fHY;
   2903     UnicodeSet  *fH2;
   2904     UnicodeSet  *fH3;
   2905     UnicodeSet  *fCL;
   2906     UnicodeSet  *fCP;
   2907     UnicodeSet  *fEX;
   2908     UnicodeSet  *fIN;
   2909     UnicodeSet  *fJL;
   2910     UnicodeSet  *fJV;
   2911     UnicodeSet  *fJT;
   2912     UnicodeSet  *fNS;
   2913     UnicodeSet  *fOP;
   2914     UnicodeSet  *fQU;
   2915     UnicodeSet  *fIS;
   2916     UnicodeSet  *fNU;
   2917     UnicodeSet  *fPO;
   2918     UnicodeSet  *fPR;
   2919     UnicodeSet  *fSY;
   2920     UnicodeSet  *fAI;
   2921     UnicodeSet  *fAL;
   2922     UnicodeSet  *fCJ;
   2923     UnicodeSet  *fHL;
   2924     UnicodeSet  *fID;
   2925     UnicodeSet  *fRI;
   2926     UnicodeSet  *fSA;
   2927     UnicodeSet  *fXX;
   2928 
   2929     BreakIterator        *fCharBI;
   2930     const UnicodeString  *fText;
   2931     RegexMatcher         *fNumberMatcher;
   2932 };
   2933 
   2934 
   2935 RBBILineMonkey::RBBILineMonkey()
   2936 {
        // Builds one UnicodeSet per Line_Break class, folds the classes whose default
        // behavior is that of another class into that class, registers the sets in
        // fSets, and creates the helpers used by next() (the LB 25 number regex and a
        // character break iterator).  Any failure is recorded in deferredStatus.
   2937     UErrorCode  status = U_ZERO_ERROR;
   2938 
            // Put the non-set pointer members into a known state before anything can
            // fail, so that no code path leaves them uninitialized.  (fText in
            // particular is otherwise only ever assigned by setText().)
            fText          = NULL;
            fCharBI        = NULL;
            fNumberMatcher = NULL;

   2939     fSets  = new UVector(status);
   2940 
   2941     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   2942     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   2943     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   2944     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   2945     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   2946     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   2947     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   2948     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   2949     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   2950     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   2951     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   2952     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   2953     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   2954     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   2955     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   2956     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   2957     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   2958     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   2959     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   2960     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   2961     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   2962     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   2963     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   2964     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   2965     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   2966     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   2967     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   2968     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   2969     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   2970     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   2971     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   2972     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   2973     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   2974     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
   2975     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
   2976     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   2977     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
   2978     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   2979     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   2980     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   2981 
   2982     if (U_FAILURE(status)) {
                // Set construction failed.  Record the error; fCharBI and
                // fNumberMatcher stay NULL, and next() refuses to run while
                // deferredStatus is a failure.
   2983         deferredStatus = status;
   2984         fCharBI = NULL;
   2985         fNumberMatcher = NULL;
   2986         return;
   2987     }
   2988 
   2989     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   2990     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   2991     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   2992     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   2993 
   2994     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
   2995 
            // Register the character classes that charClasses() exposes.
            // fCJ and fXX are not registered; their contents were merged into
            // fNS and fAL above.
   2996     fSets->addElement(fBK, status);
   2997     fSets->addElement(fCR, status);
   2998     fSets->addElement(fLF, status);
   2999     fSets->addElement(fCM, status);
   3000     fSets->addElement(fNL, status);
   3001     fSets->addElement(fWJ, status);
   3002     fSets->addElement(fZW, status);
   3003     fSets->addElement(fGL, status);
   3004     fSets->addElement(fCB, status);
   3005     fSets->addElement(fSP, status);
   3006     fSets->addElement(fB2, status);
   3007     fSets->addElement(fBA, status);
   3008     fSets->addElement(fBB, status);
   3009     fSets->addElement(fHY, status);
   3010     fSets->addElement(fH2, status);
   3011     fSets->addElement(fH3, status);
   3012     fSets->addElement(fCL, status);
   3013     fSets->addElement(fCP, status);
   3014     fSets->addElement(fEX, status);
   3015     fSets->addElement(fIN, status);
   3016     fSets->addElement(fJL, status);
   3017     fSets->addElement(fJT, status);
   3018     fSets->addElement(fJV, status);
   3019     fSets->addElement(fNS, status);
   3020     fSets->addElement(fOP, status);
   3021     fSets->addElement(fQU, status);
   3022     fSets->addElement(fIS, status);
   3023     fSets->addElement(fNU, status);
   3024     fSets->addElement(fPO, status);
   3025     fSets->addElement(fPR, status);
   3026     fSets->addElement(fSY, status);
   3027     fSets->addElement(fAI, status);
   3028     fSets->addElement(fAL, status);
   3029     fSets->addElement(fHL, status);
   3030     fSets->addElement(fID, status);
            // NOTE(review): fWJ was already added above, so it appears in fSets twice.
            // Left as is because removing it would change the contents of the vector
            // that charClasses() returns - confirm whether the duplicate is intended.
   3031     fSets->addElement(fWJ, status);
   3032     fSets->addElement(fRI, status);
   3033     fSets->addElement(fSA, status);
   3034     fSets->addElement(fSG, status);
   3035 
            // Regular expression for the number sequences handled under LB 25 in next():
            //    (PR | PO)? (OP | HY)? NU (NU | IS | SY)* (CL | CP)? (PR | PO)?
            // where every element may be followed by any number of CM characters.
   3036     const char *rules =
   3037             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3038             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3039             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3040             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3041             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3042             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3043 
   3044     fNumberMatcher = new RegexMatcher(
   3045         UnicodeString(rules, -1, US_INV), 0, status);
   3046 
   3047     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3048 
   3049     if (U_FAILURE(status)) {
   3050         deferredStatus = status;
   3051     }
   3052 }
   3053 
   3054 
   3055 void RBBILineMonkey::setText(const UnicodeString &s) {
        // Points this object at a new text to analyze.  Only the address of s is
        // kept, so s must stay alive (and unchanged) for as long as next() is used
        // on it.
   3056     fText       = &s;
            // The constructor leaves these helpers NULL when it fails (the failure is
            // reported through deferredStatus), so do not dereference them blindly.
            if (fCharBI != NULL) {
   3057         fCharBI->setText(s);
            }
            if (fNumberMatcher != NULL) {
   3058         fNumberMatcher->reset(s);
            }
   3059 }
   3060 
   3061 //
   3062 //  rule9Adjust
   3063 //     Line Break TR rules 9 and 10 implementation.
   3064 //     This deals with combining marks and other sequences
   3065 //     that must be treated as if they were something other than what they actually are.
   3066 //
   3067 //     This is factored out into a separate function because it must be applied twice for
   3068 //     each potential break, once to the chars before the position being checked, then
   3069 //     again to the text following the possible break.
   3070 //
   3071 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
        // Parameters:
        //    pos       index belonging to *posChar; only tested against -1 here.
        //    posChar   in/out: the character at pos.  Rewritten to 'A' (0x41) when it
        //              is itself a CM character (LB 10).
        //    nextPos   in/out: index of the character after pos.  On return it has
        //              been advanced past any CM characters attached to *posChar.
        //    nextChar  out: the character at the updated *nextPos.
        // When pos is -1 nothing is read or written.
   3072     if (pos == -1) {
   3073         // Invalid initial position.  Happens during the warmup iteration of the
   3074         //   main loop in next().
   3075         return;
   3076     }
   3077 
   3078     int32_t  nPos = *nextPos;
   3079 
   3080     // LB 9  Keep combining sequences together.
   3081     //  advance over any CM class chars.  Note that Line Break CM is different
   3082     //  from the normal Grapheme Extend property.
            //  The skip is not done when *posChar is SP, BK, CR, LF, NL or ZW.
   3083     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3084           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3085         for (;;) {
   3086             *nextChar = fText->char32At(nPos);
   3087             if (!fCM->contains(*nextChar)) {
   3088                 break;
   3089             }
   3090             nPos = fText->moveIndex32(nPos, 1);
   3091         }
   3092     }
   3093 
   3094 
   3095     // LB 9 Treat X CM* as if it were x.
   3096     //       No explicit action required.
   3097 
   3098     // LB 10  Treat any remaining combining mark as AL
   3099     if (fCM->contains(*posChar)) {
   3100         *posChar = 0x41;   // thisChar = 'A';
   3101     }
   3102 
   3103     // Push the updated nextPos and nextChar back to our caller.
   3104     // This only makes a difference if posChar got bigger by consuming a
   3105     // combining sequence.
   3106     *nextPos  = nPos;
   3107     *nextChar = fText->char32At(nPos);
   3108 }
   3109 
   3110 
   3111 
   3112 int32_t RBBILineMonkey::next(int32_t startPos) {
        // Finds the first line break position after startPos in the text given to
        // setText(), by testing the line break rules in order at each candidate
        // position.  A rule that forbids a break does "continue" (move on to the
        // next candidate); a rule that requires a break does "break" (leave the loop).
        //
        // Returns the index of the break, or -1 when deferredStatus holds a failure
        // or startPos is at or beyond the end of the text.
   3113     UErrorCode status = U_ZERO_ERROR;
   3114     int32_t    pos;       //  Index of the char following a potential break position
   3115     UChar32    thisChar;  //  Character at above position "pos"
   3116 
   3117     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3118     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3119                           //   and thisChar may not be adjacent because combining
   3120                           //   characters between them will be ignored.
   3121 
   3122     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
   3123     UChar32    prevCharX2;
   3124 
   3125     int32_t    nextPos;   //  Index of the next character following pos.
   3126                           //     Usually skips over combining marks.
   3127     int32_t    nextCPPos; //  Index of the code point following "pos."
   3128                           //     May point to a combining mark.
   3129     int32_t    tPos;      //  temp value.
   3130     UChar32    c;
   3131 
   3132     if (U_FAILURE(deferredStatus)) {
   3133         return -1;
   3134     }
   3135 
   3136     if (startPos >= fText->length()) {
   3137         return -1;
   3138     }
   3139 
   3140 
   3141     // Initial values for loop.  Loop will run the first time without finding breaks,
   3142     //                           while the invalid values shift out and the "this" and
   3143     //                           "prev" positions are filled in with good values.
   3144     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
   3145     thisChar = prevChar  = prevCharX2 = 0;
   3146     nextPos  = nextCPPos = startPos;
   3147 
   3148 
   3149     // Loop runs once per position in the test text, until a break position
   3150     //  is found.
   3151     for (;;) {
                // Shift the window one position: X2 <- prev <- this <- next.
   3152         prevPosX2 = prevPos;
   3153         prevCharX2 = prevChar;
   3154 
   3155         prevPos   = pos;
   3156         prevChar  = thisChar;
   3157 
   3158         pos       = nextPos;
   3159         thisChar  = fText->char32At(pos);
   3160 
   3161         nextCPPos = fText->moveIndex32(pos, 1);
   3162         nextPos   = nextCPPos;
   3163 
   3164         // Rule LB2 - Break at end of text.
   3165         if (pos >= fText->length()) {
   3166             break;
   3167         }
   3168 
   3169         // Rule LB 9 - adjust for combining sequences.
   3170         //             We do this one out-of-order because the adjustment does not change anything
   3171         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3172         //             be applied.
   3173         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3174         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3175         c = fText->char32At(nextPos);
   3176         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3177 
   3178         // If the loop is still warming up - if we haven't shifted the initial
   3179         //   -1 positions out of prevPos yet - loop back to advance the
   3180         //    position in the input without any further looking for breaks.
   3181         if (prevPos == -1) {
   3182             continue;
   3183         }
   3184 
   3185         // LB 4  Always break after hard line breaks,
   3186         if (fBK->contains(prevChar)) {
   3187             break;
   3188         }
   3189 
   3190         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3191         if (prevChar == 0x0d && thisChar == 0x0a) {
   3192             continue;
   3193         }
   3194         if (prevChar == 0x0d ||
   3195             prevChar == 0x0a ||
   3196             prevChar == 0x85)  {
   3197             break;
   3198         }
   3199 
   3200         // LB 6  Don't break before hard line breaks
   3201         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3202             fBK->contains(thisChar)) {
   3203                 continue;
   3204         }
   3205 
   3206 
   3207         // LB 7  Don't break before spaces or zero-width space.
   3208         if (fSP->contains(thisChar)) {
   3209             continue;
   3210         }
   3211 
   3212         if (fZW->contains(thisChar)) {
   3213             continue;
   3214         }
   3215 
   3216         // LB 8  Break after zero width space
   3217         if (fZW->contains(prevChar)) {
   3218             break;
   3219         }
   3220 
   3221         // LB 9, 10  Already done, at top of loop.
   3222         //
   3223 
   3224 
   3225         // LB 11  Do not break before or after WORD JOINER and related characters.
   3226         //    x  WJ
   3227         //    WJ  x
   3228         //
   3229         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3230             continue;
   3231         }
   3232 
   3233         // LB 12
   3234         //    GL  x
   3235         if (fGL->contains(prevChar)) {
   3236             continue;
   3237         }
   3238 
   3239         // LB 12a
   3240         //    [^SP BA HY] x GL
   3241         if (!(fSP->contains(prevChar) ||
   3242               fBA->contains(prevChar) ||
   3243               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3244             continue;
   3245         }
   3246 
   3247 
   3248 
   3249         // LB 13  Don't break before closings.
   3250         //        NU x CL,  NU x CP,  NU x IS  and NU x SY are not matched here so that
   3251         //        they will fall into LB 25 and the more general number regular expression.
   3252         //
   3253         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3254             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3255                                          fEX->contains(thisChar)  ||
   3256             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3257             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3258             continue;
   3259         }
   3260 
   3261         // LB 14 Don't break after OP SP*
   3262         //       Scan backwards, checking for this sequence.
   3263         //       The OP char could include combining marks, so we actually check for
   3264         //           OP CM* SP*
   3265         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3266         //       sequence into a ID char, so before scanning back through spaces,
   3267         //       verify that prevChar is indeed a space.  The prevChar variable
   3268         //       may differ from fText[prevPos]
   3269         tPos = prevPos;
   3270         if (fSP->contains(prevChar)) {
   3271             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3272                 tPos=fText->moveIndex32(tPos, -1);
   3273             }
   3274         }
   3275         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3276             tPos=fText->moveIndex32(tPos, -1);
   3277         }
   3278         if (fOP->contains(fText->char32At(tPos))) {
   3279             continue;
   3280         }
   3281 
   3282 
   3283         // LB 15    QU SP* x OP
   3284         if (fOP->contains(thisChar)) {
   3285             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
                    // Note: this block-local tPos shadows the function-level tPos.
   3286             int tPos = prevPos;
   3287             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3288                 tPos = fText->moveIndex32(tPos, -1);
   3289             }
   3290             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3291                 tPos = fText->moveIndex32(tPos, -1);
   3292             }
   3293             if (fQU->contains(fText->char32At(tPos))) {
   3294                 continue;
   3295             }
   3296         }
   3297 
   3298 
   3299 
   3300         // LB 16   (CL | CP) SP* x NS
   3301         //    Scan backwards for SP* CM* (CL | CP)
   3302         if (fNS->contains(thisChar)) {
                    // Note: this block-local tPos shadows the function-level tPos.
   3303             int tPos = prevPos;
   3304             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3305                 tPos = fText->moveIndex32(tPos, -1);
   3306             }
   3307             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3308                 tPos = fText->moveIndex32(tPos, -1);
   3309             }
   3310             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3311                 continue;
   3312             }
   3313         }
   3314 
   3315 
   3316         // LB 17        B2 SP* x B2
   3317         if (fB2->contains(thisChar)) {
   3318             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3319             tPos = prevPos;
   3320             if (fSP->contains(prevChar)) {
   3321                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3322                     tPos=fText->moveIndex32(tPos, -1);
   3323                 }
   3324             }
   3325             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3326                 tPos=fText->moveIndex32(tPos, -1);
   3327             }
   3328             if (fB2->contains(fText->char32At(tPos))) {
   3329                 continue;
   3330             }
   3331         }
   3332 
   3333 
   3334         // LB 18    break after space
   3335         if (fSP->contains(prevChar)) {
   3336             break;
   3337         }
   3338 
   3339         // LB 19
   3340         //    x   QU
   3341         //    QU  x
   3342         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3343             continue;
   3344         }
   3345 
   3346         // LB 20  Break around a CB
   3347         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3348             break;
   3349         }
   3350 
   3351         // LB 21
                //    x  BA,   x  HY,   x  NS,   BB  x
   3352         if (fBA->contains(thisChar) ||
   3353             fHY->contains(thisChar) ||
   3354             fNS->contains(thisChar) ||
   3355             fBB->contains(prevChar) )   {
   3356             continue;
   3357         }
   3358 
   3359         // LB 21a
   3360         //   HL (HY | BA) x
   3361         if (fHL->contains(prevCharX2) &&
   3362                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
   3363             continue;
   3364         }
   3365 
   3366         // LB 21b
   3367         //   SY x HL
   3368         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
   3369             continue;
   3370         }
   3371 
   3372         // LB 22
                //    (AL | HL | ID | IN | NU) x IN
   3373         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3374             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
   3375             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3376             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3377             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3378             continue;
   3379         }
   3380 
   3381 
   3382         // LB 23    ID x PO
   3383         //          AL x NU
   3384         //          HL x NU
   3385         //          NU x AL
                //          NU x HL
   3386         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3387             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3388             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
   3389             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
   3390             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
   3391             continue;
   3392         }
   3393 
   3394         // LB 24  Do not break between prefix and letters or ideographs.
   3395         //        PR x ID
   3396         //        PR x (AL | HL)
   3397         //        PO x (AL | HL)
   3398         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3399             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
   3400             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
   3401             continue;
   3402         }
   3403 
   3404 
   3405 
   3406         // LB 25    Numbers
                //          Try to match the number regular expression (built in the
                //          constructor) starting at prevPos.
   3407         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3408             if (U_FAILURE(status)) {
   3409                 break;
   3410             }
   3411             // Matched a number.  But could have been just a single digit, which would
   3412             //    not represent a "no break here" between prevChar and thisChar
   3413             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3414             if (numEndIdx > pos) {
   3415                 // Number match includes at least our two chars being checked
   3416                 if (numEndIdx > nextPos) {
   3417                     // Number match includes additional chars.  Update pos and nextPos
   3418                     //   so that next loop iteration will continue at the end of the number,
   3419                     //   checking for breaks between last char in number & whatever follows.
   3420                     pos = nextPos = numEndIdx;
   3421                     do {
   3422                         pos = fText->moveIndex32(pos, -1);
   3423                         thisChar = fText->char32At(pos);
   3424                     } while (fCM->contains(thisChar));
   3425                 }
   3426                 continue;
   3427             }
   3428         }
   3429 
   3430 
   3431         // LB 26 Do not break a Korean syllable.
                //        JL x (JL | JV | H2 | H3)
                //        (JV | H2) x (JV | JT)
                //        (JT | H3) x JT
   3432         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3433                                         fJV->contains(thisChar) ||
   3434                                         fH2->contains(thisChar) ||
   3435                                         fH3->contains(thisChar))) {
   3436                                             continue;
   3437                                         }
   3438 
   3439         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3440             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3441                 continue;
   3442         }
   3443 
   3444         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3445             fJT->contains(thisChar)) {
   3446                 continue;
   3447         }
   3448 
   3449         // LB 27 Treat a Korean Syllable Block the same as ID.
                //        (JL | JV | JT | H2 | H3) x IN
                //        (JL | JV | JT | H2 | H3) x PO
                //        PR x (JL | JV | JT | H2 | H3)
   3450         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3451             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3452             fIN->contains(thisChar)) {
   3453                 continue;
   3454             }
   3455         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3456             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3457             fPO->contains(thisChar)) {
   3458                 continue;
   3459             }
   3460         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3461             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3462                 continue;
   3463             }
   3464 
   3465 
   3466 
   3467         // LB 28  Do not break between alphabetics ("at").
   3468         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3469             continue;
   3470         }
   3471 
   3472         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3473         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
   3474             continue;
   3475         }
   3476 
   3477         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3478         //          (AL | HL | NU) x OP
   3479         //          CP x (AL | HL | NU)
   3480         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3481             continue;
   3482         }
   3483         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
   3484             continue;
   3485         }
   3486 
   3487         // LB30a  Do not break between regional indicators.
   3488         //        RI x RI
   3489         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
   3490             continue;
   3491         }
   3492 
   3493         // LB 31    Break everywhere else
   3494         break;
   3495 
   3496     }
   3497 
   3498     return pos;
   3499 }
   3500 
   3501 
   3502 UVector  *RBBILineMonkey::charClasses() {
            // Returns the vector of character class sets filled in by the constructor.
            // The vector remains owned by this object (it is deleted in the destructor).
   3503     return fSets;
   3504 }
   3505 
   3506 
   3507 RBBILineMonkey::~RBBILineMonkey() {
            // Releases everything the constructor allocated: the vector, then each
            // set individually, then the break iterator and the regex matcher.
            // fText is not deleted; it points at the caller's string.
   3508     delete fSets;
   3509 
            // This list includes fCJ and fXX, which the constructor allocates but
            // does not add to fSets.
   3510     delete fBK;
   3511     delete fCR;
   3512     delete fLF;
   3513     delete fCM;
   3514     delete fNL;
   3515     delete fWJ;
   3516     delete fZW;
   3517     delete fGL;
   3518     delete fCB;
   3519     delete fSP;
   3520     delete fB2;
   3521     delete fBA;
   3522     delete fBB;
   3523     delete fHY;
   3524     delete fH2;
   3525     delete fH3;
   3526     delete fCL;
   3527     delete fCP;
   3528     delete fEX;
   3529     delete fIN;
   3530     delete fJL;
   3531     delete fJV;
   3532     delete fJT;
   3533     delete fNS;
   3534     delete fOP;
   3535     delete fQU;
   3536     delete fIS;
   3537     delete fNU;
   3538     delete fPO;
   3539     delete fPR;
   3540     delete fSY;
   3541     delete fAI;
   3542     delete fAL;
   3543     delete fCJ;
   3544     delete fHL;
   3545     delete fID;
   3546     delete fRI;
   3547     delete fSA;
   3548     delete fSG;
   3549     delete fXX;
   3550 
   3551     delete fCharBI;
   3552     delete fNumberMatcher;
   3553 }
   3554 
   3555 
   3556 //-------------------------------------------------------------------------------------------
   3557 //
   3558 //   TestMonkey
   3559 //
   3560 //     params
   3561 //       seed=nnnnn        Random number starting seed.
   3562 //                         Setting the seed allows errors to be reproduced.
   3563 //       loop=nnn          Looping count.  Controls running time.
   3564 //                         -1:  run forever.
   3565 //                          0 or greater:  run length.
   3566 //
   3567 //       type = char | word | line | sent | title
   3568 //
   3569 //-------------------------------------------------------------------------------------------
   3570 
   3571 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
        // Looks for "name = <integer>" in params (optional spaces around '=', optional
        // leading '-').  When found, the integer is returned and the matched text is
        // removed from params; otherwise defaultVal is returned and params is left
        // unchanged.
        //    name        parameter name; used as the start of a regular expression.
        //    params      in/out: the parameter string being parsed.
        //    defaultVal  value returned when the parameter is absent.
   3572     int32_t val = defaultVal;
            // name is a by-value copy, so it can be extended into the full pattern here.
   3573     name.append(" *= *(-?\\d+)");
   3574     UErrorCode status = U_ZERO_ERROR;
   3575     RegexMatcher m(name, params, 0, status);
   3576     if (m.find()) {
   3577         // The param exists.  Convert the string to an int.
   3578         char valString[100];
   3579         int32_t paramLength = m.end(1, status) - m.start(1, status);
                // Clamp over-long digit strings so they fit in valString.
   3580         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3581             paramLength = (int32_t)(sizeof(valString)-2);
   3582         }
   3583         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3584         val = strtol(valString,  NULL, 10);
   3585 
   3586         // Delete this parameter from the params string.
   3587         m.reset();
   3588         params = m.replaceFirst("", status);
   3589     }
   3590     U_ASSERT(U_SUCCESS(status));
   3591     return val;
   3592 }
   3593 #endif
   3594 
   3595 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3596 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3597                                     BreakIterator *bi,
   3598                                     int expected[],
   3599                                     int expectedcount)
   3600 {
        // Checks a break iterator against a list of expected boundaries, in four passes:
        //   1. forward iteration with first()/next() yields exactly the expected list;
        //   2. isBoundary() is true at the expected positions checked and false between them;
        //   3. backward iteration with last()/previous() yields the same boundaries in
        //      reverse order;
        //   4. preceding() of every position after expected[i], up to and including
        //      expected[i+1], returns expected[i].
        // The first failure is reported through test->errln() and ends the check.
        //
        //    test           test object used for error reporting.
        //    ustr           text to analyze.
        //    bi             break iterator under test; its text is replaced by ustr.
        //    expected       expected boundary positions, in the order first()/next()
        //                   should produce them.
        //    expectedcount  number of entries in expected.
   3601     int count = 0;
   3602     int i = 0;
   3603     int forward[50];
            const int forwardCapacity = (int)(sizeof(forward) / sizeof(forward[0]));
   3604     bi->setText(ustr);
   3605     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
                if (count >= forwardCapacity) {
                    // The iterator produced more boundaries than forward[] can hold.
                    // Report it instead of writing past the end of the array.
                    printStringBreaks(ustr, expected, expectedcount);
                    test->errln("break forward test failed: more than %d boundaries found",
                                forwardCapacity);
                    return;
                }
   3606         forward[count] = i;
   3607         if (count < expectedcount && expected[count] != i) {
   3608             test->errln("break forward test failed: expected %d but got %d",
   3609                         expected[count], i);
   3610             break;
   3611         }
   3612         count ++;
   3613     }
   3614     if (count != expectedcount) {
   3615         printStringBreaks(ustr, expected, expectedcount);
   3616         test->errln("break forward test failed: missed %d match",
   3617                     expectedcount - count);
   3618         return;
   3619     }
   3620     // testing boundaries
   3621     for (i = 1; i < expectedcount; i ++) {
   3622         int j = expected[i - 1];
   3623         if (!bi->isBoundary(j)) {
   3624             printStringBreaks(ustr, expected, expectedcount);
   3625             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3626             return;
   3627         }
   3628         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3629             if (bi->isBoundary(j)) {
   3630                 printStringBreaks(ustr, expected, expectedcount);
   3631                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3632                 return;
   3633             }
   3634         }
   3635     }
   3636 
            // testing previous(): walk backwards and compare against the boundaries
            // recorded during the forward pass.
   3637     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
                if (count <= 0) {
                    // previous() produced more boundaries than next() did.  Report it
                    // instead of reading before the start of forward[].
                    printStringBreaks(ustr, expected, expectedcount);
                    test->errln("break test previous() failed: found more boundaries than next() did");
                    return;
                }
   3638         count --;
   3639         if (forward[count] != i) {
   3640             printStringBreaks(ustr, expected, expectedcount);
   3641             test->errln("happy break test previous() failed: expected %d but got %d",
   3642                         forward[count], i);
   3643             break;
   3644         }
   3645     }
   3646     if (count != 0) {
   3647         printStringBreaks(ustr, expected, expectedcount);
   3648         test->errln("break test previous() failed: missed a match");
   3649         return;
   3650     }
   3651 
   3652     // testing preceding
   3653     for (i = 0; i < expectedcount - 1; i ++) {
   3654         // int j = expected[i] + 1;
   3655         int j = ustr.moveIndex32(expected[i], 1);
   3656         for (; j <= expected[i + 1]; j ++) {
   3657             if (bi->preceding(j) != expected[i]) {
   3658                 printStringBreaks(ustr, expected, expectedcount);
   3659                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3660                 return;
   3661             }
   3662         }
   3663     }
   3664 }
   3665 #endif
   3666 
   3667 void RBBITest::TestWordBreaks(void)
   3668 {
   3669 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3670 
   3671     Locale        locale("en");
   3672     UErrorCode    status = U_ZERO_ERROR;
   3673     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3674     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3675     // Replaced any C+J characters in a row with a random sequence of characters
   3676     // of the same length to make our C+J segmentation not get in the way.
   3677     static const char *strlist[] =
   3678     {
   3679     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3680     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3681     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3682     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3683     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3684     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3685     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3686     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3687     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3688     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3689     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3690     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3691     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3692     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3693     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3694     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3695     "\\u0027\\u11af\\U000e0057\\u0602",
   3696     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3697     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3698     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3699     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3700     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3701     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3702     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3703     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3704     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3705     "\\u18f4\\U000e0049\\u20e7\\u2027",
   3706     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3707     "\\ua183\\u102d\\u0bec\\u003a",
   3708     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3709     "\\u003a\\u0e57\\u0fad\\u002e",
   3710     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3711     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3712     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3713     "\\u003a\\u0664\\u00b7\\u1fba",
   3714     "\\u003b\\u0027\\u00b7\\u47a3",
   3715     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3716     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3717     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3718     };
   3719     int loop;
   3720     if (U_FAILURE(status)) {
   3721         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3722         return;
   3723     }
   3724     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3725         // printf("looping %d\n", loop);
   3726         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3727         // RBBICharMonkey monkey;
   3728         RBBIWordMonkey monkey;
   3729 
   3730         int expected[50];
   3731         int expectedcount = 0;
   3732 
   3733         monkey.setText(ustr);
   3734         int i;
   3735         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3736             expected[expectedcount ++] = i;
   3737         }
   3738 
   3739         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3740     }
   3741     delete bi;
   3742 #endif
   3743 }
   3744 
   3745 void RBBITest::TestWordBoundary(void)
   3746 {
   3747     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3748     Locale        locale("en");
   3749     UErrorCode    status = U_ZERO_ERROR;
   3750     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3751     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3752     UChar         str[50];
   3753     static const char *strlist[] =
   3754     {
   3755     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3756     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3757     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3758     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3759     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3760     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3761     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3762     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3763     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3764     "\\u0027\\u11af\\U000e0057\\u0602",
   3765     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3766     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3767     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3768     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3769     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3770     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3771     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3772     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3773     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3774     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3775     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3776     "\\ua183\\u102d\\u0bec\\u003a",
   3777     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3778     "\\u003a\\u0e57\\u0fad\\u002e",
   3779     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3780     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3781     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   3782     "\\u003a\\u0664\\u00b7\\u1fba",
   3783     "\\u003b\\u0027\\u00b7\\u47a3",
   3784     };
   3785     int loop;
   3786     if (U_FAILURE(status)) {
   3787         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3788         return;
   3789     }
   3790     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3791         // printf("looping %d\n", loop);
   3792         u_unescape(strlist[loop], str, 20);
   3793         UnicodeString ustr(str);
   3794         int forward[50];
   3795         int count = 0;
   3796 
   3797         bi->setText(ustr);
   3798         int prev = 0;
   3799         int i;
   3800         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3801             forward[count ++] = i;
   3802             if (i > prev) {
   3803                 int j;
   3804                 for (j = prev + 1; j < i; j ++) {
   3805                     if (bi->isBoundary(j)) {
   3806                         printStringBreaks(ustr, forward, count);
   3807                         errln("happy boundary test failed: expected %d not a boundary",
   3808                                j);
   3809                         return;
   3810                     }
   3811                 }
   3812             }
   3813             if (!bi->isBoundary(i)) {
   3814                 printStringBreaks(ustr, forward, count);
   3815                 errln("happy boundary test failed: expected %d a boundary",
   3816                        i);
   3817                 return;
   3818             }
   3819             prev = i;
   3820         }
   3821     }
   3822     delete bi;
   3823 }
   3824 
   3825 void RBBITest::TestLineBreaks(void)
   3826 {
   3827 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3828     Locale        locale("en");
   3829     UErrorCode    status = U_ZERO_ERROR;
   3830     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   3831     const int32_t  STRSIZE = 50;
   3832     UChar         str[STRSIZE];
   3833     static const char *strlist[] =
   3834     {
   3835      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   3836      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   3837              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   3838      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   3839              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   3840      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   3841      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3842      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   3843      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   3844      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   3845      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   3846      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   3847      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   3848      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   3849      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   3850      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   3851      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   3852      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   3853      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   3854      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   3855      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   3856      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   3857      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   3858      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   3859      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   3860      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   3861      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   3862      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   3863      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   3864      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   3865      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   3866      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   3867      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   3868      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   3869      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   3870      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   3871      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   3872      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   3873      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   3874      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   3875      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   3876      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   3877          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   3878          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   3879          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   3880      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   3881          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   3882     };
   3883     int loop;
   3884     TEST_ASSERT_SUCCESS(status);
   3885     if (U_FAILURE(status)) {
   3886         return;
   3887     }
   3888     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3889         // printf("looping %d\n", loop);
   3890         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   3891         if (t >= STRSIZE) {
   3892             TEST_ASSERT(FALSE);
   3893             continue;
   3894         }
   3895 
   3896 
   3897         UnicodeString ustr(str);
   3898         RBBILineMonkey monkey;
   3899         if (U_FAILURE(monkey.deferredStatus)) {
   3900             continue;
   3901         }
   3902 
   3903         const int EXPECTEDSIZE = 50;
   3904         int expected[EXPECTEDSIZE];
   3905         int expectedcount = 0;
   3906 
   3907         monkey.setText(ustr);
   3908         int i;
   3909         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3910             if (expectedcount >= EXPECTEDSIZE) {
   3911                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3912                 return;
   3913             }
   3914             expected[expectedcount ++] = i;
   3915         }
   3916 
   3917         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3918     }
   3919     delete bi;
   3920 #endif
   3921 }
   3922 
   3923 void RBBITest::TestSentBreaks(void)
   3924 {
   3925 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3926     Locale        locale("en");
   3927     UErrorCode    status = U_ZERO_ERROR;
   3928     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   3929     UChar         str[200];
   3930     static const char *strlist[] =
   3931     {
   3932      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   3933      "This\n",
   3934      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   3935      "\"Sentence ending with a quote.\" Bye.",
   3936      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   3937      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   3938      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   3939      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   3940      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   3941      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   3942      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   3943              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   3944              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   3945              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   3946      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   3947              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   3948              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   3949              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   3950              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   3951              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   3952     };
   3953     int loop;
   3954     if (U_FAILURE(status)) {
   3955         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3956         return;
   3957     }
   3958     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3959         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   3960         UnicodeString ustr(str);
   3961 
   3962         RBBISentMonkey monkey;
   3963         if (U_FAILURE(monkey.deferredStatus)) {
   3964             continue;
   3965         }
   3966 
   3967         const int EXPECTEDSIZE = 50;
   3968         int expected[EXPECTEDSIZE];
   3969         int expectedcount = 0;
   3970 
   3971         monkey.setText(ustr);
   3972         int i;
   3973         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3974             if (expectedcount >= EXPECTEDSIZE) {
   3975                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   3976                 return;
   3977             }
   3978             expected[expectedcount ++] = i;
   3979         }
   3980 
   3981         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3982     }
   3983     delete bi;
   3984 #endif
   3985 }
   3986 
   3987 void RBBITest::TestMonkey(char *params) {
   3988 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3989 
   3990     UErrorCode     status    = U_ZERO_ERROR;
   3991     int32_t        loopCount = 500;
   3992     int32_t        seed      = 1;
   3993     UnicodeString  breakType = "all";
   3994     Locale         locale("en");
   3995     UBool          useUText  = FALSE;
   3996 
   3997     if (quick == FALSE) {
   3998         loopCount = 10000;
   3999     }
   4000 
   4001     if (params) {
   4002         UnicodeString p(params);
   4003         loopCount = getIntParam("loop", p, loopCount);
   4004         seed      = getIntParam("seed", p, seed);
   4005 
   4006         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4007         if (m.find()) {
   4008             breakType = m.group(1, status);
   4009             m.reset();
   4010             p = m.replaceFirst("", status);
   4011         }
   4012 
   4013         RegexMatcher u(" *utext", p, 0, status);
   4014         if (u.find()) {
   4015             useUText = TRUE;
   4016             u.reset();
   4017             p = u.replaceFirst("", status);
   4018         }
   4019 
   4020 
   4021         // m.reset(p);
   4022         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4023             // Each option is stripped out of the option string as it is processed.
   4024             // All options have been checked.  The option string should have been completely emptied..
   4025             char buf[100];
   4026             p.extract(buf, sizeof(buf), NULL, status);
   4027             buf[sizeof(buf)-1] = 0;
   4028             errln("Unrecognized or extra parameter:  %s\n", buf);
   4029             return;
   4030         }
   4031 
   4032     }
   4033 
   4034     if (breakType == "char" || breakType == "all") {
   4035         RBBICharMonkey  m;
   4036         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4037         if (U_SUCCESS(status)) {
   4038             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4039             if (breakType == "all" && useUText==FALSE) {
   4040                 // Also run a quick test with UText when "all" is specified
   4041                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4042             }
   4043         }
   4044         else {
   4045             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4046         }
   4047         delete bi;
   4048     }
   4049 
   4050     if (breakType == "word" || breakType == "all") {
   4051         logln("Word Break Monkey Test");
   4052         RBBIWordMonkey  m;
   4053         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4054         if (U_SUCCESS(status)) {
   4055             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4056         }
   4057         else {
   4058             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4059         }
   4060         delete bi;
   4061     }
   4062 
   4063     if (breakType == "line" || breakType == "all") {
   4064         logln("Line Break Monkey Test");
   4065         RBBILineMonkey  m;
   4066         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4067         if (loopCount >= 10) {
   4068             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4069         }
   4070         if (U_SUCCESS(status)) {
   4071             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4072         }
   4073         else {
   4074             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4075         }
   4076         delete bi;
   4077     }
   4078 
   4079     if (breakType == "sent" || breakType == "all"  ) {
   4080         logln("Sentence Break Monkey Test");
   4081         RBBISentMonkey  m;
   4082         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4083         if (loopCount >= 10) {
   4084             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4085         }
   4086         if (U_SUCCESS(status)) {
   4087             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4088         }
   4089         else {
   4090             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4091         }
   4092         delete bi;
   4093     }
   4094 
   4095 #endif
   4096 }
   4097 
   4098 //
   4099 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4100 //    Parameters:
   4101 //       bi      - the break iterator to use
   4102 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4103 //       name    - Name of test (char, word, etc.) for use in error messages
   4104 //       seed    - Seed for starting random number generator (parameter from user)
   4105 //       numIterations
   4106 //
   4107 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4108                          int32_t numIterations, UBool useUText) {
   4109 
   4110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4111 
   4112     const int32_t    TESTSTRINGLEN = 500;
   4113     UnicodeString    testText;
   4114     int32_t          numCharClasses;
   4115     UVector          *chClasses;
   4116     int              expected[TESTSTRINGLEN*2 + 1];
   4117     int              expectedCount = 0;
   4118     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4119     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4120     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4121     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4122     char             followingBreaks[TESTSTRINGLEN*2+1];
   4123     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4124     int              i;
   4125     int              loopCount = 0;
   4126 
   4127     m_seed = seed;
   4128 
   4129     numCharClasses = mk.charClasses()->size();
   4130     chClasses      = mk.charClasses();
   4131 
   4132     // Check for errors that occured during the construction of the MonkeyKind object.
   4133     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4134     //  and is not visible outside of RBBITest :-(
   4135     if (U_FAILURE(mk.deferredStatus)) {
   4136         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4137         return;
   4138     }
   4139 
   4140     // Verify that the character classes all have at least one member.
   4141     for (i=0; i<numCharClasses; i++) {
   4142         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4143         if (s == NULL || s->size() == 0) {
   4144             errln("Character Class #%d is null or of zero size.", i);
   4145             return;
   4146         }
   4147     }
   4148 
   4149     while (loopCount < numIterations || numIterations == -1) {
   4150         if (numIterations == -1 && loopCount % 10 == 0) {
   4151             // If test is running in an infinite loop, display a periodic tic so
   4152             //   we can tell that it is making progress.
   4153             fprintf(stderr, ".");
   4154         }
   4155         // Save current random number seed, so that we can recreate the random numbers
   4156         //   for this loop iteration in event of an error.
   4157         seed = m_seed;
   4158 
   4159         // Populate a test string with data.
   4160         testText.truncate(0);
   4161         for (i=0; i<TESTSTRINGLEN; i++) {
   4162             int32_t  aClassNum = m_rand() % numCharClasses;
   4163             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4164             int32_t   charIdx = m_rand() % classSet->size();
   4165             UChar32   c = classSet->charAt(charIdx);
   4166             if (c < 0) {   // TODO:  deal with sets containing strings.
   4167                 errln("c < 0");
   4168                 break;
   4169             }
   4170             testText.append(c);
   4171         }
   4172 
   4173         // Calculate the expected results for this test string.
   4174         mk.setText(testText);
   4175         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4176         expectedBreaks[0] = 1;
   4177         int32_t breakPos = 0;
   4178         expectedCount = 0;
   4179         for (;;) {
   4180             breakPos = mk.next(breakPos);
   4181             if (breakPos == -1) {
   4182                 break;
   4183             }
   4184             if (breakPos > testText.length()) {
   4185                 errln("breakPos > testText.length()");
   4186             }
   4187             expectedBreaks[breakPos] = 1;
   4188             U_ASSERT(expectedCount<testText.length());
   4189             expected[expectedCount ++] = breakPos;
   4190             (void)expected;   // Set but not used warning.
   4191                               // TODO (andy): check it out.
   4192         }
   4193 
   4194         // Find the break positions using forward iteration
   4195         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4196         if (useUText) {
   4197             UErrorCode status = U_ZERO_ERROR;
   4198             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4199             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4200             bi->setText(testUText, status);
   4201             TEST_ASSERT_SUCCESS(status);
   4202             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4203                                       //  This UText can be closed immediately, so long as the
   4204                                       //  testText string continues to exist.
   4205         } else {
   4206             bi->setText(testText);
   4207         }
   4208 
   4209         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4210             if (i < 0 || i > testText.length()) {
   4211                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4212                 break;
   4213             }
   4214             forwardBreaks[i] = 1;
   4215         }
   4216 
   4217         // Find the break positions using reverse iteration
   4218         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4219         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4220             if (i < 0 || i > testText.length()) {
   4221                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4222                 break;
   4223             }
   4224             reverseBreaks[i] = 1;
   4225         }
   4226 
   4227         // Find the break positions using isBoundary() tests.
   4228         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4229         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4230         for (i=0; i<=testText.length(); i++) {
   4231             isBoundaryBreaks[i] = bi->isBoundary(i);
   4232         }
   4233 
   4234 
   4235         // Find the break positions using the following() function.
   4236         // printf(".");
   4237         memset(followingBreaks, 0, sizeof(followingBreaks));
   4238         int32_t   lastBreakPos = 0;
   4239         followingBreaks[0] = 1;
   4240         for (i=0; i<testText.length(); i++) {
   4241             breakPos = bi->following(i);
   4242             if (breakPos <= i ||
   4243                 breakPos < lastBreakPos ||
   4244                 breakPos > testText.length() ||
   4245                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4246                 errln("%s break monkey test: "
   4247                     "Out of range value returned by BreakIterator::following().\n"
   4248                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4249                          name, seed, i, breakPos, lastBreakPos);
   4250                 break;
   4251             }
   4252             followingBreaks[breakPos] = 1;
   4253             lastBreakPos = breakPos;
   4254         }
   4255 
   4256         // Find the break positions using the preceding() function.
   4257         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4258         lastBreakPos = testText.length();
   4259         precedingBreaks[testText.length()] = 1;
   4260         for (i=testText.length(); i>0; i--) {
   4261             breakPos = bi->preceding(i);
   4262             if (breakPos >= i ||
   4263                 breakPos > lastBreakPos ||
   4264                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4265                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4266                 errln("%s break monkey test: "
   4267                     "Out of range value returned by BreakIterator::preceding().\n"
   4268                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4269                     name,  i, breakPos, lastBreakPos);
   4270                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4271                     precedingBreaks[i] = 2;   // Forces an error.
   4272                 }
   4273             } else {
   4274                 if (breakPos >= 0) {
   4275                     precedingBreaks[breakPos] = 1;
   4276                 }
   4277                 lastBreakPos = breakPos;
   4278             }
   4279         }
   4280 
   4281         // Compare the expected and actual results.
   4282         for (i=0; i<=testText.length(); i++) {
   4283             const char *errorType = NULL;
   4284             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4285                 errorType = "next()";
   4286             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4287                 errorType = "previous()";
   4288             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4289                 errorType = "isBoundary()";
   4290             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4291                 errorType = "following()";
   4292             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4293                 errorType = "preceding()";
   4294             }
   4295 
   4296 
   4297             if (errorType != NULL) {
   4298                 // Format a range of the test text that includes the failure as
   4299                 //  a data item that can be included in the rbbi test data file.
   4300 
   4301                 // Start of the range is the last point where expected and actual results
   4302                 //   both agreed that there was a break position.
   4303                 int startContext = i;
   4304                 int32_t count = 0;
   4305                 for (;;) {
   4306                     if (startContext==0) { break; }
   4307                     startContext --;
   4308                     if (expectedBreaks[startContext] != 0) {
   4309                         if (count == 2) break;
   4310                         count ++;
   4311                     }
   4312                 }
   4313 
   4314                 // End of range is two expected breaks past the start position.
   4315                 int endContext = i + 1;
   4316                 int ci;
   4317                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4318                     for (;;) {
   4319                         if (endContext >= testText.length()) {break;}
   4320                         if (expectedBreaks[endContext-1] != 0) {
   4321                             if (count == 0) break;
   4322                             count --;
   4323                         }
   4324                         endContext ++;
   4325                     }
   4326                 }
   4327 
   4328                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4329                 UnicodeString errorText = "<data>";
   4330                 /***if (strcmp(errorType, "next()") == 0) {
   4331                     startContext = 0;
   4332                     endContext = testText.length();
   4333 
   4334                     printStringBreaks(testText, expected, expectedCount);
   4335                 }***/
   4336 
   4337                 for (ci=startContext; ci<endContext;) {
   4338                     UnicodeString hexChars("0123456789abcdef");
   4339                     UChar32  c;
   4340                     int      bn;
   4341                     c = testText.char32At(ci);
   4342                     if (ci == i) {
   4343                         // This is the location of the error.
   4344                         errorText.append("<?>");
   4345                     } else if (expectedBreaks[ci] != 0) {
   4346                         // This a non-error expected break position.
   4347                         errorText.append("\\");
   4348                     }
   4349                     if (c < 0x10000) {
   4350                         errorText.append("\\u");
   4351                         for (bn=12; bn>=0; bn-=4) {
   4352                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4353                         }
   4354                     } else {
   4355                         errorText.append("\\U");
   4356                         for (bn=28; bn>=0; bn-=4) {
   4357                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4358                         }
   4359                     }
   4360                     ci = testText.moveIndex32(ci, 1);
   4361                 }
   4362                 errorText.append("\\");
   4363                 errorText.append("</data>\n");
   4364 
   4365                 // Output the error
   4366                 char  charErrorTxt[500];
   4367                 UErrorCode status = U_ZERO_ERROR;
   4368                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4369                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4370                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
   4371 
   4372                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4373                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4374                     errorType, seed, i, charErrorTxt);
   4375                 break;
   4376             }
   4377         }
   4378 
   4379         loopCount++;
   4380     }
   4381 #endif
   4382 }
   4383 
   4384 
   4385 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4386 //             This test checks the initial patch,
   4387 //             which is to just keep it from crashing.  Correct word boundaries
   4388 //             await a proper fix to the dictionary code.
   4389 //
   4390 void RBBITest::TestBug5532(void)  {
   4391    // Text includes a mixture of Thai and Latin.
   4392    const unsigned char utf8Data[] = {
   4393            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4394            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4395            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4396            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4397            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4398            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4399            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4400            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4401            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4402            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4403            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4404 
   4405     UErrorCode status = U_ZERO_ERROR;
   4406     UText utext=UTEXT_INITIALIZER;
   4407     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4408     TEST_ASSERT_SUCCESS(status);
   4409 
   4410     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4411     TEST_ASSERT_SUCCESS(status);
   4412     if (U_SUCCESS(status)) {
   4413         bi->setText(&utext, status);
   4414         TEST_ASSERT_SUCCESS(status);
   4415 
   4416         int32_t breakCount = 0;
   4417         int32_t previousBreak = -1;
   4418         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4419             // For now, just make sure that the break iterator doesn't hang.
   4420             TEST_ASSERT(previousBreak < bi->current());
   4421             previousBreak = bi->current();
   4422         }
   4423         TEST_ASSERT(breakCount > 0);
   4424     }
   4425     delete bi;
   4426     utext_close(&utext);
   4427 }
   4428 
   4429 
   4430 void RBBITest::TestBug9983(void)  {
   4431     UnicodeString text = UnicodeString("\\u002A"  // * Other
   4432                                        "\\uFF65"  //   Other
   4433                                        "\\u309C"  //   Katakana
   4434                                        "\\uFF9F"  //   Extend
   4435                                        "\\uFF65"  //   Other
   4436                                        "\\u0020"  //   Other
   4437                                        "\\u0000").unescape();
   4438 
   4439     UErrorCode status = U_ZERO_ERROR;
   4440     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
   4441         BreakIterator::createWordInstance(Locale::getRoot(), status)));
   4442     TEST_ASSERT_SUCCESS(status);
   4443     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
   4444         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
   4445     TEST_ASSERT_SUCCESS(status);
   4446     if (U_FAILURE(status)) {
   4447         return;
   4448     }
   4449     int32_t offset, rstatus, iterationCount;
   4450 
   4451     brkiter->setText(text);
   4452     brkiter->last();
   4453     iterationCount = 0;
   4454     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
   4455         iterationCount++;
   4456         rstatus = brkiter->getRuleStatus();
   4457         (void)rstatus;     // Suppress set but not used warning.
   4458         if (iterationCount >= 10) {
   4459            break;
   4460         }
   4461     }
   4462     TEST_ASSERT(iterationCount == 6);
   4463 
   4464     brkiterPOSIX->setText(text);
   4465     brkiterPOSIX->last();
   4466     iterationCount = 0;
   4467     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
   4468         iterationCount++;
   4469         rstatus = brkiterPOSIX->getRuleStatus();
   4470         (void)rstatus;     // Suppress set but not used warning.
   4471         if (iterationCount >= 10) {
   4472            break;
   4473         }
   4474     }
   4475     TEST_ASSERT(iterationCount == 6);
   4476 }
   4477 
   4478 
   4479 //
   4480 //  TestDebug    -  A place-holder test for debugging purposes.
   4481 //                  For putting in fragments of other tests that can be invoked
   4482 //                  for tracing  without a lot of unwanted extra stuff happening.
   4483 //
   4484 void RBBITest::TestDebug(void) {
   4485 #if 0
   4486     UErrorCode   status = U_ZERO_ERROR;
   4487     int pos = 0;
   4488     int ruleStatus = 0;
   4489 
   4490     RuleBasedBreakIterator* bi =
   4491        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4492        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4493        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4494     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4495     // UnicodeString s("Aaa.  Bcd");
   4496     s = s.unescape();
   4497     bi->setText(s);
   4498     UBool r = bi->isBoundary(8);
   4499     printf("%s", r?"true":"false");
   4500     return;
   4501     pos = bi->last();
   4502     do {
   4503         // ruleStatus = bi->getRuleStatus();
   4504         printf("%d\t%d\n", pos, ruleStatus);
   4505         pos = bi->previous();
   4506     } while (pos != BreakIterator::DONE);
   4507 #endif
   4508 }
   4509 
   4510 void RBBITest::TestProperties() {
   4511     UErrorCode errorCode = U_ZERO_ERROR;
   4512     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
   4513     if (!prependSet.isEmpty()) {
   4514         errln(
   4515             "[:GCB=Prepend:] is not empty any more. "
   4516             "Uncomment relevant lines in source/data/brkitr/char.txt and "
   4517             "change this test to the opposite condition.");
   4518     }
   4519 }
   4520 
   4521 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4522