Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2009, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/utypes.h"
     17 #include "unicode/brkiter.h"
     18 #include "unicode/rbbi.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/utf16.h"
     21 #include "unicode/ucnv.h"
     22 #include "unicode/schriter.h"
     23 #include "unicode/uniset.h"
     24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     25 #include "unicode/ustring.h"
     26 #include "unicode/utext.h"
     27 #include "intltest.h"
     28 #include "rbbitst.h"
     29 #include <string.h>
     30 #include "uvector.h"
     31 #include "uvectr32.h"
     32 #include "triedict.h"
     33 #include <string.h>
     34 #include <stdio.h>
     35 #include <stdlib.h>
     36 #include "unicode/numfmt.h"
     37 #include "unicode/uscript.h"
     38 
     39 #define TEST_ASSERT(x) {if (!(x)) { \
     40     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     41 
     42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     43     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     44 
     45 
     46 //---------------------------------------------
     47 // runIndexedTest
     48 //---------------------------------------------
     49 
     50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     51 {
     52     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     53 
     54     switch (index) {
     55         case 0: name = "TestBug4153072";
     56             if(exec) TestBug4153072();                         break;
     57         case 1: name = "TestJapaneseLineBreak";
     58             if(exec) TestJapaneseLineBreak();                  break;
     59         case 2: name = "TestStatusReturn";
     60             if(exec) TestStatusReturn();                       break;
     61         case 3: name = "TestUnicodeFiles";
     62             if(exec) TestUnicodeFiles();                       break;
     63         case 4: name = "TestEmptyString";
     64             if(exec) TestEmptyString();                        break;
     65 
     66         case 5: name = "TestGetAvailableLocales";
     67             if(exec) TestGetAvailableLocales();                break;
     68 
     69         case 6: name = "TestGetDisplayName";
     70             if(exec) TestGetDisplayName();                     break;
     71 
     72         case 7: name = "TestEndBehaviour";
     73             if(exec) TestEndBehaviour();                       break;
     74         case 8: name = "TestMixedThaiLineBreak";
     75              if(exec) TestMixedThaiLineBreak();                break;
     76         case 9: name = "TestThaiLineBreak";
     77              if(exec) TestThaiLineBreak();                     break;
     78         case 10: name = "TestMaiyamok";
     79              if(exec) TestMaiyamok();                          break;
     80         case 11: name = "TestWordBreaks";
     81              if(exec) TestWordBreaks();                        break;
     82         case 12: name = "TestWordBoundary";
     83              if(exec) TestWordBoundary();                      break;
     84         case 13: name = "TestLineBreaks";
     85              if(exec) TestLineBreaks();                        break;
     86         case 14: name = "TestSentBreaks";
     87              if(exec) TestSentBreaks();                        break;
     88         case 15: name = "TestExtended";
     89              if(exec) TestExtended();                          break;
     90         case 16: name = "TestMonkey";
     91              if(exec) {
     92  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     93                TestMonkey(params);
     94  #else
     95                logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
     96  #endif
     97              }
     98                                                                break;
     99         case 17: name = "TestBug3818";
    100             if(exec) TestBug3818();                            break;
    101         case 18: name = "TestJapaneseWordBreak";
    102             if(exec) TestJapaneseWordBreak();                  break;
    103         case 19: name = "TestDebug";
    104             if(exec) TestDebug();                              break;
    105         case 20: name = "TestTrieDict";
    106             if(exec) TestTrieDict();                           break;
    107         case 21: name = "TestBug5775";
    108             if (exec) TestBug5775();                           break;
    109         case 22: name = "TestThaiBreaks";
    110             if (exec) TestThaiBreaks();                        break;
    111         case 23: name = "TestTailoredBreaks";
    112             if (exec) TestTailoredBreaks();                    break;
    113         case 24: name = "TestTrieDictWithValue";
    114             if(exec) TestTrieDictWithValue();                  break;
    115 
    116         default: name = ""; break; //needed to end loop
    117     }
    118 }
    119 
    120 
    121 //---------------------------------------------------------------------------
    122 //
    123 //   class BITestData   Holds a set of Break iterator test data and results
    124 //                      Includes
    125 //                         - the string data to be broken
    126 //                         - a vector of the expected break positions.
    127 //                         - a vector of source line numbers for the data,
    128 //                               (to help see where errors occurred.)
    129 //                         - The expected break tag values.
    130 //                         - Vectors of actual break positions and tag values.
    131 //                         - Functions for comparing actual with expected and
    132 //                            reporting errors.
    133 //
    134 //----------------------------------------------------------------------------
    135 class BITestData {
    136 public:
    137     UnicodeString    fDataToBreak;
    138     UVector          fExpectedBreakPositions;
    139     UVector          fExpectedTags;
    140     UVector          fLineNum;
    141     UVector          fActualBreakPositions;   // Test Results.
    142     UVector          fActualTags;
    143 
    144     BITestData(UErrorCode &status);
    145     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    146     void             checkResults(const char *heading, RBBITest *test);
    147     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    148     void             clearResults();
    149 };
    150 
    151 //
    152 // Constructor.
    153 //
    154 BITestData::BITestData(UErrorCode &status)
    155 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    156   fActualTags(status)
    157 {
    158 }
    159 
    160 //
    161 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    162 //                 The macro form collects the line number, which is helpful
    163 //                 when tracking down failures.
    164 //
    165 //                 A null data item is inserted at the start of each test's data
    166 //                  to put the starting zero into the data list.  The position saved for
    167 //                  each non-null item is its ending position.
    168 //
    169 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    170 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    171     if (U_FAILURE(status)) {return;}
    172     if (data != NULL) {
    173         fDataToBreak.append(CharsToUnicodeString(data));
    174     }
    175     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    176     fExpectedTags.addElement(tag, status);
    177     fLineNum.addElement(lineNum, status);
    178 }
    179 
    180 
    181 //
    182 //  checkResults.   Compare the actual and expected break positions, report any differences.
    183 //
    184 void BITestData::checkResults(const char *heading, RBBITest *test) {
    185     int32_t   expectedIndex = 0;
    186     int32_t   actualIndex = 0;
    187 
    188     for (;;) {
    189         // If we've run through both the expected and actual results vectors, we're done.
    190         //   break out of the loop.
    191         if (expectedIndex >= fExpectedBreakPositions.size() &&
    192             actualIndex   >= fActualBreakPositions.size()) {
    193             break;
    194         }
    195 
    196 
    197         if (expectedIndex >= fExpectedBreakPositions.size()) {
    198             err(heading, test, expectedIndex-1, actualIndex);
    199             actualIndex++;
    200             continue;
    201         }
    202 
    203         if (actualIndex >= fActualBreakPositions.size()) {
    204             err(heading, test, expectedIndex, actualIndex-1);
    205             expectedIndex++;
    206             continue;
    207         }
    208 
    209         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    210             err(heading, test, expectedIndex, actualIndex);
    211             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    212             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    213                 actualIndex++;
    214             } else {
    215                 expectedIndex++;
    216             }
    217             continue;
    218         }
    219 
    220         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    221             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    222                 heading, fLineNum.elementAt(expectedIndex),
    223                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    224         }
    225 
    226         actualIndex++;
    227         expectedIndex++;
    228     }
    229 }
    230 
    231 //
    232 //  err   -  An error was found.  Report it, along with information about where the
    233 //                                incorrectly broken test data appeared in the source file.
    234 //
    235 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    236 {
    237     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    238     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    239     int32_t   o        = 0;
    240     int32_t   line     = fLineNum.elementAti(expectedIdx);
    241     if (expectedIdx > 0) {
    242         // The line numbers are off by one because a premature break occurs somewhere
    243         //    within the previous item, rather than at the start of the current (expected) item.
    244         //    We want to report the offset of the unexpected break from the start of
    245         //      this previous item.
    246         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    247     }
    248     if (actual < expected) {
    249         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    250     } else {
    251         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    252     }
    253 }
    254 
    255 
    256 void BITestData::clearResults() {
    257     fActualBreakPositions.removeAllElements();
    258     fActualTags.removeAllElements();
    259 }
    260 
    261 
    262 //-----------------------------------------------------------------------------------
    263 //
    264 //    Canned Test Characters
    265 //
    266 //-----------------------------------------------------------------------------------
    267 
    268 static const UChar cannedTestArray[] = {
    269     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    270     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    271     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    272     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    273     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    274     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    275     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    276     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
    277 };
    278 
    279 static UnicodeString* cannedTestChars = 0;
    280 
    281 #define  halfNA     "\\u0928\\u094d\\u200d"
    282 #define  halfSA     "\\u0938\\u094d\\u200d"
    283 #define  halfCHA    "\\u091a\\u094d\\u200d"
    284 #define  halfKA     "\\u0915\\u094d\\u200d"
    285 #define  deadTA     "\\u0924\\u094d"
    286 
    287 //--------------------------------------------------------------------------------------
    288 //
    289 //    RBBITest    constructor and destructor
    290 //
    291 //--------------------------------------------------------------------------------------
    292 
    293 RBBITest::RBBITest() {
    294     UnicodeString temp(cannedTestArray);
    295     cannedTestChars = new UnicodeString();
    296     *cannedTestChars += (UChar)0x0000;
    297     *cannedTestChars += temp;
    298 }
    299 
    300 
    301 RBBITest::~RBBITest() {
    302     delete cannedTestChars;
    303 }
    304 
    305 
    306 static const int T_NUMBER = 100;
    307 static const int T_LETTER = 200;
    308 static const int T_H_OR_K = 300;
    309 static const int T_IDEO   = 400;
    310 
    311 
    312 
    313 
    314 
    315 
    316 //--------------------------------------------------------------------
    317 //Testing the BreakIterator for devanagari script
    318 //--------------------------------------------------------------------
    319 
    320 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
    321 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
    322 #define deadTTHA "\\u0920\\u094d"
    323 #define deadPA   "\\u092a\\u094d"
    324 #define deadSA   "\\u0938\\u094d"
    325 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    326 
    327 
    328 
    329 
    330 
    331 
    332 //-----------------------------------------------------------------------------------
    333 //
    334 //   Test for status {tag} return value from break rules.
    335 //        TODO:  a more thorough test.
    336 //
    337 //-----------------------------------------------------------------------------------
    338 void RBBITest::TestStatusReturn() {
    339      UnicodeString rulesString1("$Letters = [:L:];\n"
    340                                   "$Numbers = [:N:];\n"
    341                                   "$Letters+{1};\n"
    342                                   "$Numbers+{2};\n"
    343                                   "Help\\ {4}/me\\!;\n"
    344                                   "[^$Letters $Numbers];\n"
    345                                   "!.*;\n", -1, US_INV);
    346      UnicodeString testString1  = "abc123..abc Help me Help me!";
    347                                 // 01234567890123456789012345678
    348      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    349      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    350 
    351      UErrorCode status=U_ZERO_ERROR;
    352      UParseError    parseError;
    353 
    354      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    355      if(U_FAILURE(status)) {
    356          dataerrln("FAIL : in construction - %s", u_errorName(status));
    357      } else {
    358          int32_t  pos;
    359          int32_t  i = 0;
    360          bi->setText(testString1);
    361          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    362              if (pos != bounds1[i]) {
    363                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    364                  break;
    365              }
    366 
    367              int tag = bi->getRuleStatus();
    368              if (tag != brkStatus[i]) {
    369                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    370                  break;
    371              }
    372              i++;
    373          }
    374      }
    375      delete bi;
    376 }
    377 
    378 
    379 static void printStringBreaks(UnicodeString ustr, int expected[],
    380                               int expectedcount)
    381 {
    382     UErrorCode status = U_ZERO_ERROR;
    383     char name[100];
    384     printf("code    alpha extend alphanum type word sent line name\n");
    385     int j;
    386     for (j = 0; j < ustr.length(); j ++) {
    387         if (expectedcount > 0) {
    388             int k;
    389             for (k = 0; k < expectedcount; k ++) {
    390                 if (j == expected[k]) {
    391                     printf("------------------------------------------------ %d\n",
    392                            j);
    393                 }
    394             }
    395         }
    396         UChar32 c = ustr.char32At(j);
    397         if (c > 0xffff) {
    398             j ++;
    399         }
    400         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    401         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    402                            u_isUAlphabetic(c),
    403                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    404                            u_isalnum(c),
    405                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    406                                                   u_charType(c),
    407                                                   U_SHORT_PROPERTY_NAME),
    408                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    409                                                   u_getIntPropertyValue(c,
    410                                                           UCHAR_WORD_BREAK),
    411                                                   U_SHORT_PROPERTY_NAME),
    412                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    413                                    u_getIntPropertyValue(c,
    414                                            UCHAR_SENTENCE_BREAK),
    415                                    U_SHORT_PROPERTY_NAME),
    416                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    417                                    u_getIntPropertyValue(c,
    418                                            UCHAR_LINE_BREAK),
    419                                    U_SHORT_PROPERTY_NAME),
    420                            name);
    421     }
    422 }
    423 
    424 void RBBITest::TestThaiLineBreak() {
    425     UErrorCode status = U_ZERO_ERROR;
    426     BITestData thaiLineSelection(status);
    427 
    428     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    429     // represents elided letters at the end of a long word.  It should be bound to
    430     // the end of the word and not treated as an independent punctuation mark.
    431 
    432 
    433     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    434     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    435     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    436     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    437     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    438 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    439 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    440     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    441     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    442     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    443     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    444     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    445     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    446     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    447     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    448 
    449     // the one time where the paiyannoi occurs somewhere other than at the end
    450     // of a word is in the Thai abbrevation for "etc.", which both begins and
    451     // ends with a paiyannoi
    452     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    453     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    454     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    455 
    456     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    457         Locale("th"), status);
    458     if (U_FAILURE(status))
    459     {
    460         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    461         return;
    462     }
    463 
    464     generalIteratorTest(*e, thaiLineSelection);
    465     delete e;
    466 }
    467 
    468 
    469 
    470 void RBBITest::TestMixedThaiLineBreak()
    471 {
    472     UErrorCode   status = U_ZERO_ERROR;
    473     BITestData   thaiLineSelection(status);
    474 
    475     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    476 
    477 
    478     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    479     // start
    480 
    481     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    482     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    483     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    484     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    485     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    486     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    487     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    488     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    489     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    490     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    491     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    492     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    493     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    494     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    495     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    496     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    497 
    498     // @suwit - end of changes
    499 
    500 
    501     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    502     if (U_FAILURE(status))
    503     {
    504         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    505         return;
    506     }
    507 
    508 
    509     generalIteratorTest(*e, thaiLineSelection);
    510     delete e;
    511 }
    512 
    513 
    514 void RBBITest::TestMaiyamok()
    515 {
    516     UErrorCode status = U_ZERO_ERROR;
    517     BITestData   thaiLineSelection(status);
    518     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    519     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    520     // word".  Instead of appearing as a word unto itself, however, it's kept together
    521     // with the word before it
    522     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    523     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    524     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    525     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    526     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    527     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    528     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    529     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    530     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    531 
    532     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    533         Locale("th"), status);
    534 
    535     if (U_FAILURE(status))
    536     {
    537         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    538         return;
    539     }
    540     generalIteratorTest(*e, thaiLineSelection);
    541     delete e;
    542 }
    543 
    544 
    545 
    546 void RBBITest::TestBug3818() {
    547     UErrorCode  status = U_ZERO_ERROR;
    548 
    549     // Four Thai words...
    550     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    551                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    552     UnicodeString  thaiStr(thaiWordData);
    553 
    554     RuleBasedBreakIterator* bi =
    555         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    556     if (U_FAILURE(status) || bi == NULL) {
    557         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    558         return;
    559     }
    560     bi->setText(thaiStr);
    561 
    562     int32_t  startOfSecondWord = bi->following(1);
    563     if (startOfSecondWord != 4) {
    564         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    565             __FILE__, __LINE__, startOfSecondWord);
    566     }
    567     startOfSecondWord = bi->following(0);
    568     if (startOfSecondWord != 4) {
    569         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    570             __FILE__, __LINE__, startOfSecondWord);
    571     }
    572     delete bi;
    573 }
    574 
    575 
    576 void RBBITest::TestJapaneseWordBreak() {
    577 // TODO: Rewrite this test for a dictionary-based word breaking.
    578 #if 0
    579     UErrorCode status = U_ZERO_ERROR;
    580     BITestData   japaneseWordSelection(status);
    581 
    582     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    583     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    584     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    585     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    586     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    587     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    588     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
    589 
    590     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
    591         Locale("ja"), status);
    592     if (U_FAILURE(status))
    593     {
    594         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
    595         return;
    596     }
    597 
    598     generalIteratorTest(*e, japaneseWordSelection);
    599     delete e;
    600 #endif
    601 }
    602 
    603 void RBBITest::TestTrieDict() {
    604     UErrorCode      status  = U_ZERO_ERROR;
    605 
    606     //
    607     //  Open and read the test data file.
    608     //
    609     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    610     char testFileName[1000];
    611     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    612         errln("Can't open test data.  Path too long.");
    613         return;
    614     }
    615     strcpy(testFileName, testDataDirectory);
    616     strcat(testFileName, "riwords.txt");
    617 
    618     // Items needing deleting at the end
    619     MutableTrieDictionary *mutableDict = NULL;
    620     CompactTrieDictionary *compactDict = NULL;
    621     UnicodeSet            *breaks      = NULL;
    622     UChar                 *testFile    = NULL;
    623     StringEnumeration     *enumer1     = NULL;
    624     StringEnumeration     *enumer2     = NULL;
    625     MutableTrieDictionary *mutable2    = NULL;
    626     StringEnumeration     *cloneEnum   = NULL;
    627     CompactTrieDictionary *compact2    = NULL;
    628 
    629 
    630     const UnicodeString *originalWord = NULL;
    631     const UnicodeString *cloneWord    = NULL;
    632     UChar *current;
    633     UChar *word;
    634     UChar uc;
    635     int32_t wordLen;
    636     int32_t wordCount;
    637     int32_t testCount;
    638 
    639     int    len;
    640     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    641     if (U_FAILURE(status)) {
    642         goto cleanup; /* something went wrong, error already output */
    643     }
    644 
    645     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    646     if (U_FAILURE(status)) {
    647         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    648         goto cleanup;
    649     }
    650 
    651     breaks = new UnicodeSet;
    652     breaks->add(0x000A);     // Line Feed
    653     breaks->add(0x000D);     // Carriage Return
    654     breaks->add(0x2028);     // Line Separator
    655     breaks->add(0x2029);     // Paragraph Separator
    656 
    657     // Now add each non-comment line of the file as a word.
    658     current = testFile;
    659     word = current;
    660     uc = *current++;
    661     wordLen = 0;
    662     wordCount = 0;
    663 
    664     while (uc) {
    665         if (uc == 0x0023) {     // #comment line, skip
    666             while (uc && !breaks->contains(uc)) {
    667                 uc = *current++;
    668             }
    669         }
    670         else while (uc && !breaks->contains(uc)) {
    671             ++wordLen;
    672             uc = *current++;
    673         }
    674         if (wordLen > 0) {
    675             mutableDict->addWord(word, wordLen, status);
    676             if (U_FAILURE(status)) {
    677                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    678                 goto cleanup;
    679             }
    680             wordCount += 1;
    681         }
    682 
    683         // Find beginning of next line
    684         while (uc && breaks->contains(uc)) {
    685             uc = *current++;
    686         }
    687         word = current-1;
    688         wordLen = 0;
    689     }
    690 
    691     if (wordCount < 50) {
    692         errln("Word count (%d) unreasonably small\n", wordCount);
    693         goto cleanup;
    694     }
    695 
    696     enumer1 = mutableDict->openWords(status);
    697     if (U_FAILURE(status)) {
    698         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    699         goto cleanup;
    700     }
    701 
    702     testCount = 0;
    703     if (wordCount != (testCount = enumer1->count(status))) {
    704         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    705             testCount, wordCount, u_errorName(status));
    706         goto cleanup;
    707     }
    708 
    709     // Now compact it
    710     compactDict = new CompactTrieDictionary(*mutableDict, status);
    711     if (U_FAILURE(status)) {
    712         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    713         goto cleanup;
    714     }
    715 
    716     enumer2 = compactDict->openWords(status);
    717     if (U_FAILURE(status)) {
    718         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    719         goto cleanup;
    720     }
    721 
    722     if (wordCount != (testCount = enumer2->count(status))) {
    723         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    724             testCount, wordCount, u_errorName(status));
    725         goto cleanup;
    726     }
    727 
    728     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
    729         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
    730     }
    731     delete enumer1;
    732     enumer1 = NULL;
    733     delete enumer2;
    734     enumer2 = NULL;
    735 
    736     // Now un-compact it
    737     mutable2 = compactDict->cloneMutable(status);
    738     if (U_FAILURE(status)) {
    739         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    740         goto cleanup;
    741     }
    742 
    743     cloneEnum = mutable2->openWords(status);
    744     if (U_FAILURE(status)) {
    745         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    746         goto cleanup;
    747     }
    748 
    749     if (wordCount != (testCount = cloneEnum->count(status))) {
    750         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    751             testCount, wordCount, u_errorName(status));
    752         goto cleanup;
    753     }
    754 
    755     // Compact original dictionary to clone. Note that we can only compare the same kind of
    756     // dictionary as the order of the enumerators is not guaranteed to be the same between
    757     // different kinds
    758     enumer1 = mutableDict->openWords(status);
    759     if (U_FAILURE(status)) {
    760         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    761         goto cleanup;
    762      }
    763 
    764     originalWord = enumer1->snext(status);
    765     cloneWord = cloneEnum->snext(status);
    766     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    767         if (*originalWord != *cloneWord) {
    768             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    769             goto cleanup;
    770         }
    771         originalWord = enumer1->snext(status);
    772         cloneWord = cloneEnum->snext(status);
    773     }
    774 
    775     if (U_FAILURE(status)) {
    776         errln("Enumeration failed: %s\n", u_errorName(status));
    777         goto cleanup;
    778     }
    779 
    780     if (originalWord != cloneWord) {
    781         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    782         goto cleanup;
    783     }
    784 
    785     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    786     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    787     if (U_FAILURE(status)) {
    788         errln("CompactTrieDictionary(const void *,...) failed\n");
    789         goto cleanup;
    790     }
    791 
    792     if (compact2->dataSize() == 0) {
    793         errln("CompactTrieDictionary->dataSize() == 0\n");
    794         goto cleanup;
    795     }
    796 
    797     // Now count the words via the second dictionary
    798     delete enumer1;
    799     enumer1 = compact2->openWords(status);
    800     if (U_FAILURE(status)) {
    801         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    802         goto cleanup;
    803     }
    804 
    805     if (wordCount != (testCount = enumer1->count(status))) {
    806         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    807             testCount, wordCount, u_errorName(status));
    808         goto cleanup;
    809     }
    810 
    811 cleanup:
    812     delete compactDict;
    813     delete mutableDict;
    814     delete breaks;
    815     delete[] testFile;
    816     delete enumer1;
    817     delete mutable2;
    818     delete cloneEnum;
    819     delete compact2;
    820 }
    821 
    822 /*TODO: delete later*/
    823 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
    824     UErrorCode      status  = U_ZERO_ERROR;
    825     FILE *outfile = fopen(filename,"w");
    826     UConverter *cvt = ucnv_open("UTF-8", &status);
    827     if (U_FAILURE(status))
    828         return;
    829     if(outfile != NULL){
    830         status = U_ZERO_ERROR;
    831         const UnicodeString *word = enumer->snext(status);
    832         while (word != NULL && U_SUCCESS(status)) {
    833             char u8word[500];
    834             status = U_ZERO_ERROR;
    835             ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
    836                     &status);
    837             fprintf(outfile,"%s\n", u8word);
    838             status = U_ZERO_ERROR;
    839             word = enumer->snext(status);
    840         }
    841         fclose(outfile);
    842     }
    843     ucnv_close(cvt);
    844 }
    845 
    846 // A very simple helper class to streamline the buffer handling in
    847 // TestTrieDictWithValue
    848 template<class T, size_t N>
    849 class AutoBuffer {
    850  public:
    851   AutoBuffer(size_t size) : buffer(stackBuffer) {
    852     if (size > N)
    853       buffer = new T[size];
    854   }
    855   ~AutoBuffer() {
    856     if (buffer != stackBuffer)
    857       delete [] buffer;
    858   }
    859   T* elems() {
    860     return buffer;
    861   }
    862   const T& operator[] (size_t i) const {
    863     return buffer[i];
    864   }
    865   T& operator[] (size_t i) {
    866     return buffer[i];
    867   }
    868  private:
    869   T stackBuffer[N];
    870   T* buffer;
    871   AutoBuffer();
    872 };
    873 
    874 //----------------------------------------------------------------------------
    875 //
    876 // TestTrieDictWithValue    Test trie dictionaries with logprob values and
    877 // more than 2^16 nodes after compaction.
    878 //
    879 //----------------------------------------------------------------------------
    880 void RBBITest::TestTrieDictWithValue() {
    881     UErrorCode      status  = U_ZERO_ERROR;
    882 
    883     //
    884     //  Open and read the test data file.
    885     //
    886     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    887     const char *filename = "cjdict-truncated.txt";
    888     char testFileName[1000];
    889     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
    890         errln("Can't open test data.  Path too long.");
    891         return;
    892     }
    893     strcpy(testFileName, testDataDirectory);
    894     strcat(testFileName, filename);
    895 
    896     // Items needing deleting at the end
    897     MutableTrieDictionary *mutableDict = NULL;
    898     CompactTrieDictionary *compactDict = NULL;
    899     UnicodeSet            *breaks      = NULL;
    900     UChar                 *testFile    = NULL;
    901     StringEnumeration     *enumer1     = NULL;
    902     StringEnumeration     *enumer2     = NULL;
    903     MutableTrieDictionary *mutable2    = NULL;
    904     StringEnumeration     *cloneEnum   = NULL;
    905     CompactTrieDictionary *compact2    = NULL;
    906     NumberFormat          *nf           = NULL;
    907     UText *originalText = NULL, *cloneText = NULL;
    908 
    909     const UnicodeString *originalWord = NULL;
    910     const UnicodeString *cloneWord    = NULL;
    911     UChar *current;
    912     UChar *word;
    913     UChar uc;
    914     int32_t wordLen;
    915     int32_t wordCount;
    916     int32_t testCount;
    917     int32_t valueLen;
    918     int counter = 0;
    919 
    920     int    len;
    921     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    922     if (U_FAILURE(status)) {
    923         goto cleanup; /* something went wrong, error already output */
    924     }
    925 
    926     mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
    927     if (U_FAILURE(status)) {
    928         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    929         goto cleanup;
    930     }
    931 
    932     breaks = new UnicodeSet;
    933     breaks->add(0x000A);     // Line Feed
    934     breaks->add(0x000D);     // Carriage Return
    935     breaks->add(0x2028);     // Line Separator
    936     breaks->add(0x2029);     // Paragraph Separator
    937     breaks->add(0x0009);     // Tab character
    938 
    939     // Now add each non-comment line of the file as a word.
    940     current = testFile;
    941     word = current;
    942     uc = *current++;
    943     wordLen = 0;
    944     wordCount = 0;
    945     nf = NumberFormat::createInstance(status);
    946 
    947     while (uc) {
    948         UnicodeString ucharValue;
    949         valueLen = 0;
    950 
    951         if (uc == 0x0023) {     // #comment line, skip
    952             while (uc && !breaks->contains(uc)) {
    953                 uc = *current++;
    954             }
    955         }
    956         else{
    957             while (uc && !breaks->contains(uc)) {
    958                 ++wordLen;
    959                 uc = *current++;
    960             }
    961             if(uc == 0x0009){ //separator is a tab char, read in num after tab
    962                 uc = *current++;
    963                 while (uc && !breaks->contains(uc)) {
    964                     ucharValue.append(uc);
    965                     uc = *current++;
    966                 }
    967             }
    968         }
    969         if (wordLen > 0) {
    970             Formattable value((int32_t)0);
    971             nf->parse(ucharValue.getTerminatedBuffer(), value, status);
    972 
    973             if(U_FAILURE(status)){
    974                 errln("parsing of value failed when reading in dictionary\n");
    975                 goto cleanup;
    976             }
    977             mutableDict->addWord(word, wordLen, status, value.getLong());
    978             if (U_FAILURE(status)) {
    979                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    980                 goto cleanup;
    981             }
    982             wordCount += 1;
    983         }
    984 
    985         // Find beginning of next line
    986         while (uc && breaks->contains(uc)) {
    987             uc = *current++;
    988         }
    989         word = current-1;
    990         wordLen = 0;
    991     }
    992 
    993     if (wordCount < 50) {
    994         errln("Word count (%d) unreasonably small\n", wordCount);
    995         goto cleanup;
    996     }
    997 
    998     enumer1 = mutableDict->openWords(status);
    999     if (U_FAILURE(status)) {
   1000         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
   1001         goto cleanup;
   1002     }
   1003 
   1004     testCount = 0;
   1005     if (wordCount != (testCount = enumer1->count(status))) {
   1006         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   1007                 testCount, wordCount, u_errorName(status));
   1008         goto cleanup;
   1009     }
   1010 
   1011     // Now compact it
   1012     compactDict = new CompactTrieDictionary(*mutableDict, status);
   1013     if (U_FAILURE(status)) {
   1014         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
   1015         goto cleanup;
   1016     }
   1017 
   1018     enumer2 = compactDict->openWords(status);
   1019     if (U_FAILURE(status)) {
   1020         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
   1021         goto cleanup;
   1022     }
   1023 
   1024 
   1025     //delete later
   1026 //    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
   1027 //    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
   1028 
   1029     enumer1->reset(status);
   1030     enumer2->reset(status);
   1031 
   1032     originalWord = enumer1->snext(status);
   1033     cloneWord = enumer2->snext(status);
   1034     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   1035         if (*originalWord != *cloneWord) {
   1036             errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
   1037                     counter, originalWord->length(), cloneWord->length());
   1038             goto cleanup;
   1039         }
   1040 
   1041         // check if attached values of the same word in both dictionaries tally
   1042 #if 0
   1043         int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
   1044         uint16_t values1[originalWord->length()], values2[cloneWord->length()];
   1045 #endif
   1046         AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   1047         AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   1048         AutoBuffer<uint16_t, 20> values1(originalWord->length());
   1049         AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   1050 
   1051         originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   1052         cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   1053 
   1054         int count1, count2;
   1055         mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   1056         compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   1057 
   1058         if(values1[count1-1] != values2[count2-1]){
   1059             errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
   1060                   counter, values1[count1-1], values2[count2-1]);
   1061             goto cleanup;
   1062         }
   1063 
   1064         counter++;
   1065         originalWord = enumer1->snext(status);
   1066         cloneWord = enumer2->snext(status);
   1067     }
   1068     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
   1069         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
   1070     }
   1071 
   1072     delete enumer1;
   1073     enumer1 = NULL;
   1074     delete enumer2;
   1075     enumer2 = NULL;
   1076 
   1077     // Now un-compact it
   1078     mutable2 = compactDict->cloneMutable(status);
   1079     if (U_FAILURE(status)) {
   1080         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
   1081         goto cleanup;
   1082     }
   1083 
   1084     cloneEnum = mutable2->openWords(status);
   1085     if (U_FAILURE(status)) {
   1086         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
   1087         goto cleanup;
   1088     }
   1089 
   1090     if (wordCount != (testCount = cloneEnum->count(status))) {
   1091         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   1092                 testCount, wordCount, u_errorName(status));
   1093         goto cleanup;
   1094     }
   1095 
   1096     // Compact original dictionary to clone. Note that we can only compare the same kind of
   1097     // dictionary as the order of the enumerators is not guaranteed to be the same between
   1098     // different kinds
   1099     enumer1 = mutableDict->openWords(status);
   1100     if (U_FAILURE(status)) {
   1101         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
   1102         goto cleanup;
   1103     }
   1104 
   1105     counter = 0;
   1106     originalWord = enumer1->snext(status);
   1107     cloneWord = cloneEnum->snext(status);
   1108     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   1109         if (*originalWord != *cloneWord) {
   1110             errln("Original and cloned MutableTrieDictionary word mismatch\n");
   1111             goto cleanup;
   1112         }
   1113 
   1114         // check if attached values of the same word in both dictionaries tally
   1115         AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   1116         AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   1117         AutoBuffer<uint16_t, 20> values1(originalWord->length());
   1118         AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   1119         originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   1120         cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   1121 
   1122         int count1, count2;
   1123         mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   1124         mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   1125 
   1126         if(values1[count1-1] != values2[count2-1]){
   1127             errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
   1128                   counter, values1[count1-1], values2[count2-1]);
   1129             goto cleanup;
   1130         }
   1131 
   1132         counter++;
   1133 
   1134         originalWord = enumer1->snext(status);
   1135         cloneWord = cloneEnum->snext(status);
   1136     }
   1137 
   1138     if (U_FAILURE(status)) {
   1139         errln("Enumeration failed: %s\n", u_errorName(status));
   1140         goto cleanup;
   1141     }
   1142 
   1143     if (originalWord != cloneWord) {
   1144         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
   1145         goto cleanup;
   1146     }
   1147 
   1148     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
   1149     compact2 = new CompactTrieDictionary(compactDict->data(), status);
   1150     if (U_FAILURE(status)) {
   1151         errln("CompactTrieDictionary(const void *,...) failed\n");
   1152         goto cleanup;
   1153     }
   1154 
   1155     if (compact2->dataSize() == 0) {
   1156         errln("CompactTrieDictionary->dataSize() == 0\n");
   1157         goto cleanup;
   1158     }
   1159 
   1160     // Now count the words via the second dictionary
   1161     delete enumer1;
   1162     enumer1 = compact2->openWords(status);
   1163     if (U_FAILURE(status)) {
   1164         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
   1165         goto cleanup;
   1166     }
   1167 
   1168     if (wordCount != (testCount = enumer1->count(status))) {
   1169         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
   1170                 testCount, wordCount, u_errorName(status));
   1171         goto cleanup;
   1172     }
   1173 
   1174     cleanup:
   1175     delete compactDict;
   1176     delete mutableDict;
   1177     delete breaks;
   1178     delete[] testFile;
   1179     delete enumer1;
   1180     delete mutable2;
   1181     delete cloneEnum;
   1182     delete compact2;
   1183     utext_close(originalText);
   1184     utext_close(cloneText);
   1185 
   1186 
   1187 }
   1188 
   1189 //----------------------------------------------------------------------------
   1190 //
   1191 // generalIteratorTest      Given a break iterator and a set of test data,
   1192 //                          Run the tests and report the results.
   1193 //
   1194 //----------------------------------------------------------------------------
   1195 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
   1196 {
   1197 
   1198     bi.setText(td.fDataToBreak);
   1199 
   1200     testFirstAndNext(bi, td);
   1201 
   1202     testLastAndPrevious(bi, td);
   1203 
   1204     testFollowing(bi, td);
   1205     testPreceding(bi, td);
   1206     testIsBoundary(bi, td);
   1207     doMultipleSelectionTest(bi, td);
   1208 }
   1209 
   1210 
   1211 //
   1212 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
   1213 //                       kind of loop.
   1214 //
   1215 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
   1216 {
   1217     UErrorCode  status = U_ZERO_ERROR;
   1218     int32_t     p;
   1219     int32_t     lastP = -1;
   1220     int32_t     tag;
   1221 
   1222     logln("Test first and next");
   1223     bi.setText(td.fDataToBreak);
   1224     td.clearResults();
   1225 
   1226     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
   1227         td.fActualBreakPositions.addElement(p, status);  // Save result.
   1228         tag = bi.getRuleStatus();
   1229         td.fActualTags.addElement(tag, status);
   1230         if (p <= lastP) {
   1231             // If the iterator is not making forward progress, stop.
   1232             //  No need to raise an error here, it'll be detected in the normal check of results.
   1233             break;
   1234         }
   1235         lastP = p;
   1236     }
   1237     td.checkResults("testFirstAndNext", this);
   1238 }
   1239 
   1240 
   1241 //
   1242 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
   1243 //
   1244 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
   1245 {
   1246     UErrorCode  status = U_ZERO_ERROR;
   1247     int32_t     p;
   1248     int32_t     lastP  = 0x7ffffffe;
   1249     int32_t     tag;
   1250 
   1251     logln("Test last and previous");
   1252     bi.setText(td.fDataToBreak);
   1253     td.clearResults();
   1254 
   1255     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
   1256         // Save break position.  Insert it at start of vector of results, shoving
   1257         //    already-saved results further towards the end.
   1258         td.fActualBreakPositions.insertElementAt(p, 0, status);
   1259         // bi.previous();   // TODO:  Why does this fix things up????
   1260         // bi.next();
   1261         tag = bi.getRuleStatus();
   1262         td.fActualTags.insertElementAt(tag, 0, status);
   1263         if (p >= lastP) {
   1264             // If the iterator is not making progress, stop.
   1265             //  No need to raise an error here, it'll be detected in the normal check of results.
   1266             break;
   1267         }
   1268         lastP = p;
   1269     }
   1270     td.checkResults("testLastAndPrevious", this);
   1271 }
   1272 
   1273 
   1274 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
   1275 {
   1276     UErrorCode  status = U_ZERO_ERROR;
   1277     int32_t     p;
   1278     int32_t     tag;
   1279     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
   1280                                  //   cannot be -1; that is returned for DONE.
   1281     int         i;
   1282 
   1283     logln("testFollowing():");
   1284     bi.setText(td.fDataToBreak);
   1285     td.clearResults();
   1286 
   1287     // Save the starting point, since we won't get that out of following.
   1288     p = bi.first();
   1289     td.fActualBreakPositions.addElement(p, status);  // Save result.
   1290     tag = bi.getRuleStatus();
   1291     td.fActualTags.addElement(tag, status);
   1292 
   1293     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
   1294         p = bi.following(i);
   1295         if (p != lastP) {
   1296             if (p == RuleBasedBreakIterator::DONE) {
   1297                 break;
   1298             }
   1299             // We've reached a new break position.  Save it.
   1300             td.fActualBreakPositions.addElement(p, status);  // Save result.
   1301             tag = bi.getRuleStatus();
   1302             td.fActualTags.addElement(tag, status);
   1303             lastP = p;
   1304         }
   1305     }
   1306     // The loop normally exits by means of the break in the middle.
   1307     // Make sure that the index was at the correct position for the break iterator to have
   1308     //   returned DONE.
   1309     if (i != td.fDataToBreak.length()) {
   1310         errln("testFollowing():  iterator returned DONE prematurely.");
   1311     }
   1312 
   1313     // Full check of all results.
   1314     td.checkResults("testFollowing", this);
   1315 }
   1316 
   1317 
   1318 
   1319 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
   1320     UErrorCode  status = U_ZERO_ERROR;
   1321     int32_t     p;
   1322     int32_t     tag;
   1323     int32_t     lastP  = 0x7ffffffe;
   1324     int         i;
   1325 
   1326     logln("testPreceding():");
   1327     bi.setText(td.fDataToBreak);
   1328     td.clearResults();
   1329 
   1330     p = bi.last();
   1331     td.fActualBreakPositions.addElement(p, status);
   1332     tag = bi.getRuleStatus();
   1333     td.fActualTags.addElement(tag, status);
   1334 
   1335     for (i = td.fDataToBreak.length(); i>=-1; i--) {
   1336         p = bi.preceding(i);
   1337         if (p != lastP) {
   1338             if (p == RuleBasedBreakIterator::DONE) {
   1339                 break;
   1340             }
   1341             // We've reached a new break position.  Save it.
   1342             td.fActualBreakPositions.insertElementAt(p, 0, status);
   1343             lastP = p;
   1344             tag = bi.getRuleStatus();
   1345             td.fActualTags.insertElementAt(tag, 0, status);
   1346         }
   1347     }
   1348     // The loop normally exits by means of the break in the middle.
   1349     // Make sure that the index was at the correct position for the break iterator to have
   1350     //   returned DONE.
   1351     if (i != 0) {
   1352         errln("testPreceding():  iterator returned DONE prematurely.");
   1353     }
   1354 
   1355     // Full check of all results.
   1356     td.checkResults("testPreceding", this);
   1357 }
   1358 
   1359 
   1360 
   1361 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1362     UErrorCode  status = U_ZERO_ERROR;
   1363     int         i;
   1364     int32_t     tag;
   1365 
   1366     logln("testIsBoundary():");
   1367     bi.setText(td.fDataToBreak);
   1368     td.clearResults();
   1369 
   1370     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1371         if (bi.isBoundary(i)) {
   1372             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1373             tag = bi.getRuleStatus();
   1374             td.fActualTags.addElement(tag, status);
   1375         }
   1376     }
   1377     td.checkResults("testIsBoundary: ", this);
   1378 }
   1379 
   1380 
   1381 
   1382 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1383 {
   1384     iterator.setText(td.fDataToBreak);
   1385 
   1386     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1387     int32_t offset = iterator.first();
   1388     int32_t testOffset;
   1389     int32_t count = 0;
   1390 
   1391     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1392 
   1393     if (*testIterator != iterator)
   1394         errln("clone() or operator!= failed: two clones compared unequal");
   1395 
   1396     do {
   1397         testOffset = testIterator->first();
   1398         testOffset = testIterator->next(count);
   1399         if (offset != testOffset)
   1400             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1401 
   1402         if (offset != RuleBasedBreakIterator::DONE) {
   1403             count++;
   1404             offset = iterator.next();
   1405 
   1406             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1407                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1408                 if (count > 10000 || offset == -1) {
   1409                     errln("operator== failed too many times. Stopping test.");
   1410                     if (offset == -1) {
   1411                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1412                     }
   1413                     return;
   1414                 }
   1415             }
   1416         }
   1417     } while (offset != RuleBasedBreakIterator::DONE);
   1418 
   1419     // now do it backwards...
   1420     offset = iterator.last();
   1421     count = 0;
   1422 
   1423     do {
   1424         testOffset = testIterator->last();
   1425         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1426         if (offset != testOffset)
   1427             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1428 
   1429         if (offset != RuleBasedBreakIterator::DONE) {
   1430             count--;
   1431             offset = iterator.previous();
   1432         }
   1433     } while (offset != RuleBasedBreakIterator::DONE);
   1434 
   1435     delete testIterator;
   1436 }
   1437 
   1438 
   1439 //---------------------------------------------
   1440 //
   1441 //     other tests
   1442 //
   1443 //---------------------------------------------
   1444 void RBBITest::TestEmptyString()
   1445 {
   1446     UnicodeString text = "";
   1447     UErrorCode status = U_ZERO_ERROR;
   1448 
   1449     BITestData x(status);
   1450     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1451     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1452     if (U_FAILURE(status))
   1453     {
   1454         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1455         return;
   1456     }
   1457     generalIteratorTest(*bi, x);
   1458     delete bi;
   1459 }
   1460 
   1461 void RBBITest::TestGetAvailableLocales()
   1462 {
   1463     int32_t locCount = 0;
   1464     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1465 
   1466     if (locCount == 0)
   1467         dataerrln("getAvailableLocales() returned an empty list!");
   1468     // Just make sure that it's returning good memory.
   1469     int32_t i;
   1470     for (i = 0; i < locCount; ++i) {
   1471         logln(locList[i].getName());
   1472     }
   1473 }
   1474 
   1475 //Testing the BreakIterator::getDisplayName() function
   1476 void RBBITest::TestGetDisplayName()
   1477 {
   1478     UnicodeString   result;
   1479 
   1480     BreakIterator::getDisplayName(Locale::getUS(), result);
   1481     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1482         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1483                 + result);
   1484 
   1485     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1486     if (result != "French (France)")
   1487         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1488                 + result);
   1489 }
   1490 /**
   1491  * Test End Behaviour
   1492  * @bug 4068137
   1493  */
   1494 void RBBITest::TestEndBehaviour()
   1495 {
   1496     UErrorCode status = U_ZERO_ERROR;
   1497     UnicodeString testString("boo.");
   1498     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1499     if (U_FAILURE(status))
   1500     {
   1501         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1502         return;
   1503     }
   1504     wb->setText(testString);
   1505 
   1506     if (wb->first() != 0)
   1507         errln("Didn't get break at beginning of string.");
   1508     if (wb->next() != 3)
   1509         errln("Didn't get break before period in \"boo.\"");
   1510     if (wb->current() != 4 && wb->next() != 4)
   1511         errln("Didn't get break at end of string.");
   1512     delete wb;
   1513 }
   1514 /*
   1515  * @bug 4153072
   1516  */
   1517 void RBBITest::TestBug4153072() {
   1518     UErrorCode status = U_ZERO_ERROR;
   1519     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1520     if (U_FAILURE(status))
   1521     {
   1522         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1523         return;
   1524     }
   1525     UnicodeString str("...Hello, World!...");
   1526     int32_t begin = 3;
   1527     int32_t end = str.length() - 3;
   1528     UBool onBoundary;
   1529 
   1530     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1531     iter->adoptText(textIterator);
   1532     int index;
   1533     // Note: with the switch to UText, there is no way to restrict the
   1534     //       iteration range to begin at an index other than zero.
   1535     //       String character iterators created with a non-zero bound are
   1536     //         treated by RBBI as being empty.
   1537     for (index = -1; index < begin + 1; ++index) {
   1538         onBoundary = iter->isBoundary(index);
   1539         if (index == 0?  !onBoundary : onBoundary) {
   1540             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1541                             " and begin index = " + begin);
   1542         }
   1543     }
   1544     delete iter;
   1545 }
   1546 
   1547 
   1548 //
   1549 // Test for problem reported by Ashok Matoria on 9 July 2007
   1550 //    One.<kSoftHyphen><kSpace>Two.
   1551 //
   1552 //    Sentence break at start (0) and then on calling next() it breaks at
   1553 //   'T' of "Two". Now, at this point if I do next() and
   1554 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1555 //
   1556 void RBBITest::TestBug5775() {
   1557     UErrorCode status = U_ZERO_ERROR;
   1558     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1559     TEST_ASSERT_SUCCESS(status);
   1560     if (U_FAILURE(status)) {
   1561         return;
   1562     }
   1563 // Check for status first for better handling of no data errors.
   1564     TEST_ASSERT(bi != NULL);
   1565     if (bi == NULL) {
   1566         return;
   1567     }
   1568 
   1569     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1570     //               01234      56789
   1571     s = s.unescape();
   1572     bi->setText(s);
   1573     int pos = bi->next();
   1574     TEST_ASSERT(pos == 6);
   1575     pos = bi->next();
   1576     TEST_ASSERT(pos == 10);
   1577     pos = bi->previous();
   1578     TEST_ASSERT(pos == 6);
   1579     delete bi;
   1580 }
   1581 
   1582 
   1583 
/**
 * Test Japanese Line Break
 *
 * NOTE: the entire body is compiled out with "#if 0", so this test
 * currently does nothing when it runs.
 *
 * The disabled code builds a three character string whose middle character
 * is replaced in turn by each character of two lists:
 *   - precedingChars: a line break is expected before the character
 *     (offset 1) and none after it.
 *   - followingChars: no line break is expected before the character,
 *     and one is expected after it (offset 2).
 * @bug 4095322
 */
void RBBITest::TestJapaneseLineBreak()
{
#if 0
    // Test needs updating some more...   Dump it for now.


    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
    //        as opening and closing punctuation for line breaking.
    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
    //        from these tests.    6-13-2002
    //
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
    // In each list below, the commented-out literal is the older character set;
    //   the live literal after it is the reduced set described above.
    UnicodeString precedingChars = CharsToUnicodeString(
        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
    UnicodeString followingChars = CharsToUnicodeString(
        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);

    int32_t i;
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
        return;
    }

    // Characters that must stay attached to what follows them:
    //   expect breaks at 0, 1 and 3.
    for (i = 0; i < precedingChars.length(); i++) {
        testString.setCharAt(1, precedingChars[i]);
        iter->setText(testString);
        int32_t j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 1)
            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
    }

    // Characters that must stay attached to what precedes them:
    //   expect breaks at 0, 2 and 3.
    for (i = 0; i < followingChars.length(); i++) {
        testString.setCharAt(1, followingChars[i]);
        iter->setText(testString);
        int j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 2)
            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
    }
    delete iter;
#endif
}
   1653 
   1654 
   1655 //------------------------------------------------------------------------------
   1656 //
   1657 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1658 //
   1659 //------------------------------------------------------------------------------
   1660 
// Everything executeTest() needs to run one <data> ... </data> chunk from the
//   test data file.  The three vectors are indexed by offset into dataToBreak.
struct TestParams {
    BreakIterator   *bi;              // Iterator under test.  executeTest() does nothing when NULL.
    UnicodeString    dataToBreak;     // The text to iterate over, with all markup removed.
    UVector32       *expectedBreaks;  // Per offset: 0 = no break expected, -1 = break expected
                                      //   with rule status 0, any other value = break expected
                                      //   with that rule status.
    UVector32       *srcLine;         // Per offset: line number in the test file (for error messages).
    UVector32       *srcCol;          // Per offset: column in the test file (for error messages).
};
   1668 
   1669 void RBBITest::executeTest(TestParams *t) {
   1670     int32_t    bp;
   1671     int32_t    prevBP;
   1672     int32_t    i;
   1673 
   1674     if (t->bi == NULL) {
   1675         return;
   1676     }
   1677 
   1678     t->bi->setText(t->dataToBreak);
   1679     //
   1680     //  Run the iterator forward
   1681     //
   1682     prevBP = -1;
   1683     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1684         if (prevBP ==  bp) {
   1685             // Fail for lack of forward progress.
   1686             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1687                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1688             break;
   1689         }
   1690 
   1691         // Check that there were we didn't miss an expected break between the last one
   1692         //  and this one.
   1693         for (i=prevBP+1; i<bp; i++) {
   1694             if (t->expectedBreaks->elementAti(i) != 0) {
   1695                 int expected[] = {0, i};
   1696                 printStringBreaks(t->dataToBreak, expected, 2);
   1697                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1698                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1699             }
   1700         }
   1701 
   1702         // Check that the break we did find was expected
   1703         if (t->expectedBreaks->elementAti(bp) == 0) {
   1704             int expected[] = {0, bp};
   1705             printStringBreaks(t->dataToBreak, expected, 2);
   1706             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1707                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1708         } else {
   1709             // The break was expected.
   1710             //   Check that the {nnn} tag value is correct.
   1711             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1712             if (expectedTagVal == -1) {
   1713                 expectedTagVal = 0;
   1714             }
   1715             int32_t line = t->srcLine->elementAti(bp);
   1716             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1717             if (rs != expectedTagVal) {
   1718                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1719                       "          Actual, Expected status = %4d, %4d",
   1720                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1721             }
   1722         }
   1723 
   1724 
   1725         prevBP = bp;
   1726     }
   1727 
   1728     // Verify that there were no missed expected breaks after the last one found
   1729     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1730         if (t->expectedBreaks->elementAti(i) != 0) {
   1731             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1732                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1733         }
   1734     }
   1735 
   1736     //
   1737     //  Run the iterator backwards, verify that the same breaks are found.
   1738     //
   1739     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1740     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1741         if (prevBP ==  bp) {
   1742             // Fail for lack of progress.
   1743             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1744                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1745             break;
   1746         }
   1747 
   1748         // Check that there were we didn't miss an expected break between the last one
   1749         //  and this one.  (UVector returns zeros for index out of bounds.)
   1750         for (i=prevBP-1; i>bp; i--) {
   1751             if (t->expectedBreaks->elementAti(i) != 0) {
   1752                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1753                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1754             }
   1755         }
   1756 
   1757         // Check that the break we did find was expected
   1758         if (t->expectedBreaks->elementAti(bp) == 0) {
   1759             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1760                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1761         } else {
   1762             // The break was expected.
   1763             //   Check that the {nnn} tag value is correct.
   1764             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1765             if (expectedTagVal == -1) {
   1766                 expectedTagVal = 0;
   1767             }
   1768             int line = t->srcLine->elementAti(bp);
   1769             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1770             if (rs != expectedTagVal) {
   1771                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1772                       "          Actual, Expected status = %4d, %4d",
   1773                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1774             }
   1775         }
   1776 
   1777         prevBP = bp;
   1778     }
   1779 
   1780     // Verify that there were no missed breaks prior to the last one found
   1781     for (i=prevBP-1; i>=0; i--) {
   1782         if (t->expectedBreaks->elementAti(i) != 0) {
   1783             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1784                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1785         }
   1786     }
   1787 }
   1788 
   1789 
   1790 void RBBITest::TestExtended() {
   1791 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1792     UErrorCode      status  = U_ZERO_ERROR;
   1793     Locale          locale("");
   1794 
   1795     UnicodeString       rules;
   1796     TestParams          tp;
   1797     tp.bi             = NULL;
   1798     tp.expectedBreaks = new UVector32(status);
   1799     tp.srcLine        = new UVector32(status);
   1800     tp.srcCol         = new UVector32(status);
   1801 
   1802     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1803     if (U_FAILURE(status)) {
   1804         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1805     }
   1806 
   1807 
   1808     //
   1809     //  Open and read the test data file.
   1810     //
   1811     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1812     char testFileName[1000];
   1813     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1814         errln("Can't open test data.  Path too long.");
   1815         return;
   1816     }
   1817     strcpy(testFileName, testDataDirectory);
   1818     strcat(testFileName, "rbbitst.txt");
   1819 
   1820     int    len;
   1821     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1822     if (U_FAILURE(status)) {
   1823         return; /* something went wrong, error already output */
   1824     }
   1825 
   1826 
   1827 
   1828 
   1829     //
   1830     //  Put the test data into a UnicodeString
   1831     //
   1832     UnicodeString testString(FALSE, testFile, len);
   1833 
   1834     enum EParseState{
   1835         PARSE_COMMENT,
   1836         PARSE_TAG,
   1837         PARSE_DATA,
   1838         PARSE_NUM
   1839     }
   1840     parseState = PARSE_TAG;
   1841 
   1842     EParseState savedState = PARSE_TAG;
   1843 
   1844     static const UChar CH_LF        = 0x0a;
   1845     static const UChar CH_CR        = 0x0d;
   1846     static const UChar CH_HASH      = 0x23;
   1847     /*static const UChar CH_PERIOD    = 0x2e;*/
   1848     static const UChar CH_LT        = 0x3c;
   1849     static const UChar CH_GT        = 0x3e;
   1850     static const UChar CH_BACKSLASH = 0x5c;
   1851     static const UChar CH_BULLET    = 0x2022;
   1852 
   1853     int32_t    lineNum  = 1;
   1854     int32_t    colStart = 0;
   1855     int32_t    column   = 0;
   1856     int32_t    charIdx  = 0;
   1857 
   1858     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1859 
   1860     for (charIdx = 0; charIdx < len; ) {
   1861         status = U_ZERO_ERROR;
   1862         UChar  c = testString.charAt(charIdx);
   1863         charIdx++;
   1864         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1865             // treat CRLF as a unit
   1866             c = CH_LF;
   1867             charIdx++;
   1868         }
   1869         if (c == CH_LF || c == CH_CR) {
   1870             lineNum++;
   1871             colStart = charIdx;
   1872         }
   1873         column = charIdx - colStart + 1;
   1874 
   1875         switch (parseState) {
   1876         case PARSE_COMMENT:
   1877             if (c == 0x0a || c == 0x0d) {
   1878                 parseState = savedState;
   1879             }
   1880             break;
   1881 
   1882         case PARSE_TAG:
   1883             {
   1884             if (c == CH_HASH) {
   1885                 parseState = PARSE_COMMENT;
   1886                 savedState = PARSE_TAG;
   1887                 break;
   1888             }
   1889             if (u_isUWhiteSpace(c)) {
   1890                 break;
   1891             }
   1892             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1893                 delete tp.bi;
   1894                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1895                 charIdx += 5;
   1896                 break;
   1897             }
   1898             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1899                 delete tp.bi;
   1900                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1901                 charIdx += 5;
   1902                 break;
   1903             }
   1904             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1905                 delete tp.bi;
   1906                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1907                 charIdx += 5;
   1908                 break;
   1909             }
   1910             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1911                 delete tp.bi;
   1912                 tp.bi = NULL;
   1913                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1914                 charIdx += 5;
   1915                 break;
   1916             }
   1917             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1918                 delete tp.bi;
   1919                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1920                 charIdx += 6;
   1921                 break;
   1922             }
   1923 
   1924             // <locale  loc_name>
   1925             localeMatcher.reset(testString);
   1926             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1927                 UnicodeString localeName = localeMatcher.group(1, status);
   1928                 char localeName8[100];
   1929                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1930                 locale = Locale::createFromName(localeName8);
   1931                 charIdx += localeMatcher.group(0, status).length();
   1932                 TEST_ASSERT_SUCCESS(status);
   1933                 break;
   1934             }
   1935             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1936                 parseState = PARSE_DATA;
   1937                 charIdx += 5;
   1938                 tp.dataToBreak = "";
   1939                 tp.expectedBreaks->removeAllElements();
   1940                 tp.srcCol ->removeAllElements();
   1941                 tp.srcLine->removeAllElements();
   1942                 break;
   1943             }
   1944 
   1945             errln("line %d: Tag expected in test file.", lineNum);
   1946             parseState = PARSE_COMMENT;
   1947             savedState = PARSE_DATA;
   1948             goto end_test; // Stop the test.
   1949             }
   1950             break;
   1951 
   1952         case PARSE_DATA:
   1953             if (c == CH_BULLET) {
   1954                 int32_t  breakIdx = tp.dataToBreak.length();
   1955                 tp.expectedBreaks->setSize(breakIdx+1);
   1956                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1957                 tp.srcLine->setSize(breakIdx+1);
   1958                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1959                 tp.srcCol ->setSize(breakIdx+1);
   1960                 tp.srcCol ->setElementAt(column, breakIdx);
   1961                 break;
   1962             }
   1963 
   1964             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1965                 // Add final entry to mappings from break location to source file position.
   1966                 //  Need one extra because last break position returned is after the
   1967                 //    last char in the data, not at the last char.
   1968                 tp.srcLine->addElement(lineNum, status);
   1969                 tp.srcCol ->addElement(column, status);
   1970 
   1971                 parseState = PARSE_TAG;
   1972                 charIdx += 6;
   1973 
   1974                 // RUN THE TEST!
   1975                 executeTest(&tp);
   1976                 break;
   1977             }
   1978 
   1979             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1980                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1981                 // Get the code point from the name and insert it into the test data.
   1982                 //   (Damn, no API takes names in Unicode  !!!
   1983                 //    we've got to take it back to char *)
   1984                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1985                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1986                 char charNameBuf[200];
   1987                 UChar32 theChar = -1;
   1988                 if (nameEndIdx != -1) {
   1989                     UErrorCode status = U_ZERO_ERROR;
   1990                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1991                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1992                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1993                     if (U_FAILURE(status)) {
   1994                         theChar = -1;
   1995                     }
   1996                 }
   1997                 if (theChar == -1) {
   1998                     errln("Error in named character in test file at line %d, col %d",
   1999                         lineNum, column);
   2000                 } else {
   2001                     // Named code point was recognized.  Insert it
   2002                     //   into the test data.
   2003                     tp.dataToBreak.append(theChar);
   2004                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   2005                         tp.srcLine->addElement(lineNum, status);
   2006                         tp.srcCol ->addElement(column, status);
   2007                     }
   2008                 }
   2009                 if (nameEndIdx > charIdx) {
   2010                     charIdx = nameEndIdx+1;
   2011 
   2012                 }
   2013                 break;
   2014             }
   2015 
   2016 
   2017 
   2018 
   2019             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   2020                 charIdx++;
   2021                 int32_t  breakIdx = tp.dataToBreak.length();
   2022                 tp.expectedBreaks->setSize(breakIdx+1);
   2023                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   2024                 tp.srcLine->setSize(breakIdx+1);
   2025                 tp.srcLine->setElementAt(lineNum, breakIdx);
   2026                 tp.srcCol ->setSize(breakIdx+1);
   2027                 tp.srcCol ->setElementAt(column, breakIdx);
   2028                 break;
   2029             }
   2030 
   2031             if (c == CH_LT) {
   2032                 tagValue   = 0;
   2033                 parseState = PARSE_NUM;
   2034                 break;
   2035             }
   2036 
   2037             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   2038                 parseState = PARSE_COMMENT;
   2039                 savedState = PARSE_DATA;
   2040                 break;
   2041             }
   2042 
   2043             if (c == CH_BACKSLASH) {
   2044                 // Check for \ at end of line, a line continuation.
   2045                 //     Advance over (discard) the newline
   2046                 UChar32 cp = testString.char32At(charIdx);
   2047                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   2048                     // We have a CR LF
   2049                     //  Need an extra increment of the input ptr to move over both of them
   2050                     charIdx++;
   2051                 }
   2052                 if (cp == CH_LF || cp == CH_CR) {
   2053                     lineNum++;
   2054                     colStart = charIdx;
   2055                     charIdx++;
   2056                     break;
   2057                 }
   2058 
   2059                 // Let unescape handle the back slash.
   2060                 cp = testString.unescapeAt(charIdx);
   2061                 if (cp != -1) {
   2062                     // Escape sequence was recognized.  Insert the char
   2063                     //   into the test data.
   2064                     tp.dataToBreak.append(cp);
   2065                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   2066                         tp.srcLine->addElement(lineNum, status);
   2067                         tp.srcCol ->addElement(column, status);
   2068                     }
   2069                     break;
   2070                 }
   2071 
   2072 
   2073                 // Not a recognized backslash escape sequence.
   2074                 // Take the next char as a literal.
   2075                 //  TODO:  Should this be an error?
   2076                 c = testString.charAt(charIdx);
   2077                 charIdx = testString.moveIndex32(charIdx, 1);
   2078             }
   2079 
   2080             // Normal, non-escaped data char.
   2081             tp.dataToBreak.append(c);
   2082 
   2083             // Save the mapping from offset in the data to line/column numbers in
   2084             //   the original input file.  Will be used for better error messages only.
   2085             //   If there's an expected break before this char, the slot in the mapping
   2086             //     vector will already be set for this char; don't overwrite it.
   2087             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   2088                 tp.srcLine->addElement(lineNum, status);
   2089                 tp.srcCol ->addElement(column, status);
   2090             }
   2091             break;
   2092 
   2093 
   2094         case PARSE_NUM:
   2095             // We are parsing an expected numeric tag value, like <1234>,
   2096             //   within a chunk of data.
   2097             if (u_isUWhiteSpace(c)) {
   2098                 break;
   2099             }
   2100 
   2101             if (c == CH_GT) {
   2102                 // Finished the number.  Add the info to the expected break data,
   2103                 //   and switch parse state back to doing plain data.
   2104                 parseState = PARSE_DATA;
   2105                 if (tagValue == 0) {
   2106                     tagValue = -1;
   2107                 }
   2108                 int32_t  breakIdx = tp.dataToBreak.length();
   2109                 tp.expectedBreaks->setSize(breakIdx+1);
   2110                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   2111                 tp.srcLine->setSize(breakIdx+1);
   2112                 tp.srcLine->setElementAt(lineNum, breakIdx);
   2113                 tp.srcCol ->setSize(breakIdx+1);
   2114                 tp.srcCol ->setElementAt(column, breakIdx);
   2115                 break;
   2116             }
   2117 
   2118             if (u_isdigit(c)) {
   2119                 tagValue = tagValue*10 + u_charDigitValue(c);
   2120                 break;
   2121             }
   2122 
   2123             errln("Syntax Error in test file at line %d, col %d",
   2124                 lineNum, column);
   2125             parseState = PARSE_COMMENT;
   2126             goto end_test; // Stop the test
   2127             break;
   2128         }
   2129 
   2130 
   2131         if (U_FAILURE(status)) {
   2132             errln("ICU Error %s while parsing test file at line %d.",
   2133                 u_errorName(status), lineNum);
   2134             status = U_ZERO_ERROR;
   2135             goto end_test; // Stop the test
   2136         }
   2137 
   2138     }
   2139 
   2140 end_test:
   2141     delete tp.bi;
   2142     delete tp.expectedBreaks;
   2143     delete tp.srcLine;
   2144     delete tp.srcCol;
   2145     delete [] testFile;
   2146 #endif
   2147 }
   2148 
   2149 void RBBITest::TestThaiBreaks() {
   2150     UErrorCode status=U_ZERO_ERROR;
   2151     BreakIterator* b;
   2152     Locale locale = Locale("th");
   2153     int32_t p, index;
   2154     UChar c[]= {
   2155             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
   2156             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
   2157             0x0E16, 0x0E49, 0x0E33
   2158     };
   2159     int32_t expectedWordResult[] = {
   2160             2, 3, 6, 10, 11, 15, 17, 20, 22
   2161     };
   2162     int32_t expectedLineResult[] = {
   2163             3, 6, 11, 15, 17, 20, 22
   2164     };
   2165     int32_t size = sizeof(c)/sizeof(UChar);
   2166     UnicodeString text=UnicodeString(c);
   2167 
   2168     b = BreakIterator::createWordInstance(locale, status);
   2169     if (U_FAILURE(status)) {
   2170         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
   2171         return;
   2172     }
   2173     b->setText(text);
   2174     p = index = 0;
   2175     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   2176         if (p != expectedWordResult[index++]) {
   2177             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
   2178         }
   2179     }
   2180     delete b;
   2181 
   2182     b = BreakIterator::createLineInstance(locale, status);
   2183     if (U_FAILURE(status)) {
   2184         printf("Unable to create thai line break iterator.\n");
   2185         return;
   2186     }
   2187     b->setText(text);
   2188     p = index = 0;
   2189     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   2190         if (p != expectedLineResult[index++]) {
   2191             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
   2192         }
   2193     }
   2194 
   2195     delete b;
   2196 }
   2197 
// Test data for TestTailoredBreaks().  For each case there is one text string
// (passed through CharsToUnicodeString() in TBTest(), hence the doubled
// backslashes) and two offset lists: the "T" list holds the boundaries expected
// from the tailored locale, the "R" list those expected from the root locale.

// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
// Words don't include colon or period (cldrbug #1969).
static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };

// UBreakIteratorType UBRK_WORD, Locale "ja"
// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
// Disabled: an alternative pair of offset lists for the same text, kept for
// reference.  The definitions actually compiled are the ones after the #endif.
#if 0
static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
#endif
// There's no separate Japanese word break iterator. Root is the same as Japanese.
// Our dictionary-based iterator has to be tweaked to better handle U+3005,
// U+3007, U+300B and some other cases.
// NOTE(review): the comment above lists U+300B while the cldrbug note further up
// lists \u303B -- one of the two is presumably a typo; confirm which.
// The tailored and root lists below are intentionally identical.
static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
                                        "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
                                        "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
                                          12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
                                          29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
                                          12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
                                          29,     32, 33, 35, 37, 38,     40, 41 };
   2236 
// One row of the tailored-break test table: which kind of iterator to build,
// the locale to tailor for, the text to break, and the boundary offsets
// expected from the tailored iterator and from the root iterator.
typedef struct {
    UBreakIteratorType  type;                   // selects the BreakIterator factory used
    const char *        locale;                 // locale ID of the tailored iterator
    const char *        escapedText;            // test text; NULL marks the end of the table
    const int32_t *     tailoredOffsets;        // boundaries expected for 'locale'
    int32_t             tailoredOffsetsCount;
    const int32_t *     rootOffsets;            // boundaries expected for the root locale
    int32_t             rootOffsetsCount;
} TailoredBreakItem;

// Expands to TWO initializers: the array itself and its element count.
// Only valid for true arrays (not pointers), since it relies on sizeof.
#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))

// The table walked by TestTailoredBreaks(); the final row, with a NULL
// escapedText, terminates the walk.
static const TailoredBreakItem tbItems[] = {
    { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
    { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
    { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
    { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
    { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
};
   2256 
   2257 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   2258     while (count-- > 0) {
   2259         int writeCount;
   2260         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   2261         buffer += writeCount;
   2262         buflen -= writeCount;
   2263     }
   2264 }
   2265 
   2266 enum { kMaxOffsetCount = 128 };
   2267 
   2268 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   2269     brkitr->setText( CharsToUnicodeString(escapedText) );
   2270     int32_t foundOffsets[kMaxOffsetCount];
   2271     int32_t offset, foundOffsetsCount = 0;
   2272     // do forwards iteration test
   2273     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   2274         foundOffsets[foundOffsetsCount++] = offset;
   2275     }
   2276     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   2277         // log error for forwards test
   2278         char formatExpect[512], formatFound[512];
   2279         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   2280         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   2281         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   2282                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   2283     } else {
   2284         // do backwards iteration test
   2285         --foundOffsetsCount; // back off one from the end offset
   2286         while ( foundOffsetsCount > 0 ) {
   2287             offset = brkitr->previous();
   2288             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   2289                 // log error for backwards test
   2290                 char formatExpect[512];
   2291                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   2292                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   2293                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   2294                 break;
   2295             }
   2296         }
   2297     }
   2298 }
   2299 
   2300 void RBBITest::TestTailoredBreaks() {
   2301     const TailoredBreakItem * tbItemPtr;
   2302     Locale rootLocale = Locale("root");
   2303     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   2304         Locale testLocale = Locale(tbItemPtr->locale);
   2305         BreakIterator * tailoredBrkiter;
   2306         BreakIterator * rootBrkiter;
   2307         UErrorCode status = U_ZERO_ERROR;
   2308         switch (tbItemPtr->type) {
   2309             case UBRK_CHARACTER:
   2310                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   2311                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   2312                 break;
   2313             case UBRK_WORD:
   2314                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   2315                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   2316                 break;
   2317             case UBRK_LINE:
   2318                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   2319                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   2320                 break;
   2321             case UBRK_SENTENCE:
   2322                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   2323                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   2324                 break;
   2325             default:
   2326                 status = U_UNSUPPORTED_ERROR;
   2327                 break;
   2328         }
   2329         if (U_FAILURE(status)) {
   2330             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   2331             continue;
   2332         }
   2333         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   2334         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   2335 
   2336         delete rootBrkiter;
   2337         delete tailoredBrkiter;
   2338     }
   2339 }
   2340 
   2341 
   2342 //-------------------------------------------------------------------------------
   2343 //
   2344 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   2345 //    return the datain one big UChar * buffer, which the caller must delete.
   2346 //
   2347 //    parameters:
   2348 //          fileName:   the name of the file, with no directory part.  The test data directory
   2349 //                      is assumed.
   2350 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   2351 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   2352 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   2353 //                      Pass NULL for the system default encoding.
   2354 //          status
   2355 //    returns:
   2356 //                      The file data, converted to UChar.
   2357 //                      The caller must delete this when done with
   2358 //                           delete [] theBuffer;
   2359 //
   2360 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   2361 //           Move this function to some common place.
   2362 //
   2363 //--------------------------------------------------------------------------------
   2364 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2365     UChar       *retPtr  = NULL;
   2366     char        *fileBuf = NULL;
   2367     UConverter* conv     = NULL;
   2368     FILE        *f       = NULL;
   2369 
   2370     ulen = 0;
   2371     if (U_FAILURE(status)) {
   2372         return retPtr;
   2373     }
   2374 
   2375     //
   2376     //  Open the file.
   2377     //
   2378     f = fopen(fileName, "rb");
   2379     if (f == 0) {
   2380         dataerrln("Error opening test data file %s\n", fileName);
   2381         status = U_FILE_ACCESS_ERROR;
   2382         return NULL;
   2383     }
   2384     //
   2385     //  Read it in
   2386     //
   2387     int   fileSize;
   2388     int   amt_read;
   2389 
   2390     fseek( f, 0, SEEK_END);
   2391     fileSize = ftell(f);
   2392     fileBuf = new char[fileSize];
   2393     fseek(f, 0, SEEK_SET);
   2394     amt_read = fread(fileBuf, 1, fileSize, f);
   2395     if (amt_read != fileSize || fileSize <= 0) {
   2396         errln("Error reading test data file.");
   2397         goto cleanUpAndReturn;
   2398     }
   2399 
   2400     //
   2401     // Look for a Unicode Signature (BOM) on the data just read
   2402     //
   2403     int32_t        signatureLength;
   2404     const char *   fileBufC;
   2405     const char*    bomEncoding;
   2406 
   2407     fileBufC = fileBuf;
   2408     bomEncoding = ucnv_detectUnicodeSignature(
   2409         fileBuf, fileSize, &signatureLength, &status);
   2410     if(bomEncoding!=NULL ){
   2411         fileBufC  += signatureLength;
   2412         fileSize  -= signatureLength;
   2413         encoding = bomEncoding;
   2414     }
   2415 
   2416     //
   2417     // Open a converter to take the rule file to UTF-16
   2418     //
   2419     conv = ucnv_open(encoding, &status);
   2420     if (U_FAILURE(status)) {
   2421         goto cleanUpAndReturn;
   2422     }
   2423 
   2424     //
   2425     // Convert the rules to UChar.
   2426     //  Preflight first to determine required buffer size.
   2427     //
   2428     ulen = ucnv_toUChars(conv,
   2429         NULL,           //  dest,
   2430         0,              //  destCapacity,
   2431         fileBufC,
   2432         fileSize,
   2433         &status);
   2434     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2435         // Buffer Overflow is expected from the preflight operation.
   2436         status = U_ZERO_ERROR;
   2437 
   2438         retPtr = new UChar[ulen+1];
   2439         ucnv_toUChars(conv,
   2440             retPtr,       //  dest,
   2441             ulen+1,
   2442             fileBufC,
   2443             fileSize,
   2444             &status);
   2445     }
   2446 
   2447 cleanUpAndReturn:
   2448     fclose(f);
   2449     delete []fileBuf;
   2450     ucnv_close(conv);
   2451     if (U_FAILURE(status)) {
   2452         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2453         delete retPtr;
   2454         retPtr = 0;
   2455         ulen   = 0;
   2456     };
   2457     return retPtr;
   2458 }
   2459 
   2460 
   2461 
   2462 //--------------------------------------------------------------------------------------------
   2463 //
   2464 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2465 //
   2466 //-------------------------------------------------------------------------------------------
   2467 void RBBITest::TestUnicodeFiles() {
   2468     RuleBasedBreakIterator  *bi;
   2469     UErrorCode               status = U_ZERO_ERROR;
   2470 
   2471     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
   2472     TEST_ASSERT_SUCCESS(status);
   2473     if (U_SUCCESS(status)) {
   2474         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2475     }
   2476     delete bi;
   2477 
   2478     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
   2479     TEST_ASSERT_SUCCESS(status);
   2480     if (U_SUCCESS(status)) {
   2481         runUnicodeTestData("WordBreakTest.txt", bi);
   2482     }
   2483     delete bi;
   2484 
   2485     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   2486     TEST_ASSERT_SUCCESS(status);
   2487     if (U_SUCCESS(status)) {
   2488         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2489     }
   2490     delete bi;
   2491 
   2492     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   2493     TEST_ASSERT_SUCCESS(status);
   2494     if (U_SUCCESS(status)) {
   2495         runUnicodeTestData("LineBreakTest.txt", bi);
   2496     }
   2497     delete bi;
   2498 }
   2499 
   2500 
   2501 //--------------------------------------------------------------------------------------------
   2502 //
   2503 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2504 //
   2505 //-------------------------------------------------------------------------------------------
   2506 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2507 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2508     UErrorCode  status = U_ZERO_ERROR;
   2509 
   2510     //
   2511     //  Open and read the test data file, put it into a UnicodeString.
   2512     //
   2513     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2514     char testFileName[1000];
   2515     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2516         dataerrln("Can't open test data.  Path too long.");
   2517         return;
   2518     }
   2519     strcpy(testFileName, testDataDirectory);
   2520     strcat(testFileName, fileName);
   2521 
   2522     logln("Opening data file %s\n", fileName);
   2523 
   2524     int    len;
   2525     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2526     if (status != U_FILE_ACCESS_ERROR) {
   2527         TEST_ASSERT_SUCCESS(status);
   2528         TEST_ASSERT(testFile != NULL);
   2529     }
   2530     if (U_FAILURE(status) || testFile == NULL) {
   2531         return; /* something went wrong, error already output */
   2532     }
   2533     UnicodeString testFileAsString(TRUE, testFile, len);
   2534 
   2535     //
   2536     //  Parse the test data file using a regular expression.
   2537     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2538     //     is identified by which group had a match.
   2539     //
   2540     //    Caputure Group #                  1          2            3            4           5
   2541     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2542     //
   2543     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2544     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2545     UnicodeString   testString;
   2546     UVector32       breakPositions(status);
   2547     int             lineNumber = 1;
   2548     TEST_ASSERT_SUCCESS(status);
   2549     if (U_FAILURE(status)) {
   2550         return;
   2551     }
   2552 
   2553     //
   2554     //  Scan through each test case, building up the string to be broken in testString,
   2555     //   and the positions that should be boundaries in the breakPositions vector.
   2556     //
   2557     while (tokenMatcher.find()) {
   2558         if (tokenMatcher.start(1, status) >= 0) {
   2559             // Scanned a divide sign, indicating a break position in the test data.
   2560             if (testString.length()>0) {
   2561                 breakPositions.addElement(testString.length(), status);
   2562             }
   2563         }
   2564         else if (tokenMatcher.start(2, status) >= 0) {
   2565             // Scanned an 'x', meaning no break at this position in the test data
   2566             //   Nothing to be done here.
   2567             }
   2568         else if (tokenMatcher.start(3, status) >= 0) {
   2569             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2570             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2571             int length = hexNumber.length();
   2572             if (length<=8) {
   2573                 char buf[10];
   2574                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2575                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2576                 if (c<=0x10ffff) {
   2577                     testString.append(c);
   2578                 } else {
   2579                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2580                        fileName, lineNumber);
   2581                 }
   2582             } else {
   2583                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2584                        fileName, lineNumber);
   2585              }
   2586         }
   2587         else if (tokenMatcher.start(4, status) >= 0) {
   2588             // Scanned to end of a line, possibly skipping over a comment in the process.
   2589             //   If the line from the file contained test data, run the test now.
   2590             //
   2591             if (testString.length() > 0) {
   2592                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2593             }
   2594 
   2595             // Clear out this test case.
   2596             //    The string and breakPositions vector will be refilled as the next
   2597             //       test case is parsed.
   2598             testString.remove();
   2599             breakPositions.removeAllElements();
   2600             lineNumber++;
   2601         } else {
   2602             // Scanner catchall.  Something unrecognized appeared on the line.
   2603             char token[16];
   2604             UnicodeString uToken = tokenMatcher.group(0, status);
   2605             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2606             token[sizeof(token)-1] = 0;
   2607             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2608 
   2609             // Clean up, in preparation for continuing with the next line.
   2610             testString.remove();
   2611             breakPositions.removeAllElements();
   2612             lineNumber++;
   2613         }
   2614         TEST_ASSERT_SUCCESS(status);
   2615         if (U_FAILURE(status)) {
   2616             break;
   2617         }
   2618     }
   2619 
   2620     delete [] testFile;
   2621  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2622 }
   2623 
   2624 //--------------------------------------------------------------------------------------------
   2625 //
   2626 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2627 //                            test data files.  Do only a simple, forward-only check -
   2628 //                            this test is mostly to check that ICU and the Unicode
   2629 //                            data agree with each other.
   2630 //
   2631 //--------------------------------------------------------------------------------------------
   2632 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2633                          const UnicodeString &testString,   // Text data to be broken
   2634                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2635                          RuleBasedBreakIterator *bi) {
   2636     int32_t pos;                 // Break Position in the test string
   2637     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2638     int32_t expectedPos;         // Expected break position (index into test string)
   2639 
   2640     bi->setText(testString);
   2641     pos = bi->first();
   2642     pos = bi->next();
   2643 
   2644     while (pos != BreakIterator::DONE) {
   2645         if (expectedI >= breakPositions->size()) {
   2646             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2647                 testFileName, lineNumber, pos);
   2648             break;
   2649         }
   2650         expectedPos = breakPositions->elementAti(expectedI);
   2651         if (pos < expectedPos) {
   2652             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2653                 testFileName, lineNumber, pos);
   2654             break;
   2655         }
   2656         if (pos > expectedPos) {
   2657             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2658                 testFileName, lineNumber, expectedPos);
   2659             break;
   2660         }
   2661         pos = bi->next();
   2662         expectedI++;
   2663     }
   2664 
   2665     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2666         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2667             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2668     }
   2669 }
   2670 
   2671 
   2672 
   2673 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2674 //---------------------------------------------------------------------------------------
   2675 //
   2676 //   classs RBBIMonkeyKind
   2677 //
   2678 //      Monkey Test for Break Iteration
   2679 //      Abstract interface class.   Concrete derived classes independently
   2680 //      implement the break rules for different iterator types.
   2681 //
   2682 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   2683 //      testing, but works purely in terms of the interface defined here.
   2684 //
   2685 //---------------------------------------------------------------------------------------
   2686 class RBBIMonkeyKind {
   2687 public:
   2688     // Return a UVector of UnicodeSets, representing the character classes used
   2689     //   for this type of iterator.
   2690     virtual  UVector  *charClasses() = 0;
   2691 
   2692     // Set the test text on which subsequent calls to next() will operate
   2693     virtual  void      setText(const UnicodeString &s) = 0;
   2694 
   2695     // Find the next break postion, starting from the prev break position, or from zero.
   2696     // Return -1 after reaching end of string.
   2697     virtual  int32_t   next(int32_t i) = 0;
   2698 
   2699     virtual ~RBBIMonkeyKind();
   2700     UErrorCode       deferredStatus;
   2701 
   2702 
   2703 protected:
   2704     RBBIMonkeyKind();
   2705 
   2706 private:
   2707 };
   2708 
   2709 RBBIMonkeyKind::RBBIMonkeyKind() {
   2710     deferredStatus = U_ZERO_ERROR;
   2711 }
   2712 
   2713 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2714 }
   2715 
   2716 
   2717 //----------------------------------------------------------------------------------------
   2718 //
   2719 //   Random Numbers.  Similar to standard lib rand() and srand()
   2720 //                    Not using library to
   2721 //                      1.  Get same results on all platforms.
   2722 //                      2.  Get access to current seed, to more easily reproduce failures.
   2723 //
   2724 //---------------------------------------------------------------------------------------
   2725 static uint32_t m_seed = 1;
   2726 
   2727 static uint32_t m_rand()
   2728 {
   2729     m_seed = m_seed * 1103515245 + 12345;
   2730     return (uint32_t)(m_seed/65536) % 32768;
   2731 }
   2732 
   2733 
//------------------------------------------------------------------------------------------
//
//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
//                             of RBBIMonkeyKind.
//
//                             The constructor builds one UnicodeSet per
//                             Grapheme_Cluster_Break property value it needs, plus
//                             a few combined sets; see the members below.
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // Filled by the constructor with the CRLF, Control, Extend, Prepend,
    // Spacing, Hangul and Any sets, in that order.
    UVector   *fSets;

    UnicodeSet  *fCRLFSet;      // [\r\n]
    UnicodeSet  *fControlSet;   // Grapheme_Cluster_Break = Control
    UnicodeSet  *fExtendSet;    // Grapheme_Cluster_Break = Extend
    UnicodeSet  *fPrependSet;   // Grapheme_Cluster_Break = Prepend
    UnicodeSet  *fSpacingSet;   // Grapheme_Cluster_Break = SpacingMark
    UnicodeSet  *fLSet;         // Grapheme_Cluster_Break = L
    UnicodeSet  *fVSet;         // Grapheme_Cluster_Break = V
    UnicodeSet  *fTSet;         // Grapheme_Cluster_Break = T
    UnicodeSet  *fLVSet;        // Grapheme_Cluster_Break = LV
    UnicodeSet  *fLVTSet;       // Grapheme_Cluster_Break = LVT
    UnicodeSet  *fHangulSet;    // union of the L, V, T, LV and LVT sets
    UnicodeSet  *fAnySet;       // every code point, U+0000..U+10FFFF

    // The text most recently given to setText(); not owned (only the address is kept).
    const UnicodeString *fText;
};
   2765 
   2766 
   2767 RBBICharMonkey::RBBICharMonkey() {
   2768     UErrorCode  status = U_ZERO_ERROR;
   2769 
   2770     fText = NULL;
   2771 
   2772     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2773     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2774     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2775     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2776     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2777     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2778     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2779     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2780     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2781     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2782     fHangulSet  = new UnicodeSet();
   2783     fHangulSet->addAll(*fLSet);
   2784     fHangulSet->addAll(*fVSet);
   2785     fHangulSet->addAll(*fTSet);
   2786     fHangulSet->addAll(*fLVSet);
   2787     fHangulSet->addAll(*fLVTSet);
   2788     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2789 
   2790     fSets       = new UVector(status);
   2791     fSets->addElement(fCRLFSet,    status);
   2792     fSets->addElement(fControlSet, status);
   2793     fSets->addElement(fExtendSet,  status);
   2794     fSets->addElement(fPrependSet, status);
   2795     fSets->addElement(fSpacingSet, status);
   2796     fSets->addElement(fHangulSet,  status);
   2797     fSets->addElement(fAnySet,     status);
   2798     if (U_FAILURE(status)) {
   2799         deferredStatus = status;
   2800     }
   2801 }
   2802 
   2803 
   2804 void RBBICharMonkey::setText(const UnicodeString &s) {
   2805     fText = &s;
   2806 }
   2807 
   2808 
   2809 
// Returns the position of the grapheme-cluster boundary that follows prevPos
// in fText, found by applying the rules below one code point at a time.
// Returns -1 when construction failed (deferredStatus) or when prevPos is
// already at or beyond the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.
                              //   (p0/c0 are shifted along with the others but no
                              //   rule in this function reads them.)

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    // Within the loop, "continue" means "no boundary before p2, keep scanning"
    // and "break" means "boundary found before p2".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x  Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}
   2918 
   2919 
   2920 
   2921 UVector  *RBBICharMonkey::charClasses() {
   2922     return fSets;
   2923 }
   2924 
   2925 
   2926 RBBICharMonkey::~RBBICharMonkey() {
   2927     delete fSets;
   2928     delete fCRLFSet;
   2929     delete fControlSet;
   2930     delete fExtendSet;
   2931     delete fPrependSet;
   2932     delete fSpacingSet;
   2933     delete fLSet;
   2934     delete fVSet;
   2935     delete fTSet;
   2936     delete fLVSet;
   2937     delete fLVTSet;
   2938     delete fHangulSet;
   2939     delete fAnySet;
   2940 }
   2941 
   2942 //------------------------------------------------------------------------------------------
   2943 //
   2944 //   class RBBIWordMonkey      Word Break specific implementation
   2945 //                             of RBBIMonkeyKind.
   2946 //
   2947 //------------------------------------------------------------------------------------------
// Word-break flavour of RBBIMonkeyKind.
//
// next() locates word boundaries by testing code points against the
// UnicodeSets declared below; charClasses() exposes the list held in fSets.
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    // Returns fSets, the vector of character-class sets built by the constructor.
    virtual  UVector *charClasses();
    // Remembers the address of s; the string itself is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i);
private:
    UVector      *fSets;      // Character classes handed out by charClasses().

    // Sets consulted by the word-break rules in next(), allocated by the constructor.
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fALetterSet;
    // TODO(jungshik): Do we still need this change?
    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fDictionaryCjkSet;   // Hangul syllables, Han and Hiragana; excluded from the tested classes.

    RegexMatcher  *fMatcher;

    const UnicodeString  *fText;      // Text under test; set by setText(), not owned.
};
   2979 
   2980 
   2981 RBBIWordMonkey::RBBIWordMonkey()
   2982 {
   2983     UErrorCode  status = U_ZERO_ERROR;
   2984 
   2985     fSets            = new UVector(status);
   2986 
   2987     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2988     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2989     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2990     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
   2991     // Exclude Hangul syllables from ALetterSet during testing.
   2992     // Leave CJK dictionary characters out from the monkey tests!
   2993 #if 0
   2994     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   2995                                       "[\\p{Line_Break = Complex_Context}"
   2996                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   2997                                       "-\\p{Grapheme_Cluster_Break = Control}"
   2998                                       "]]",
   2999                                       status);
   3000 #endif
   3001     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   3002     fALetterSet->removeAll(*fDictionaryCjkSet);
   3003     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   3004     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   3005     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   3006     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   3007     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"),      status);
   3008     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   3009     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   3010     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   3011 
   3012     fOtherSet        = new UnicodeSet();
   3013     if(U_FAILURE(status)) {
   3014       deferredStatus = status;
   3015       return;
   3016     }
   3017 
   3018     fOtherSet->complement();
   3019     fOtherSet->removeAll(*fCRSet);
   3020     fOtherSet->removeAll(*fLFSet);
   3021     fOtherSet->removeAll(*fNewlineSet);
   3022     fOtherSet->removeAll(*fKatakanaSet);
   3023     fOtherSet->removeAll(*fALetterSet);
   3024     fOtherSet->removeAll(*fMidLetterSet);
   3025     fOtherSet->removeAll(*fMidNumSet);
   3026     fOtherSet->removeAll(*fNumericSet);
   3027     fOtherSet->removeAll(*fExtendNumLetSet);
   3028     fOtherSet->removeAll(*fFormatSet);
   3029     fOtherSet->removeAll(*fExtendSet);
   3030     // Inhibit dictionary characters from being tested at all.
   3031     fOtherSet->removeAll(*fDictionaryCjkSet);
   3032     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   3033 
   3034     fSets->addElement(fCRSet,        status);
   3035     fSets->addElement(fLFSet,        status);
   3036     fSets->addElement(fNewlineSet,   status);
   3037     fSets->addElement(fALetterSet,   status);
   3038     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
   3039     fSets->addElement(fMidLetterSet, status);
   3040     fSets->addElement(fMidNumLetSet, status);
   3041     fSets->addElement(fMidNumSet,    status);
   3042     fSets->addElement(fNumericSet,   status);
   3043     fSets->addElement(fFormatSet,    status);
   3044     fSets->addElement(fExtendSet,    status);
   3045     fSets->addElement(fOtherSet,     status);
   3046     fSets->addElement(fExtendNumLetSet, status);
   3047 
   3048     if (U_FAILURE(status)) {
   3049         deferredStatus = status;
   3050     }
   3051 }
   3052 
   3053 void RBBIWordMonkey::setText(const UnicodeString &s) {
   3054     fText       = &s;
   3055 }
   3056 
   3057 
// Returns the position of the word boundary that follows prevPos in fText,
// found by applying the rules below to successive "significant" code points
// (Extend and Format characters are attached to the code point they follow).
// Returns -1 when construction failed (deferredStatus) or when prevPos is
// already at or beyond the end of the text.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    // Within the loop, "continue" means "no boundary before p2, keep scanning"
    // and "break" (outside the inner do-while) means "boundary found before p2".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        //    When c2 is CR, LF or Newline the inner loop stops after a single
        //    step, so p3 is then the code point immediately after p2.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
               break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     No explicit adjacency test is made here: the advance loop above
        //     never skips Extend/Format after a CR, so when c1 is CR, p2 is
        //     the code point immediately following it.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (5).   ALetter x ALetter
        if (fALetterSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
        //
        if ( fALetterSet->contains(c1)   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
             fALetterSet->contains(c3)) {
            continue;
        }


        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
        if (fALetterSet->contains(c0) &&
            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
            fALetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    ALetter x Numeric
        if (fALetterSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x ALetter
        if (fNumericSet->contains(c1) &&
            fALetterSet->contains(c2))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a   (ALetter | Numeric | Katakana | ExtendNumLet)  x  ExtendNumLet
        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
             }

        // Rule 13b   ExtendNumLet  x  (ALetter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
                fKatakanaSet->contains(c2)))  {
                continue;
             }

        // Rule 14.  Break found here.
        break;
    }

    breakPos = p2;
    return breakPos;
}
   3205 
   3206 
   3207 UVector  *RBBIWordMonkey::charClasses() {
   3208     return fSets;
   3209 }
   3210 
   3211 
   3212 RBBIWordMonkey::~RBBIWordMonkey() {
   3213     delete fSets;
   3214     delete fCRSet;
   3215     delete fLFSet;
   3216     delete fNewlineSet;
   3217     delete fKatakanaSet;
   3218     delete fALetterSet;
   3219     delete fMidNumLetSet;
   3220     delete fMidLetterSet;
   3221     delete fMidNumSet;
   3222     delete fNumericSet;
   3223     delete fFormatSet;
   3224     delete fExtendSet;
   3225     delete fExtendNumLetSet;
   3226     delete fOtherSet;
   3227 }
   3228 
   3229 
   3230 
   3231 
   3232 //------------------------------------------------------------------------------------------
   3233 //
   3234 //   class RBBISentMonkey      Sentence Break specific implementation
   3235 //                             of RBBIMonkeyKind.
   3236 //
   3237 //------------------------------------------------------------------------------------------
// Sentence-break flavour of RBBIMonkeyKind.
//
// next() locates sentence boundaries by testing code points against the
// UnicodeSets declared below; charClasses() exposes the list held in fSets.
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // Returns fSets, the vector of character-class sets built by the constructor.
    virtual  UVector *charClasses();
    // Remembers the address of s; the string itself is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i);
private:
    // Index of the nearest preceding code point that is neither Format nor
    // Extend (stopping at index 0); -1 when posFrom is <= 0.
    int               moveBack(int posFrom);
    // Index of the next code point that is neither Format nor Extend; the
    // text length when there is none.
    int               moveForward(int posFrom);
    // Code point at pos, or -1 when pos lies outside the text.
    UChar32           cAt(int pos);

    UVector      *fSets;      // Character classes handed out by charClasses().

    // Sets consulted by the sentence-break rules, allocated by the constructor.
    UnicodeSet  *fSepSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;   // Text under test; set by setText(), not owned.

};
   3269 
   3270 RBBISentMonkey::RBBISentMonkey()
   3271 {
   3272     UErrorCode  status = U_ZERO_ERROR;
   3273 
   3274     fSets            = new UVector(status);
   3275 
   3276     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   3277     //                       set and made into character classes of their own.  For the monkey impl,
   3278     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   3279     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   3280     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   3281     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   3282     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   3283     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   3284     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   3285     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   3286     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   3287     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   3288     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   3289     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   3290     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   3291     fOtherSet        = new UnicodeSet();
   3292 
   3293     if(U_FAILURE(status)) {
   3294       deferredStatus = status;
   3295       return;
   3296     }
   3297 
   3298     fOtherSet->complement();
   3299     fOtherSet->removeAll(*fSepSet);
   3300     fOtherSet->removeAll(*fFormatSet);
   3301     fOtherSet->removeAll(*fSpSet);
   3302     fOtherSet->removeAll(*fLowerSet);
   3303     fOtherSet->removeAll(*fUpperSet);
   3304     fOtherSet->removeAll(*fOLetterSet);
   3305     fOtherSet->removeAll(*fNumericSet);
   3306     fOtherSet->removeAll(*fATermSet);
   3307     fOtherSet->removeAll(*fSContinueSet);
   3308     fOtherSet->removeAll(*fSTermSet);
   3309     fOtherSet->removeAll(*fCloseSet);
   3310     fOtherSet->removeAll(*fExtendSet);
   3311 
   3312     fSets->addElement(fSepSet,       status);
   3313     fSets->addElement(fFormatSet,    status);
   3314     fSets->addElement(fSpSet,        status);
   3315     fSets->addElement(fLowerSet,     status);
   3316     fSets->addElement(fUpperSet,     status);
   3317     fSets->addElement(fOLetterSet,   status);
   3318     fSets->addElement(fNumericSet,   status);
   3319     fSets->addElement(fATermSet,     status);
   3320     fSets->addElement(fSContinueSet, status);
   3321     fSets->addElement(fSTermSet,     status);
   3322     fSets->addElement(fCloseSet,     status);
   3323     fSets->addElement(fOtherSet,     status);
   3324     fSets->addElement(fExtendSet,    status);
   3325 
   3326     if (U_FAILURE(status)) {
   3327         deferredStatus = status;
   3328     }
   3329 }
   3330 
   3331 
   3332 
   3333 void RBBISentMonkey::setText(const UnicodeString &s) {
   3334     fText       = &s;
   3335 }
   3336 
   3337 UVector  *RBBISentMonkey::charClasses() {
   3338     return fSets;
   3339 }
   3340 
   3341 
   3342 //  moveBack()   Find the "significant" code point preceding the index i.
   3343 //               Skips over ($Extend | $Format)* .
   3344 //
   3345 int RBBISentMonkey::moveBack(int i) {
   3346     if (i <= 0) {
   3347         return -1;
   3348     }
   3349     UChar32   c;
   3350     int32_t   j = i;
   3351     do {
   3352         j = fText->moveIndex32(j, -1);
   3353         c = fText->char32At(j);
   3354     }
   3355     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   3356     return j;
   3357 
   3358  }
   3359 
   3360 
   3361 int RBBISentMonkey::moveForward(int i) {
   3362     if (i>=fText->length()) {
   3363         return fText->length();
   3364     }
   3365     UChar32   c;
   3366     int32_t   j = i;
   3367     do {
   3368         j = fText->moveIndex32(j, 1);
   3369         c = cAt(j);
   3370     }
   3371     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   3372     return j;
   3373 }
   3374 
   3375 UChar32 RBBISentMonkey::cAt(int pos) {
   3376     if (pos<0 || pos>=fText->length()) {
   3377         return -1;
   3378     } else {
   3379         return fText->char32At(pos);
   3380     }
   3381 }
   3382 
// Returns the position of the sentence boundary that follows prevPos in
// fText, found by applying the rules below to successive "significant" code
// points (moveForward()/moveBack() skip Extend and Format characters).
// Returns -1 when construction failed (deferredStatus) or when prevPos is
// already at or beyond the end of the text.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    // Within the loop, "continue" means "no boundary before p2, keep scanning"
    // and "break" means "boundary found at p2".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           p8 first walks back from p1 over Sp* and then Close* looking for the ATerm;
        //           if one is found it is reused to scan forward from p2 to the first code
        //           point that ends the negated run.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    breakPos = p2;
    return breakPos;
}
   3535 
   3536 RBBISentMonkey::~RBBISentMonkey() {
   3537     delete fSets;
   3538     delete fSepSet;
   3539     delete fFormatSet;
   3540     delete fSpSet;
   3541     delete fLowerSet;
   3542     delete fUpperSet;
   3543     delete fOLetterSet;
   3544     delete fNumericSet;
   3545     delete fATermSet;
   3546     delete fSContinueSet;
   3547     delete fSTermSet;
   3548     delete fCloseSet;
   3549     delete fOtherSet;
   3550     delete fExtendSet;
   3551 }
   3552 
   3553 
   3554 
   3555 //-------------------------------------------------------------------------------------------
   3556 //
   3557 //  RBBILineMonkey
   3558 //
   3559 //-------------------------------------------------------------------------------------------
   3560 
// Line-break flavour of RBBIMonkeyKind.
//
// Declares one UnicodeSet pointer per two-letter class name.  The constructor
// builds fBK, fCR, fLF, fCM and fNL from \p{Line_Break=...} patterns of the
// same name; the remaining sets presumably follow the same scheme -- confirm
// against the rest of the constructor.
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
    // NOTE(review): helper for next(); the name suggests it applies line-break
    //               rule 9 around pos -- see its definition for the exact contract.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;

    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;
    UnicodeSet  *fID;
    UnicodeSet  *fSA;
    UnicodeSet  *fXX;

    BreakIterator  *fCharBI;

    const UnicodeString  *fText;
    int32_t              *fOrigPositions;

    RegexMatcher         *fNumberMatcher;
    RegexMatcher         *fLB11Matcher;
};
   3617 
   3618 
   3619 RBBILineMonkey::RBBILineMonkey()
   3620 {
   3621     UErrorCode  status = U_ZERO_ERROR;
   3622 
   3623     fSets  = new UVector(status);
   3624 
   3625     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3626     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3627     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3628     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3629     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3630     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3631     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3632     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3633     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3634     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3635     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3636     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3637     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3638     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3639     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3640     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3641     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3642     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3643     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3644     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3645     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3646     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3647     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3648     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3649     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3650     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3651     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3652     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3653     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3654     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3655     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3656     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3657     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3658     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3659     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3660     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3661 
   3662     if (U_FAILURE(status)) {
   3663         deferredStatus = status;
   3664         fCharBI = NULL;
   3665         fNumberMatcher = NULL;
   3666         return;
   3667     }
   3668 
   3669     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3670     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3671     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3672     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3673 
   3674     fSets->addElement(fBK, status);
   3675     fSets->addElement(fCR, status);
   3676     fSets->addElement(fLF, status);
   3677     fSets->addElement(fCM, status);
   3678     fSets->addElement(fNL, status);
   3679     fSets->addElement(fWJ, status);
   3680     fSets->addElement(fZW, status);
   3681     fSets->addElement(fGL, status);
   3682     fSets->addElement(fCB, status);
   3683     fSets->addElement(fSP, status);
   3684     fSets->addElement(fB2, status);
   3685     fSets->addElement(fBA, status);
   3686     fSets->addElement(fBB, status);
   3687     fSets->addElement(fHY, status);
   3688     fSets->addElement(fH2, status);
   3689     fSets->addElement(fH3, status);
   3690     fSets->addElement(fCL, status);
   3691     fSets->addElement(fEX, status);
   3692     fSets->addElement(fIN, status);
   3693     fSets->addElement(fJL, status);
   3694     fSets->addElement(fJT, status);
   3695     fSets->addElement(fJV, status);
   3696     fSets->addElement(fNS, status);
   3697     fSets->addElement(fOP, status);
   3698     fSets->addElement(fQU, status);
   3699     fSets->addElement(fIS, status);
   3700     fSets->addElement(fNU, status);
   3701     fSets->addElement(fPO, status);
   3702     fSets->addElement(fPR, status);
   3703     fSets->addElement(fSY, status);
   3704     fSets->addElement(fAI, status);
   3705     fSets->addElement(fAL, status);
   3706     fSets->addElement(fID, status);
   3707     fSets->addElement(fWJ, status);
   3708     fSets->addElement(fSA, status);
   3709     fSets->addElement(fSG, status);
   3710 
   3711     const char *rules =
   3712             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3713             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3714             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3715             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3716             "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
   3717             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3718 
   3719     fNumberMatcher = new RegexMatcher(
   3720         UnicodeString(rules, -1, US_INV), 0, status);
   3721 
   3722     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3723 
   3724     if (U_FAILURE(status)) {
   3725         deferredStatus = status;
   3726     }
   3727 }
   3728 
   3729 
   3730 void RBBILineMonkey::setText(const UnicodeString &s) {
   3731     fText       = &s;
   3732     fCharBI->setText(s);
   3733     fNumberMatcher->reset(s);
   3734 }
   3735 
   3736 //
   3737 //  rule9Adjust
   3738 //     Line Break TR rules 9 and 10 implementation.
   3739 //     This deals with combining marks and other sequences that
   3740 //     that must be treated as if they were something other than what they actually are.
   3741 //
   3742 //     This is factored out into a separate function because it must be applied twice for
   3743 //     each potential break, once to the chars before the position being checked, then
   3744 //     again to the text following the possible break.
   3745 //
   3746 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3747     if (pos == -1) {
   3748         // Invalid initial position.  Happens during the warmup iteration of the
   3749         //   main loop in next().
   3750         return;
   3751     }
   3752 
   3753     int32_t  nPos = *nextPos;
   3754 
   3755     // LB 9  Keep combining sequences together.
   3756     //  advance over any CM class chars.  Note that Line Break CM is different
   3757     //  from the normal Grapheme Extend property.
   3758     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3759           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3760         for (;;) {
   3761             *nextChar = fText->char32At(nPos);
   3762             if (!fCM->contains(*nextChar)) {
   3763                 break;
   3764             }
   3765             nPos = fText->moveIndex32(nPos, 1);
   3766         }
   3767     }
   3768 
   3769 
   3770     // LB 9 Treat X CM* as if it were x.
   3771     //       No explicit action required.
   3772 
   3773     // LB 10  Treat any remaining combining mark as AL
   3774     if (fCM->contains(*posChar)) {
   3775         *posChar = 0x41;   // thisChar = 'A';
   3776     }
   3777 
   3778     // Push the updated nextPos and nextChar back to our caller.
   3779     // This only makes a difference if posChar got bigger by consuming a
   3780     // combining sequence.
   3781     *nextPos  = nPos;
   3782     *nextChar = fText->char32At(nPos);
   3783 }
   3784 
   3785 
   3786 
   3787 int32_t RBBILineMonkey::next(int32_t startPos) {
   3788     UErrorCode status = U_ZERO_ERROR;
   3789     int32_t    pos;       //  Index of the char following a potential break position
   3790     UChar32    thisChar;  //  Character at above position "pos"
   3791 
   3792     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3793     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3794                           //   and thisChar may not be adjacent because combining
   3795                           //   characters between them will be ignored.
   3796 
   3797     int32_t    nextPos;   //  Index of the next character following pos.
   3798                           //     Usually skips over combining marks.
   3799     int32_t    nextCPPos; //  Index of the code point following "pos."
   3800                           //     May point to a combining mark.
   3801     int32_t    tPos;      //  temp value.
   3802     UChar32    c;
   3803 
   3804     if (U_FAILURE(deferredStatus)) {
   3805         return -1;
   3806     }
   3807 
   3808     if (startPos >= fText->length()) {
   3809         return -1;
   3810     }
   3811 
   3812 
   3813     // Initial values for loop.  Loop will run the first time without finding breaks,
   3814     //                           while the invalid values shift out and the "this" and
   3815     //                           "prev" positions are filled in with good values.
   3816     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3817     thisChar = prevChar  = 0;
   3818     nextPos  = nextCPPos = startPos;
   3819 
   3820 
   3821     // Loop runs once per position in the test text, until a break position
   3822     //  is found.
   3823     for (;;) {
   3824         prevPos   = pos;
   3825         prevChar  = thisChar;
   3826 
   3827         pos       = nextPos;
   3828         thisChar  = fText->char32At(pos);
   3829 
   3830         nextCPPos = fText->moveIndex32(pos, 1);
   3831         nextPos   = nextCPPos;
   3832 
   3833         // Rule LB2 - Break at end of text.
   3834         if (pos >= fText->length()) {
   3835             break;
   3836         }
   3837 
   3838         // Rule LB 9 - adjust for combining sequences.
   3839         //             We do this one out-of-order because the adjustment does not change anything
   3840         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3841         //             be applied.
   3842         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3843         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3844         c = fText->char32At(nextPos);
   3845         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3846 
   3847         // If the loop is still warming up - if we haven't shifted the initial
   3848         //   -1 positions out of prevPos yet - loop back to advance the
   3849         //    position in the input without any further looking for breaks.
   3850         if (prevPos == -1) {
   3851             continue;
   3852         }
   3853 
   3854         // LB 4  Always break after hard line breaks,
   3855         if (fBK->contains(prevChar)) {
   3856             break;
   3857         }
   3858 
   3859         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3860         if (prevChar == 0x0d && thisChar == 0x0a) {
   3861             continue;
   3862         }
   3863         if (prevChar == 0x0d ||
   3864             prevChar == 0x0a ||
   3865             prevChar == 0x85)  {
   3866             break;
   3867         }
   3868 
   3869         // LB 6  Don't break before hard line breaks
   3870         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3871             fBK->contains(thisChar)) {
   3872                 continue;
   3873         }
   3874 
   3875 
   3876         // LB 7  Don't break before spaces or zero-width space.
   3877         if (fSP->contains(thisChar)) {
   3878             continue;
   3879         }
   3880 
   3881         if (fZW->contains(thisChar)) {
   3882             continue;
   3883         }
   3884 
   3885         // LB 8  Break after zero width space
   3886         if (fZW->contains(prevChar)) {
   3887             break;
   3888         }
   3889 
   3890         // LB 9, 10  Already done, at top of loop.
   3891         //
   3892 
   3893 
   3894         // LB 11  Do not break before or after WORD JOINER and related characters.
   3895         //    x  WJ
   3896         //    WJ  x
   3897         //
   3898         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3899             continue;
   3900         }
   3901 
   3902         // LB 12
   3903         //    GL  x
   3904         if (fGL->contains(prevChar)) {
   3905             continue;
   3906         }
   3907 
   3908         // LB 12a
   3909         //    [^SP BA HY] x GL
   3910         if (!(fSP->contains(prevChar) ||
   3911               fBA->contains(prevChar) ||
   3912               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3913             continue;
   3914         }
   3915 
   3916 
   3917 
   3918         // LB 13  Don't break before closings.
   3919         //        NU x CL  and NU x IS are not matched here so that they will
   3920         //        fall into LB 17 and the more general number regular expression.
   3921         //
   3922         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
   3923                                         fEX->contains(thisChar) ||
   3924             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
   3925             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
   3926             continue;
   3927         }
   3928 
   3929         // LB 14 Don't break after OP SP*
   3930         //       Scan backwards, checking for this sequence.
   3931         //       The OP char could include combining marks, so we actually check for
   3932         //           OP CM* SP*
   3933         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3934         //       sequence into a ID char, so before scanning back through spaces,
   3935         //       verify that prevChar is indeed a space.  The prevChar variable
   3936         //       may differ from fText[prevPos]
   3937         tPos = prevPos;
   3938         if (fSP->contains(prevChar)) {
   3939             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3940                 tPos=fText->moveIndex32(tPos, -1);
   3941             }
   3942         }
   3943         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3944             tPos=fText->moveIndex32(tPos, -1);
   3945         }
   3946         if (fOP->contains(fText->char32At(tPos))) {
   3947             continue;
   3948         }
   3949 
   3950 
   3951         // LB 15    QU SP* x OP
   3952         if (fOP->contains(thisChar)) {
   3953             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3954             int tPos = prevPos;
   3955             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3956                 tPos = fText->moveIndex32(tPos, -1);
   3957             }
   3958             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3959                 tPos = fText->moveIndex32(tPos, -1);
   3960             }
   3961             if (fQU->contains(fText->char32At(tPos))) {
   3962                 continue;
   3963             }
   3964         }
   3965 
   3966 
   3967 
   3968         // LB 16   CL SP* x NS
   3969         //    Scan backwards for SP* CM* CL
   3970         if (fNS->contains(thisChar)) {
   3971             int tPos = prevPos;
   3972             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3973                 tPos = fText->moveIndex32(tPos, -1);
   3974             }
   3975             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3976                 tPos = fText->moveIndex32(tPos, -1);
   3977             }
   3978             if (fCL->contains(fText->char32At(tPos))) {
   3979                 continue;
   3980             }
   3981         }
   3982 
   3983 
   3984         // LB 17        B2 SP* x B2
   3985         if (fB2->contains(thisChar)) {
   3986             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3987             tPos = prevPos;
   3988             if (fSP->contains(prevChar)) {
   3989                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3990                     tPos=fText->moveIndex32(tPos, -1);
   3991                 }
   3992             }
   3993             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3994                 tPos=fText->moveIndex32(tPos, -1);
   3995             }
   3996             if (fB2->contains(fText->char32At(tPos))) {
   3997                 continue;
   3998             }
   3999         }
   4000 
   4001 
   4002         // LB 18    break after space
   4003         if (fSP->contains(prevChar)) {
   4004             break;
   4005         }
   4006 
   4007         // LB 19
   4008         //    x   QU
   4009         //    QU  x
   4010         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   4011             continue;
   4012         }
   4013 
   4014         // LB 20  Break around a CB
   4015         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   4016             break;
   4017         }
   4018 
   4019         // LB 21
   4020         if (fBA->contains(thisChar) ||
   4021             fHY->contains(thisChar) ||
   4022             fNS->contains(thisChar) ||
   4023             fBB->contains(prevChar) )   {
   4024             continue;
   4025         }
   4026 
   4027         // LB 22
   4028         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
   4029             fID->contains(prevChar) && fIN->contains(thisChar) ||
   4030             fIN->contains(prevChar) && fIN->contains(thisChar) ||
   4031             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
   4032             continue;
   4033         }
   4034 
   4035 
   4036         // LB 23    ID x PO
   4037         //          AL x NU
   4038         //          NU x AL
   4039         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
   4040             fAL->contains(prevChar) && fNU->contains(thisChar) ||
   4041             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
   4042             continue;
   4043         }
   4044 
   4045         // LB 24  Do not break between prefix and letters or ideographs.
   4046         //        PR x ID
   4047         //        PR x AL
   4048         //        PO x AL
   4049         if (fPR->contains(prevChar) && fID->contains(thisChar) ||
   4050             fPR->contains(prevChar) && fAL->contains(thisChar) ||
   4051             fPO->contains(prevChar) && fAL->contains(thisChar) )   {
   4052             continue;
   4053         }
   4054 
   4055 
   4056 
   4057         // LB 25    Numbers
   4058         if (fNumberMatcher->lookingAt(prevPos, status)) {
   4059             if (U_FAILURE(status)) {
   4060                 break;
   4061             }
   4062             // Matched a number.  But could have been just a single digit, which would
   4063             //    not represent a "no break here" between prevChar and thisChar
   4064             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   4065             if (numEndIdx > pos) {
   4066                 // Number match includes at least our two chars being checked
   4067                 if (numEndIdx > nextPos) {
   4068                     // Number match includes additional chars.  Update pos and nextPos
   4069                     //   so that next loop iteration will continue at the end of the number,
   4070                     //   checking for breaks between last char in number & whatever follows.
   4071                     pos = nextPos = numEndIdx;
   4072                     do {
   4073                         pos = fText->moveIndex32(pos, -1);
   4074                         thisChar = fText->char32At(pos);
   4075                     } while (fCM->contains(thisChar));
   4076                 }
   4077                 continue;
   4078             }
   4079         }
   4080 
   4081 
   4082         // LB 26 Do not break a Korean syllable.
   4083         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   4084                                         fJV->contains(thisChar) ||
   4085                                         fH2->contains(thisChar) ||
   4086                                         fH3->contains(thisChar))) {
   4087                                             continue;
   4088                                         }
   4089 
   4090         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   4091             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   4092                 continue;
   4093         }
   4094 
   4095         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   4096             fJT->contains(thisChar)) {
   4097                 continue;
   4098         }
   4099 
   4100         // LB 27 Treat a Korean Syllable Block the same as ID.
   4101         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   4102             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   4103             fIN->contains(thisChar)) {
   4104                 continue;
   4105             }
   4106         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   4107             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   4108             fPO->contains(thisChar)) {
   4109                 continue;
   4110             }
   4111         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   4112             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   4113                 continue;
   4114             }
   4115 
   4116 
   4117 
   4118         // LB 28  Do not break between alphabetics ("at").
   4119         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   4120             continue;
   4121         }
   4122 
   4123         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   4124         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   4125             continue;
   4126         }
   4127 
   4128         // LB 31    Break everywhere else
   4129         break;
   4130 
   4131     }
   4132 
   4133     return pos;
   4134 }
   4135 
   4136 
   4137 UVector  *RBBILineMonkey::charClasses() {
   4138     return fSets;
   4139 }
   4140 
   4141 
   4142 RBBILineMonkey::~RBBILineMonkey() {
   4143     delete fSets;
   4144 
   4145     delete fBK;
   4146     delete fCR;
   4147     delete fLF;
   4148     delete fCM;
   4149     delete fNL;
   4150     delete fWJ;
   4151     delete fZW;
   4152     delete fGL;
   4153     delete fCB;
   4154     delete fSP;
   4155     delete fB2;
   4156     delete fBA;
   4157     delete fBB;
   4158     delete fHY;
   4159     delete fH2;
   4160     delete fH3;
   4161     delete fCL;
   4162     delete fEX;
   4163     delete fIN;
   4164     delete fJL;
   4165     delete fJV;
   4166     delete fJT;
   4167     delete fNS;
   4168     delete fOP;
   4169     delete fQU;
   4170     delete fIS;
   4171     delete fNU;
   4172     delete fPO;
   4173     delete fPR;
   4174     delete fSY;
   4175     delete fAI;
   4176     delete fAL;
   4177     delete fID;
   4178     delete fSA;
   4179     delete fSG;
   4180     delete fXX;
   4181 
   4182     delete fCharBI;
   4183     delete fNumberMatcher;
   4184 }
   4185 
   4186 
   4187 //-------------------------------------------------------------------------------------------
   4188 //
   4189 //   TestMonkey
   4190 //
   4191 //     params
   4192 //       seed=nnnnn        Random number starting seed.
   4193 //                         Setting the seed allows errors to be reproduced.
   4194 //       loop=nnn          Looping count.  Controls running time.
   4195 //                         -1:  run forever.
   4196 //                          0 or greater:  run length.
   4197 //
   4198 //       type = char | word | line | sent | title
   4199 //
   4200 //-------------------------------------------------------------------------------------------
   4201 
   4202 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   4203     int32_t val = defaultVal;
   4204     name.append(" *= *(-?\\d+)");
   4205     UErrorCode status = U_ZERO_ERROR;
   4206     RegexMatcher m(name, params, 0, status);
   4207     if (m.find()) {
   4208         // The param exists.  Convert the string to an int.
   4209         char valString[100];
   4210         int32_t paramLength = m.end(1, status) - m.start(1, status);
   4211         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   4212             paramLength = (int32_t)(sizeof(valString)-2);
   4213         }
   4214         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   4215         val = strtol(valString,  NULL, 10);
   4216 
   4217         // Delete this parameter from the params string.
   4218         m.reset();
   4219         params = m.replaceFirst("", status);
   4220     }
   4221     U_ASSERT(U_SUCCESS(status));
   4222     return val;
   4223 }
   4224 #endif
   4225 
   4226 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   4227                                     BreakIterator *bi,
   4228                                     int expected[],
   4229                                     int expectedcount)
   4230 {
   4231     int count = 0;
   4232     int i = 0;
   4233     int forward[50];
   4234     bi->setText(ustr);
   4235     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4236         forward[count] = i;
   4237         if (count < expectedcount && expected[count] != i) {
   4238             test->errln("break forward test failed: expected %d but got %d",
   4239                         expected[count], i);
   4240             break;
   4241         }
   4242         count ++;
   4243     }
   4244     if (count != expectedcount) {
   4245         printStringBreaks(ustr, expected, expectedcount);
   4246         test->errln("break forward test failed: missed %d match",
   4247                     expectedcount - count);
   4248         return;
   4249     }
   4250     // testing boundaries
   4251     for (i = 1; i < expectedcount; i ++) {
   4252         int j = expected[i - 1];
   4253         if (!bi->isBoundary(j)) {
   4254             printStringBreaks(ustr, expected, expectedcount);
   4255             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   4256             return;
   4257         }
   4258         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   4259             if (bi->isBoundary(j)) {
   4260                 printStringBreaks(ustr, expected, expectedcount);
   4261                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   4262                 return;
   4263             }
   4264         }
   4265     }
   4266 
   4267     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   4268         count --;
   4269         if (forward[count] != i) {
   4270             printStringBreaks(ustr, expected, expectedcount);
   4271             test->errln("happy break test previous() failed: expected %d but got %d",
   4272                         forward[count], i);
   4273             break;
   4274         }
   4275     }
   4276     if (count != 0) {
   4277         printStringBreaks(ustr, expected, expectedcount);
   4278         test->errln("break test previous() failed: missed a match");
   4279         return;
   4280     }
   4281 
   4282     // testing preceding
   4283     for (i = 0; i < expectedcount - 1; i ++) {
   4284         // int j = expected[i] + 1;
   4285         int j = ustr.moveIndex32(expected[i], 1);
   4286         for (; j <= expected[i + 1]; j ++) {
   4287             if (bi->preceding(j) != expected[i]) {
   4288                 printStringBreaks(ustr, expected, expectedcount);
   4289                 test->errln("preceding(): Not expecting boundary at position %d", j);
   4290                 return;
   4291             }
   4292         }
   4293     }
   4294 }
   4295 
   4296 void RBBITest::TestWordBreaks(void)
   4297 {
   4298 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4299 
   4300     Locale        locale("en");
   4301     UErrorCode    status = U_ZERO_ERROR;
   4302     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4303     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4304     // Replaced any C+J characters in a row with a random sequence of characters
   4305     // of the same length to make our C+J segmentation not get in the way.
   4306     static const char *strlist[] =
   4307     {
   4308     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   4309     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   4310     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   4311     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   4312     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   4313     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4314     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   4315     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   4316     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4317     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4318     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4319     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4320     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4321     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4322     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   4323     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4324     "\\u0027\\u11af\\U000e0057\\u0602",
   4325     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4326     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4327     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4328     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4329     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4330     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4331     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4332     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4333     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4334     "\\u18f4\\U000e0049\\u20e7\\u2027",
   4335     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4336     "\\ua183\\u102d\\u0bec\\u003a",
   4337     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4338     "\\u003a\\u0e57\\u0fad\\u002e",
   4339     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4340     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4341     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   4342     "\\u003a\\u0664\\u00b7\\u1fba",
   4343     "\\u003b\\u0027\\u00b7\\u47a3",
   4344     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   4345     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   4346     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   4347     };
   4348     int loop;
   4349     if (U_FAILURE(status)) {
   4350         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4351         return;
   4352     }
   4353     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4354         // printf("looping %d\n", loop);
   4355         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   4356         // RBBICharMonkey monkey;
   4357         RBBIWordMonkey monkey;
   4358 
   4359         int expected[50];
   4360         int expectedcount = 0;
   4361 
   4362         monkey.setText(ustr);
   4363         int i;
   4364         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4365             expected[expectedcount ++] = i;
   4366         }
   4367 
   4368         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4369     }
   4370     delete bi;
   4371 #endif
   4372 }
   4373 
   4374 void RBBITest::TestWordBoundary(void)
   4375 {
   4376     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   4377     Locale        locale("en");
   4378     UErrorCode    status = U_ZERO_ERROR;
   4379     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4380     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4381     UChar         str[50];
   4382     static const char *strlist[] =
   4383     {
   4384     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4385     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4386     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4387     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4388     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4389     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4390     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4391     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4392     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4393     "\\u0027\\u11af\\U000e0057\\u0602",
   4394     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4395     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4396     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4397     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4398     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4399     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   4400     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4401     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4402     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4403     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4404     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4405     "\\ua183\\u102d\\u0bec\\u003a",
   4406     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4407     "\\u003a\\u0e57\\u0fad\\u002e",
   4408     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4409     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4410     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4411     "\\u003a\\u0664\\u00b7\\u1fba",
   4412     "\\u003b\\u0027\\u00b7\\u47a3",
   4413     };
   4414     int loop;
   4415     if (U_FAILURE(status)) {
   4416         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4417         return;
   4418     }
   4419     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4420         // printf("looping %d\n", loop);
   4421         u_unescape(strlist[loop], str, 20);
   4422         UnicodeString ustr(str);
   4423         int forward[50];
   4424         int count = 0;
   4425 
   4426         bi->setText(ustr);
   4427         int prev = 0;
   4428         int i;
   4429         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4430             forward[count ++] = i;
   4431             if (i > prev) {
   4432                 int j;
   4433                 for (j = prev + 1; j < i; j ++) {
   4434                     if (bi->isBoundary(j)) {
   4435                         printStringBreaks(ustr, forward, count);
   4436                         errln("happy boundary test failed: expected %d not a boundary",
   4437                                j);
   4438                         return;
   4439                     }
   4440                 }
   4441             }
   4442             if (!bi->isBoundary(i)) {
   4443                 printStringBreaks(ustr, forward, count);
   4444                 errln("happy boundary test failed: expected %d a boundary",
   4445                        i);
   4446                 return;
   4447             }
   4448             prev = i;
   4449         }
   4450     }
   4451     delete bi;
   4452 }
   4453 
   4454 void RBBITest::TestLineBreaks(void)
   4455 {
   4456 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4457     Locale        locale("en");
   4458     UErrorCode    status = U_ZERO_ERROR;
   4459     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4460     const int32_t  STRSIZE = 50;
   4461     UChar         str[STRSIZE];
   4462     static const char *strlist[] =
   4463     {
   4464      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4465      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4466              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4467      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4468              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4469      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4470      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4471      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4472      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4473      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4474      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4475      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4476      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4477      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4478      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4479      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4480      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4481      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4482      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4483      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4484      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4485      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4486      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4487      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4488      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4489      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4490      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4491      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4492      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4493      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4494      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4495      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4496      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4497      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4498      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4499      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4500      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4501      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4502      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4503      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4504      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4505      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4506          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4507          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4508          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4509      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4510          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4511     };
   4512     int loop;
   4513     TEST_ASSERT_SUCCESS(status);
   4514     if (U_FAILURE(status)) {
   4515         return;
   4516     }
   4517     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4518         // printf("looping %d\n", loop);
   4519         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4520         if (t >= STRSIZE) {
   4521             TEST_ASSERT(FALSE);
   4522             continue;
   4523         }
   4524 
   4525 
   4526         UnicodeString ustr(str);
   4527         RBBILineMonkey monkey;
   4528         if (U_FAILURE(monkey.deferredStatus)) {
   4529             continue;
   4530         }
   4531 
   4532         const int EXPECTEDSIZE = 50;
   4533         int expected[EXPECTEDSIZE];
   4534         int expectedcount = 0;
   4535 
   4536         monkey.setText(ustr);
   4537         int i;
   4538         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4539             if (expectedcount >= EXPECTEDSIZE) {
   4540                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4541                 return;
   4542             }
   4543             expected[expectedcount ++] = i;
   4544         }
   4545 
   4546         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4547     }
   4548     delete bi;
   4549 #endif
   4550 }
   4551 
   4552 void RBBITest::TestSentBreaks(void)
   4553 {
   4554 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4555     Locale        locale("en");
   4556     UErrorCode    status = U_ZERO_ERROR;
   4557     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4558     UChar         str[200];
   4559     static const char *strlist[] =
   4560     {
   4561      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4562      "This\n",
   4563      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4564      "\"Sentence ending with a quote.\" Bye.",
   4565      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4566      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4567      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4568      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4569      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4570      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4571      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4572              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4573              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4574              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4575      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4576              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4577              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4578              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4579              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4580              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4581     };
   4582     int loop;
   4583     if (U_FAILURE(status)) {
   4584         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4585         return;
   4586     }
   4587     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4588         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4589         UnicodeString ustr(str);
   4590 
   4591         RBBISentMonkey monkey;
   4592         if (U_FAILURE(monkey.deferredStatus)) {
   4593             continue;
   4594         }
   4595 
   4596         const int EXPECTEDSIZE = 50;
   4597         int expected[EXPECTEDSIZE];
   4598         int expectedcount = 0;
   4599 
   4600         monkey.setText(ustr);
   4601         int i;
   4602         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4603             if (expectedcount >= EXPECTEDSIZE) {
   4604                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4605                 return;
   4606             }
   4607             expected[expectedcount ++] = i;
   4608         }
   4609 
   4610         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4611     }
   4612     delete bi;
   4613 #endif
   4614 }
   4615 
   4616 void RBBITest::TestMonkey(char *params) {
   4617 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4618 
   4619     UErrorCode     status    = U_ZERO_ERROR;
   4620     int32_t        loopCount = 500;
   4621     int32_t        seed      = 1;
   4622     UnicodeString  breakType = "all";
   4623     Locale         locale("en");
   4624     UBool          useUText  = FALSE;
   4625 
   4626     if (quick == FALSE) {
   4627         loopCount = 10000;
   4628     }
   4629 
   4630     if (params) {
   4631         UnicodeString p(params);
   4632         loopCount = getIntParam("loop", p, loopCount);
   4633         seed      = getIntParam("seed", p, seed);
   4634 
   4635         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4636         if (m.find()) {
   4637             breakType = m.group(1, status);
   4638             m.reset();
   4639             p = m.replaceFirst("", status);
   4640         }
   4641 
   4642         RegexMatcher u(" *utext", p, 0, status);
   4643         if (u.find()) {
   4644             useUText = TRUE;
   4645             u.reset();
   4646             p = u.replaceFirst("", status);
   4647         }
   4648 
   4649 
   4650         // m.reset(p);
   4651         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4652             // Each option is stripped out of the option string as it is processed.
   4653             // All options have been checked.  The option string should have been completely emptied..
   4654             char buf[100];
   4655             p.extract(buf, sizeof(buf), NULL, status);
   4656             buf[sizeof(buf)-1] = 0;
   4657             errln("Unrecognized or extra parameter:  %s\n", buf);
   4658             return;
   4659         }
   4660 
   4661     }
   4662 
   4663     if (breakType == "char" || breakType == "all") {
   4664         RBBICharMonkey  m;
   4665         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4666         if (U_SUCCESS(status)) {
   4667             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4668             if (breakType == "all" && useUText==FALSE) {
   4669                 // Also run a quick test with UText when "all" is specified
   4670                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4671             }
   4672         }
   4673         else {
   4674             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4675         }
   4676         delete bi;
   4677     }
   4678 
   4679     if (breakType == "word" || breakType == "all") {
   4680         logln("Word Break Monkey Test");
   4681         RBBIWordMonkey  m;
   4682         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4683         if (U_SUCCESS(status)) {
   4684             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4685         }
   4686         else {
   4687             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4688         }
   4689         delete bi;
   4690     }
   4691 
   4692     if (breakType == "line" || breakType == "all") {
   4693         logln("Line Break Monkey Test");
   4694         RBBILineMonkey  m;
   4695         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4696         if (loopCount >= 10) {
   4697             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4698         }
   4699         if (U_SUCCESS(status)) {
   4700             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4701         }
   4702         else {
   4703             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4704         }
   4705         delete bi;
   4706     }
   4707 
   4708     if (breakType == "sent" || breakType == "all"  ) {
   4709         logln("Sentence Break Monkey Test");
   4710         RBBISentMonkey  m;
   4711         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4712         if (loopCount >= 10) {
   4713             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4714         }
   4715         if (U_SUCCESS(status)) {
   4716             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4717         }
   4718         else {
   4719             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4720         }
   4721         delete bi;
   4722     }
   4723 
   4724 #endif
   4725 }
   4726 
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi            - the break iterator to use
//       mk            - MonkeyKind, abstraction for obtaining expected results
//       name          - Name of test (char, word, etc.) for use in error messages
//       seed          - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check;
//                       -1 keeps the test looping indefinitely.
//       useUText      - If TRUE, the test text is given to the break iterator through
//                       a UText instead of directly as a UnicodeString.
//
   4736 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4737                          int32_t numIterations, UBool useUText) {
   4738 
   4739 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4740 
   4741     const int32_t    TESTSTRINGLEN = 500;
   4742     UnicodeString    testText;
   4743     int32_t          numCharClasses;
   4744     UVector          *chClasses;
   4745     int              expected[TESTSTRINGLEN*2 + 1];
   4746     int              expectedCount = 0;
   4747     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4748     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4749     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4750     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4751     char             followingBreaks[TESTSTRINGLEN*2+1];
   4752     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4753     int              i;
   4754     int              loopCount = 0;
   4755 
   4756     m_seed = seed;
   4757 
   4758     numCharClasses = mk.charClasses()->size();
   4759     chClasses      = mk.charClasses();
   4760 
   4761     // Check for errors that occured during the construction of the MonkeyKind object.
   4762     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4763     //  and is not visible outside of RBBITest :-(
   4764     if (U_FAILURE(mk.deferredStatus)) {
   4765         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4766         return;
   4767     }
   4768 
   4769     // Verify that the character classes all have at least one member.
   4770     for (i=0; i<numCharClasses; i++) {
   4771         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4772         if (s == NULL || s->size() == 0) {
   4773             errln("Character Class #%d is null or of zero size.", i);
   4774             return;
   4775         }
   4776     }
   4777 
   4778     while (loopCount < numIterations || numIterations == -1) {
   4779         if (numIterations == -1 && loopCount % 10 == 0) {
   4780             // If test is running in an infinite loop, display a periodic tic so
   4781             //   we can tell that it is making progress.
   4782             fprintf(stderr, ".");
   4783         }
   4784         // Save current random number seed, so that we can recreate the random numbers
   4785         //   for this loop iteration in event of an error.
   4786         seed = m_seed;
   4787 
   4788         // Populate a test string with data.
   4789         testText.truncate(0);
   4790         for (i=0; i<TESTSTRINGLEN; i++) {
   4791             int32_t  aClassNum = m_rand() % numCharClasses;
   4792             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4793             int32_t   charIdx = m_rand() % classSet->size();
   4794             UChar32   c = classSet->charAt(charIdx);
   4795             if (c < 0) {   // TODO:  deal with sets containing strings.
   4796                 errln("c < 0");
   4797                 break;
   4798             }
   4799             testText.append(c);
   4800         }
   4801 
   4802         // Calculate the expected results for this test string.
   4803         mk.setText(testText);
   4804         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4805         expectedBreaks[0] = 1;
   4806         int32_t breakPos = 0;
   4807         expectedCount = 0;
   4808         for (;;) {
   4809             breakPos = mk.next(breakPos);
   4810             if (breakPos == -1) {
   4811                 break;
   4812             }
   4813             if (breakPos > testText.length()) {
   4814                 errln("breakPos > testText.length()");
   4815             }
   4816             expectedBreaks[breakPos] = 1;
   4817             U_ASSERT(expectedCount<testText.length());
   4818             expected[expectedCount ++] = breakPos;
   4819         }
   4820 
   4821         // Find the break positions using forward iteration
   4822         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4823         if (useUText) {
   4824             UErrorCode status = U_ZERO_ERROR;
   4825             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4826             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4827             bi->setText(testUText, status);
   4828             TEST_ASSERT_SUCCESS(status);
   4829             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4830                                       //  This UText can be closed immediately, so long as the
   4831                                       //  testText string continues to exist.
   4832         } else {
   4833             bi->setText(testText);
   4834         }
   4835 
   4836         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4837             if (i < 0 || i > testText.length()) {
   4838                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4839                 break;
   4840             }
   4841             forwardBreaks[i] = 1;
   4842         }
   4843 
   4844         // Find the break positions using reverse iteration
   4845         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4846         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4847             if (i < 0 || i > testText.length()) {
   4848                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4849                 break;
   4850             }
   4851             reverseBreaks[i] = 1;
   4852         }
   4853 
   4854         // Find the break positions using isBoundary() tests.
   4855         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4856         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4857         for (i=0; i<=testText.length(); i++) {
   4858             isBoundaryBreaks[i] = bi->isBoundary(i);
   4859         }
   4860 
   4861 
   4862         // Find the break positions using the following() function.
   4863         // printf(".");
   4864         memset(followingBreaks, 0, sizeof(followingBreaks));
   4865         int32_t   lastBreakPos = 0;
   4866         followingBreaks[0] = 1;
   4867         for (i=0; i<testText.length(); i++) {
   4868             breakPos = bi->following(i);
   4869             if (breakPos <= i ||
   4870                 breakPos < lastBreakPos ||
   4871                 breakPos > testText.length() ||
   4872                 breakPos > lastBreakPos && lastBreakPos > i ) {
   4873                 errln("%s break monkey test: "
   4874                     "Out of range value returned by BreakIterator::following().\n"
   4875                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4876                          name, seed, i, breakPos, lastBreakPos);
   4877                 break;
   4878             }
   4879             followingBreaks[breakPos] = 1;
   4880             lastBreakPos = breakPos;
   4881         }
   4882 
   4883         // Find the break positions using the preceding() function.
   4884         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4885         lastBreakPos = testText.length();
   4886         precedingBreaks[testText.length()] = 1;
   4887         for (i=testText.length(); i>0; i--) {
   4888             breakPos = bi->preceding(i);
   4889             if (breakPos >= i ||
   4890                 breakPos > lastBreakPos ||
   4891                 breakPos < 0 && testText.getChar32Start(i)>0 ||
   4892                 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
   4893                 errln("%s break monkey test: "
   4894                     "Out of range value returned by BreakIterator::preceding().\n"
   4895                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4896                     name,  i, breakPos, lastBreakPos);
   4897                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4898                     precedingBreaks[i] = 2;   // Forces an error.
   4899                 }
   4900             } else {
   4901                 if (breakPos >= 0) {
   4902                     precedingBreaks[breakPos] = 1;
   4903                 }
   4904                 lastBreakPos = breakPos;
   4905             }
   4906         }
   4907 
   4908         // Compare the expected and actual results.
   4909         for (i=0; i<=testText.length(); i++) {
   4910             const char *errorType = NULL;
   4911             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4912                 errorType = "next()";
   4913             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4914                 errorType = "previous()";
   4915             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4916                 errorType = "isBoundary()";
   4917             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4918                 errorType = "following()";
   4919             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4920                 errorType = "preceding()";
   4921             }
   4922 
   4923 
   4924             if (errorType != NULL) {
   4925                 // Format a range of the test text that includes the failure as
   4926                 //  a data item that can be included in the rbbi test data file.
   4927 
   4928                 // Start of the range is the last point where expected and actual results
   4929                 //   both agreed that there was a break position.
   4930                 int startContext = i;
   4931                 int32_t count = 0;
   4932                 for (;;) {
   4933                     if (startContext==0) { break; }
   4934                     startContext --;
   4935                     if (expectedBreaks[startContext] != 0) {
   4936                         if (count == 2) break;
   4937                         count ++;
   4938                     }
   4939                 }
   4940 
   4941                 // End of range is two expected breaks past the start position.
   4942                 int endContext = i + 1;
   4943                 int ci;
   4944                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4945                     for (;;) {
   4946                         if (endContext >= testText.length()) {break;}
   4947                         if (expectedBreaks[endContext-1] != 0) {
   4948                             if (count == 0) break;
   4949                             count --;
   4950                         }
   4951                         endContext ++;
   4952                     }
   4953                 }
   4954 
   4955                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4956                 UnicodeString errorText = "<data>";
   4957                 /***if (strcmp(errorType, "next()") == 0) {
   4958                     startContext = 0;
   4959                     endContext = testText.length();
   4960 
   4961                     printStringBreaks(testText, expected, expectedCount);
   4962                 }***/
   4963 
   4964                 for (ci=startContext; ci<endContext;) {
   4965                     UnicodeString hexChars("0123456789abcdef");
   4966                     UChar32  c;
   4967                     int      bn;
   4968                     c = testText.char32At(ci);
   4969                     if (ci == i) {
   4970                         // This is the location of the error.
   4971                         errorText.append("<?>");
   4972                     } else if (expectedBreaks[ci] != 0) {
   4973                         // This a non-error expected break position.
   4974                         errorText.append("\\");
   4975                     }
   4976                     if (c < 0x10000) {
   4977                         errorText.append("\\u");
   4978                         for (bn=12; bn>=0; bn-=4) {
   4979                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4980                         }
   4981                     } else {
   4982                         errorText.append("\\U");
   4983                         for (bn=28; bn>=0; bn-=4) {
   4984                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4985                         }
   4986                     }
   4987                     ci = testText.moveIndex32(ci, 1);
   4988                 }
   4989                 errorText.append("\\");
   4990                 errorText.append("</data>\n");
   4991 
   4992                 // Output the error
   4993                 char  charErrorTxt[500];
   4994                 UErrorCode status = U_ZERO_ERROR;
   4995                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4996                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4997                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4998                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4999                     errorType, seed, i, charErrorTxt);
   5000                 break;
   5001             }
   5002         }
   5003 
   5004         loopCount++;
   5005     }
   5006 #endif
   5007 }
   5008 
   5009 //
   5010 //  TestDebug    -  A place-holder test for debugging purposes.
   5011 //                  For putting in fragments of other tests that can be invoked
   5012 //                  for tracing  without a lot of unwanted extra stuff happening.
   5013 //
   5014 void RBBITest::TestDebug(void) {
   5015 #if 0
   5016     UErrorCode   status = U_ZERO_ERROR;
   5017     int pos = 0;
   5018     int ruleStatus = 0;
   5019 
   5020     RuleBasedBreakIterator* bi =
   5021        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   5022        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   5023        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   5024     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   5025     // UnicodeString s("Aaa.  Bcd");
   5026     s = s.unescape();
   5027     bi->setText(s);
   5028     UBool r = bi->isBoundary(8);
   5029     printf("%s", r?"true":"false");
   5030     return;
   5031     pos = bi->last();
   5032     do {
   5033         // ruleStatus = bi->getRuleStatus();
   5034         printf("%d\t%d\n", pos, ruleStatus);
   5035         pos = bi->previous();
   5036     } while (pos != BreakIterator::DONE);
   5037 #endif
   5038 }
   5039 
   5040 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   5041