Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "unicode/utypeinfo.h"  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     27 #include "unicode/ustring.h"
     28 #include "unicode/utext.h"
     29 #include "intltest.h"
     30 #include "rbbitst.h"
     31 #include <string.h>
     32 #include "uvector.h"
     33 #include "uvectr32.h"
     34 #include "triedict.h"
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include <stdlib.h>
     38 #include "unicode/numfmt.h"
     39 #include "unicode/uscript.h"
     40 
// TEST_ASSERT(x): if the expression x is false, report a failure through errln()
//                 together with the source file and line of the macro use.
//                 errln() must be callable from the place where the macro is expanded.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it through
//                 errcheckln() with the source file, line and the error's name.
//                 Note that errcode is evaluated more than once.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     46 
     47 
     48 //---------------------------------------------
     49 // runIndexedTest
     50 //---------------------------------------------
     51 
     52 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     53 {
     54     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     55 
     56     switch (index) {
     57 #if !UCONFIG_NO_FILE_IO
     58         case 0: name = "TestBug4153072";
     59             if(exec) TestBug4153072();                         break;
     60 #else
     61         case 0: name = "skip";
     62             break;
     63 #endif
     64 
     65         case 1: name = "TestJapaneseLineBreak";
     66             if(exec) TestJapaneseLineBreak();                  break;
     67         case 2: name = "TestStatusReturn";
     68             if(exec) TestStatusReturn();                       break;
     69 
     70 #if !UCONFIG_NO_FILE_IO
     71         case 3: name = "TestUnicodeFiles";
     72             if(exec) TestUnicodeFiles();                       break;
     73         case 4: name = "TestEmptyString";
     74             if(exec) TestEmptyString();                        break;
     75 #else
     76         case 3: case 4: name = "skip";
     77             break;
     78 #endif
     79 
     80         case 5: name = "TestGetAvailableLocales";
     81             if(exec) TestGetAvailableLocales();                break;
     82 
     83         case 6: name = "TestGetDisplayName";
     84             if(exec) TestGetDisplayName();                     break;
     85 
     86 #if !UCONFIG_NO_FILE_IO
     87         case 7: name = "TestEndBehaviour";
     88             if(exec) TestEndBehaviour();                       break;
     89         case 8: name = "TestMixedThaiLineBreak";
     90              if(exec) TestMixedThaiLineBreak();                break;
     91         case 9: name = "TestThaiLineBreak";
     92              if(exec) TestThaiLineBreak();                     break;
     93         case 10: name = "TestMaiyamok";
     94              if(exec) TestMaiyamok();                          break;
     95         case 11: name = "TestWordBreaks";
     96              if(exec) TestWordBreaks();                        break;
     97         case 12: name = "TestWordBoundary";
     98              if(exec) TestWordBoundary();                      break;
     99         case 13: name = "TestLineBreaks";
    100              if(exec) TestLineBreaks();                        break;
    101         case 14: name = "TestSentBreaks";
    102              if(exec) TestSentBreaks();                        break;
    103         case 15: name = "TestExtended";
    104              if(exec) TestExtended();                          break;
    105 #else
    106         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    107              break;
    108 #endif
    109 
    110         case 16:
    111              if(exec) {
    112  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    113                name = "TestMonkey";
    114                TestMonkey(params);
    115  #else
    116                name = "skip";
    117  #endif
    118              }
    119                                                                break;
    120 
    121 #if !UCONFIG_NO_FILE_IO
    122         case 17: name = "TestBug3818";
    123             if(exec) TestBug3818();                            break;
    124         case 18: name = "TestJapaneseWordBreak";
    125             if(exec) TestJapaneseWordBreak();                  break;
    126 #else
    127         case 17: case 18: name = "skip";
    128             break;
    129 #endif
    130 
    131         case 19: name = "TestDebug";
    132             if(exec) TestDebug();                              break;
    133         case 20: name = "TestTrieDict";
    134             if(exec) TestTrieDict();                           break;
    135 
    136 #if !UCONFIG_NO_FILE_IO
    137         case 21: name = "TestBug5775";
    138             if (exec) TestBug5775();                           break;
    139         case 22: name = "TestThaiBreaks";
    140             if (exec) TestThaiBreaks();                        break;
    141         case 23: name = "TestTailoredBreaks";
    142             if (exec) TestTailoredBreaks();                    break;
    143         case 24: name = "TestTrieDictWithValue";
    144             if(exec) TestTrieDictWithValue();                  break;
    145 #else
    146         case 21: case 22: case 23: case 24: name = "skip";
    147             break;
    148 #endif
    149         case 25: name = "TestDictRules";
    150             if (exec) TestDictRules();                         break;
    151         case 25: name = "TestBug5532";
    152             if (exec) TestBug5532();                           break;
    153         default: name = ""; break; //needed to end loop
    154     }
    155 }
    156 
    157 
    158 //---------------------------------------------------------------------------
    159 //
    160 //   class BITestData   Holds a set of Break iterator test data and results
    161 //                      Includes
    162 //                         - the string data to be broken
    163 //                         - a vector of the expected break positions.
    164 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
    166 //                         - The expected break tag values.
    167 //                         - Vectors of actual break positions and tag values.
    168 //                         - Functions for comparing actual with expected and
    169 //                            reporting errors.
    170 //
    171 //----------------------------------------------------------------------------
    172 class BITestData {
    173 public:
    174     UnicodeString    fDataToBreak;
    175     UVector          fExpectedBreakPositions;
    176     UVector          fExpectedTags;
    177     UVector          fLineNum;
    178     UVector          fActualBreakPositions;   // Test Results.
    179     UVector          fActualTags;
    180 
    181     BITestData(UErrorCode &status);
    182     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    183     void             checkResults(const char *heading, RBBITest *test);
    184     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    185     void             clearResults();
    186 };
    187 
    188 //
    189 // Constructor.
    190 //
    191 BITestData::BITestData(UErrorCode &status)
    192 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    193   fActualTags(status)
    194 {
    195 }
    196 
    197 //
    198 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    199 //                 The macro form collects the line number, which is helpful
    200 //                 when tracking down failures.
    201 //
    202 //                 A null data item is inserted at the start of each test's data
    203 //                  to put the starting zero into the data list.  The position saved for
    204 //                  each non-null item is its ending position.
    205 //
    206 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    207 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    208     if (U_FAILURE(status)) {return;}
    209     if (data != NULL) {
    210         fDataToBreak.append(CharsToUnicodeString(data));
    211     }
    212     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    213     fExpectedTags.addElement(tag, status);
    214     fLineNum.addElement(lineNum, status);
    215 }
    216 
    217 
    218 //
    219 //  checkResults.   Compare the actual and expected break positions, report any differences.
    220 //
    221 void BITestData::checkResults(const char *heading, RBBITest *test) {
    222     int32_t   expectedIndex = 0;
    223     int32_t   actualIndex = 0;
    224 
    225     for (;;) {
    226         // If we've run through both the expected and actual results vectors, we're done.
    227         //   break out of the loop.
    228         if (expectedIndex >= fExpectedBreakPositions.size() &&
    229             actualIndex   >= fActualBreakPositions.size()) {
    230             break;
    231         }
    232 
    233 
    234         if (expectedIndex >= fExpectedBreakPositions.size()) {
    235             err(heading, test, expectedIndex-1, actualIndex);
    236             actualIndex++;
    237             continue;
    238         }
    239 
    240         if (actualIndex >= fActualBreakPositions.size()) {
    241             err(heading, test, expectedIndex, actualIndex-1);
    242             expectedIndex++;
    243             continue;
    244         }
    245 
    246         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    247             err(heading, test, expectedIndex, actualIndex);
    248             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    249             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    250                 actualIndex++;
    251             } else {
    252                 expectedIndex++;
    253             }
    254             continue;
    255         }
    256 
    257         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    258             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    259                 heading, fLineNum.elementAt(expectedIndex),
    260                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    261         }
    262 
    263         actualIndex++;
    264         expectedIndex++;
    265     }
    266 }
    267 
    268 //
    269 //  err   -  An error was found.  Report it, along with information about where the
    270 //                                incorrectly broken test data appeared in the source file.
    271 //
    272 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    273 {
    274     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    275     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    276     int32_t   o        = 0;
    277     int32_t   line     = fLineNum.elementAti(expectedIdx);
    278     if (expectedIdx > 0) {
    279         // The line numbers are off by one because a premature break occurs somewhere
    280         //    within the previous item, rather than at the start of the current (expected) item.
    281         //    We want to report the offset of the unexpected break from the start of
    282         //      this previous item.
    283         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    284     }
    285     if (actual < expected) {
    286         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    287     } else {
    288         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    289     }
    290 }
    291 
    292 
    293 void BITestData::clearResults() {
    294     fActualBreakPositions.removeAllElements();
    295     fActualTags.removeAllElements();
    296 }
    297 
    298 
    299 //-----------------------------------------------------------------------------------
    300 //
//    Canned Test Characters
    302 //
    303 //-----------------------------------------------------------------------------------
    304 
// A fixed list of UTF-16 code units, terminated by 0x0000.  The RBBITest constructor
// turns it into the contents of cannedTestChars.
static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// File-level string built from cannedTestArray.  It is allocated by the RBBITest
// constructor and deleted by the RBBITest destructor; it is null before the first
// RBBITest object has been constructed.
static UnicodeString* cannedTestChars = 0;
    317 
// Devanagari test fragments written as escaped text (the backslashes are doubled, so
// the C string holds the characters \uXXXX rather than the code points themselves).
// Each "half" form is a consonant followed by U+094D and U+200D; deadTA is a
// consonant followed by U+094D only.
// NOTE(review): none of these macros is referenced in the part of the file reviewed
//               here -- confirm that they are still used before relying on them.
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
    323 
    324 //--------------------------------------------------------------------------------------
    325 //
    326 //    RBBITest    constructor and destructor
    327 //
    328 //--------------------------------------------------------------------------------------
    329 
    330 RBBITest::RBBITest() {
    331     UnicodeString temp(cannedTestArray);
    332     cannedTestChars = new UnicodeString();
    333     *cannedTestChars += (UChar)0x0000;
    334     *cannedTestChars += temp;
    335 }
    336 
    337 
    338 RBBITest::~RBBITest() {
    339     delete cannedTestChars;
    340 }
    341 
    342 
// Named tag values.
// NOTE(review): presumably these are rule status (tag) values expected from the word
//               break rules; nothing in the part of the file reviewed here refers to
//               them by name, so confirm against their users.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;
    347 
    348 
    349 
    350 
    351 
    352 
    353 //--------------------------------------------------------------------
    354 //Testing the BreakIterator for devanagari script
    355 //--------------------------------------------------------------------
    356 
// More Devanagari fragments as escaped text (doubled backslashes, as above in this
// file's other test strings).  Every "dead" form is a consonant followed by U+094D.
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"
#define deadPA   "\\u092a\\u094d"
#define deadSA   "\\u0938\\u094d"
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    363 
    364 
    365 
    366 
    367 
    368 
    369 //-----------------------------------------------------------------------------------
    370 //
    371 //   Test for status {tag} return value from break rules.
    372 //        TODO:  a more thorough test.
    373 //
    374 //-----------------------------------------------------------------------------------
    375 void RBBITest::TestStatusReturn() {
    376      UnicodeString rulesString1("$Letters = [:L:];\n"
    377                                   "$Numbers = [:N:];\n"
    378                                   "$Letters+{1};\n"
    379                                   "$Numbers+{2};\n"
    380                                   "Help\\ {4}/me\\!;\n"
    381                                   "[^$Letters $Numbers];\n"
    382                                   "!.*;\n", -1, US_INV);
    383      UnicodeString testString1  = "abc123..abc Help me Help me!";
    384                                 // 01234567890123456789012345678
    385      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    386      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    387 
    388      UErrorCode status=U_ZERO_ERROR;
    389      UParseError    parseError;
    390 
    391      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    392      if(U_FAILURE(status)) {
    393          dataerrln("FAIL : in construction - %s", u_errorName(status));
    394      } else {
    395          int32_t  pos;
    396          int32_t  i = 0;
    397          bi->setText(testString1);
    398          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    399              if (pos != bounds1[i]) {
    400                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    401                  break;
    402              }
    403 
    404              int tag = bi->getRuleStatus();
    405              if (tag != brkStatus[i]) {
    406                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    407                  break;
    408              }
    409              i++;
    410          }
    411      }
    412      delete bi;
    413 }
    414 
    415 
    416 static void printStringBreaks(UnicodeString ustr, int expected[],
    417                               int expectedcount)
    418 {
    419     UErrorCode status = U_ZERO_ERROR;
    420     char name[100];
    421     printf("code    alpha extend alphanum type word sent line name\n");
    422     int j;
    423     for (j = 0; j < ustr.length(); j ++) {
    424         if (expectedcount > 0) {
    425             int k;
    426             for (k = 0; k < expectedcount; k ++) {
    427                 if (j == expected[k]) {
    428                     printf("------------------------------------------------ %d\n",
    429                            j);
    430                 }
    431             }
    432         }
    433         UChar32 c = ustr.char32At(j);
    434         if (c > 0xffff) {
    435             j ++;
    436         }
    437         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    438         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    439                            u_isUAlphabetic(c),
    440                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    441                            u_isalnum(c),
    442                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    443                                                   u_charType(c),
    444                                                   U_SHORT_PROPERTY_NAME),
    445                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    446                                                   u_getIntPropertyValue(c,
    447                                                           UCHAR_WORD_BREAK),
    448                                                   U_SHORT_PROPERTY_NAME),
    449                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    450                                    u_getIntPropertyValue(c,
    451                                            UCHAR_SENTENCE_BREAK),
    452                                    U_SHORT_PROPERTY_NAME),
    453                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    454                                    u_getIntPropertyValue(c,
    455                                            UCHAR_LINE_BREAK),
    456                                    U_SHORT_PROPERTY_NAME),
    457                            name);
    458     }
    459 }
    460 
    461 void RBBITest::TestThaiLineBreak() {
    462     UErrorCode status = U_ZERO_ERROR;
    463     BITestData thaiLineSelection(status);
    464 
    465     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    466     // represents elided letters at the end of a long word.  It should be bound to
    467     // the end of the word and not treated as an independent punctuation mark.
    468 
    469 
    470     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    471     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    472     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    473     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    474     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    475 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    476 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    477     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    478     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    479     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    480     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    481     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    482     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    483     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    484     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    485 
    486     // the one time where the paiyannoi occurs somewhere other than at the end
    487     // of a word is in the Thai abbrevation for "etc.", which both begins and
    488     // ends with a paiyannoi
    489     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    490     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    491     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    492 
    493     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    494         Locale("th"), status);
    495     if (U_FAILURE(status))
    496     {
    497         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    498         return;
    499     }
    500 
    501     generalIteratorTest(*e, thaiLineSelection);
    502     delete e;
    503 }
    504 
    505 
    506 
    507 void RBBITest::TestMixedThaiLineBreak()
    508 {
    509     UErrorCode   status = U_ZERO_ERROR;
    510     BITestData   thaiLineSelection(status);
    511 
    512     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    513 
    514 
    515     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    516     // start
    517 
    518     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    519     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    520     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    521     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    522     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    523     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    524     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    525     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    526     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    527     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    528     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    529     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    530     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    531     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    532     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    533     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    534 
    535     // @suwit - end of changes
    536 
    537 
    538     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    539     if (U_FAILURE(status))
    540     {
    541         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    542         return;
    543     }
    544 
    545 
    546     generalIteratorTest(*e, thaiLineSelection);
    547     delete e;
    548 }
    549 
    550 
    551 void RBBITest::TestMaiyamok()
    552 {
    553     UErrorCode status = U_ZERO_ERROR;
    554     BITestData   thaiLineSelection(status);
    555     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    556     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    557     // word".  Instead of appearing as a word unto itself, however, it's kept together
    558     // with the word before it
    559     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    560     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    561     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    562     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    563     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    564     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    565     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    566     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    567     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    568 
    569     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    570         Locale("th"), status);
    571 
    572     if (U_FAILURE(status))
    573     {
    574         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    575         return;
    576     }
    577     generalIteratorTest(*e, thaiLineSelection);
    578     delete e;
    579 }
    580 
    581 
    582 
    583 void RBBITest::TestBug3818() {
    584     UErrorCode  status = U_ZERO_ERROR;
    585 
    586     // Four Thai words...
    587     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    588                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    589     UnicodeString  thaiStr(thaiWordData);
    590 
    591     RuleBasedBreakIterator* bi =
    592         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    593     if (U_FAILURE(status) || bi == NULL) {
    594         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    595         return;
    596     }
    597     bi->setText(thaiStr);
    598 
    599     int32_t  startOfSecondWord = bi->following(1);
    600     if (startOfSecondWord != 4) {
    601         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    602             __FILE__, __LINE__, startOfSecondWord);
    603     }
    604     startOfSecondWord = bi->following(0);
    605     if (startOfSecondWord != 4) {
    606         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    607             __FILE__, __LINE__, startOfSecondWord);
    608     }
    609     delete bi;
    610 }
    611 
    612 
//
//  Japanese word break test.  The whole body is compiled out with "#if 0", so this
//  test currently does nothing when it is run.  The disabled code builds a
//  BITestData set and would run generalIteratorTest() with a word break iterator
//  for the "ja" locale.
//
void RBBITest::TestJapaneseWordBreak() {
// TODO: Rewrite this test for a dictionary-based word breaking.
#if 0
    UErrorCode status = U_ZERO_ERROR;
    BITestData   japaneseWordSelection(status);

    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12

    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
        Locale("ja"), status);
    if (U_FAILURE(status))
    {
        errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
        return;
    }

    generalIteratorTest(*e, japaneseWordSelection);
    delete e;
#endif
}
    639 
    640 void RBBITest::TestTrieDict() {
    641     UErrorCode      status  = U_ZERO_ERROR;
    642 
    643     //
    644     //  Open and read the test data file.
    645     //
    646     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    647     char testFileName[1000];
    648     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    649         errln("Can't open test data.  Path too long.");
    650         return;
    651     }
    652     strcpy(testFileName, testDataDirectory);
    653     strcat(testFileName, "riwords.txt");
    654 
    655     // Items needing deleting at the end
    656     MutableTrieDictionary *mutableDict = NULL;
    657     CompactTrieDictionary *compactDict = NULL;
    658     UnicodeSet            *breaks      = NULL;
    659     UChar                 *testFile    = NULL;
    660     StringEnumeration     *enumer1     = NULL;
    661     StringEnumeration     *enumer2     = NULL;
    662     MutableTrieDictionary *mutable2    = NULL;
    663     StringEnumeration     *cloneEnum   = NULL;
    664     CompactTrieDictionary *compact2    = NULL;
    665 
    666 
    667     const UnicodeString *originalWord = NULL;
    668     const UnicodeString *cloneWord    = NULL;
    669     UChar *current;
    670     UChar *word;
    671     UChar uc;
    672     int32_t wordLen;
    673     int32_t wordCount;
    674     int32_t testCount;
    675 
    676     int    len;
    677     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    678     if (U_FAILURE(status)) {
    679         goto cleanup; /* something went wrong, error already output */
    680     }
    681 
    682     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    683     if (U_FAILURE(status)) {
    684         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    685         goto cleanup;
    686     }
    687 
    688     breaks = new UnicodeSet;
    689     breaks->add(0x000A);     // Line Feed
    690     breaks->add(0x000D);     // Carriage Return
    691     breaks->add(0x2028);     // Line Separator
    692     breaks->add(0x2029);     // Paragraph Separator
    693 
    694     // Now add each non-comment line of the file as a word.
    695     current = testFile;
    696     word = current;
    697     uc = *current++;
    698     wordLen = 0;
    699     wordCount = 0;
    700 
    701     while (uc) {
    702         if (uc == 0x0023) {     // #comment line, skip
    703             while (uc && !breaks->contains(uc)) {
    704                 uc = *current++;
    705             }
    706         }
    707         else while (uc && !breaks->contains(uc)) {
    708             ++wordLen;
    709             uc = *current++;
    710         }
    711         if (wordLen > 0) {
    712             mutableDict->addWord(word, wordLen, status);
    713             if (U_FAILURE(status)) {
    714                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    715                 goto cleanup;
    716             }
    717             wordCount += 1;
    718         }
    719 
    720         // Find beginning of next line
    721         while (uc && breaks->contains(uc)) {
    722             uc = *current++;
    723         }
    724         word = current-1;
    725         wordLen = 0;
    726     }
    727 
    728     if (wordCount < 50) {
    729         errln("Word count (%d) unreasonably small\n", wordCount);
    730         goto cleanup;
    731     }
    732 
    733     enumer1 = mutableDict->openWords(status);
    734     if (U_FAILURE(status)) {
    735         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    736         goto cleanup;
    737     }
    738 
    739     testCount = 0;
    740     if (wordCount != (testCount = enumer1->count(status))) {
    741         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    742             testCount, wordCount, u_errorName(status));
    743         goto cleanup;
    744     }
    745 
    746     // Now compact it
    747     compactDict = new CompactTrieDictionary(*mutableDict, status);
    748     if (U_FAILURE(status)) {
    749         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    750         goto cleanup;
    751     }
    752 
    753     enumer2 = compactDict->openWords(status);
    754     if (U_FAILURE(status)) {
    755         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    756         goto cleanup;
    757     }
    758 
    759     if (wordCount != (testCount = enumer2->count(status))) {
    760         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    761             testCount, wordCount, u_errorName(status));
    762         goto cleanup;
    763     }
    764 
    765     if (typeid(*enumer1) == typeid(*enumer2)) {
    766         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
    767     }
    768     delete enumer1;
    769     enumer1 = NULL;
    770     delete enumer2;
    771     enumer2 = NULL;
    772 
    773     // Now un-compact it
    774     mutable2 = compactDict->cloneMutable(status);
    775     if (U_FAILURE(status)) {
    776         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    777         goto cleanup;
    778     }
    779 
    780     cloneEnum = mutable2->openWords(status);
    781     if (U_FAILURE(status)) {
    782         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    783         goto cleanup;
    784     }
    785 
    786     if (wordCount != (testCount = cloneEnum->count(status))) {
    787         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    788             testCount, wordCount, u_errorName(status));
    789         goto cleanup;
    790     }
    791 
    792     // Compact original dictionary to clone. Note that we can only compare the same kind of
    793     // dictionary as the order of the enumerators is not guaranteed to be the same between
    794     // different kinds
    795     enumer1 = mutableDict->openWords(status);
    796     if (U_FAILURE(status)) {
    797         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    798         goto cleanup;
    799      }
    800 
    801     originalWord = enumer1->snext(status);
    802     cloneWord = cloneEnum->snext(status);
    803     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    804         if (*originalWord != *cloneWord) {
    805             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    806             goto cleanup;
    807         }
    808         originalWord = enumer1->snext(status);
    809         cloneWord = cloneEnum->snext(status);
    810     }
    811 
    812     if (U_FAILURE(status)) {
    813         errln("Enumeration failed: %s\n", u_errorName(status));
    814         goto cleanup;
    815     }
    816 
    817     if (originalWord != cloneWord) {
    818         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    819         goto cleanup;
    820     }
    821 
    822     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    823     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    824     if (U_FAILURE(status)) {
    825         errln("CompactTrieDictionary(const void *,...) failed\n");
    826         goto cleanup;
    827     }
    828 
    829     if (compact2->dataSize() == 0) {
    830         errln("CompactTrieDictionary->dataSize() == 0\n");
    831         goto cleanup;
    832     }
    833 
    834     // Now count the words via the second dictionary
    835     delete enumer1;
    836     enumer1 = compact2->openWords(status);
    837     if (U_FAILURE(status)) {
    838         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    839         goto cleanup;
    840     }
    841 
    842     if (wordCount != (testCount = enumer1->count(status))) {
    843         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    844             testCount, wordCount, u_errorName(status));
    845         goto cleanup;
    846     }
    847 
    848 cleanup:
    849     delete compactDict;
    850     delete mutableDict;
    851     delete breaks;
    852     delete[] testFile;
    853     delete enumer1;
    854     delete mutable2;
    855     delete cloneEnum;
    856     delete compact2;
    857 }
    858 
    859 /*TODO: delete later*/
    860 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
    861     UErrorCode      status  = U_ZERO_ERROR;
    862     FILE *outfile = fopen(filename,"w");
    863     UConverter *cvt = ucnv_open("UTF-8", &status);
    864     if (U_FAILURE(status))
    865         return;
    866     if(outfile != NULL){
    867         status = U_ZERO_ERROR;
    868         const UnicodeString *word = enumer->snext(status);
    869         while (word != NULL && U_SUCCESS(status)) {
    870             char u8word[500];
    871             status = U_ZERO_ERROR;
    872             ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
    873                     &status);
    874             fprintf(outfile,"%s\n", u8word);
    875             status = U_ZERO_ERROR;
    876             word = enumer->snext(status);
    877         }
    878         fclose(outfile);
    879     }
    880     ucnv_close(cvt);
    881 }
    882 
    // A very simple helper class to streamline the buffer handling in
    // TestTrieDictWithValue
    //
    // An AutoBuffer<T, N> provides storage for `size` elements of T.  Requests
    // of up to N elements are served from an array embedded in the object
    // itself; larger requests get a heap array that the destructor releases.
    // Elements are not given any particular initial value by this class.
    //
    // The object is the sole owner of its storage, so it cannot be
    // default-constructed, copied or assigned: a copy would either free the
    // same heap array twice or point into another object's embedded array.
    template<class T, size_t N>
    class AutoBuffer {
     public:
      // Makes room for `size` elements; allocates only when size exceeds N.
      explicit AutoBuffer(size_t size) : buffer(stackBuffer) {
        if (size > N)
          buffer = new T[size];
      }
      // Frees the heap array, if one was needed.
      ~AutoBuffer() {
        if (buffer != stackBuffer)
          delete [] buffer;
      }
      // Pointer to the first element, for APIs that take a raw array.
      T* elems() {
        return buffer;
      }
      // Unchecked element access; i must be below the size given at construction.
      const T& operator[] (size_t i) const {
        return buffer[i];
      }
      T& operator[] (size_t i) {
        return buffer[i];
      }
     private:
      T stackBuffer[N];   // embedded storage used for small sizes
      T* buffer;          // stackBuffer, or the heap array for sizes above N
      // Declared but never defined: construction without a size and copying
      // are not supported.
      AutoBuffer();
      AutoBuffer(const AutoBuffer&);
      AutoBuffer& operator=(const AutoBuffer&);
    };
    910 
    911 //----------------------------------------------------------------------------
    912 //
    913 // TestTrieDictWithValue    Test trie dictionaries with logprob values and
    914 // more than 2^16 nodes after compaction.
    915 //
    916 //----------------------------------------------------------------------------
    917 void RBBITest::TestTrieDictWithValue() {
    918     UErrorCode      status  = U_ZERO_ERROR;
    919 
    920     //
    921     //  Open and read the test data file.
    922     //
    923     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    924     const char *filename = "cjdict-truncated.txt";
    925     char testFileName[1000];
    926     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
    927         errln("Can't open test data.  Path too long.");
    928         return;
    929     }
    930     strcpy(testFileName, testDataDirectory);
    931     strcat(testFileName, filename);
    932 
    933     // Items needing deleting at the end
    934     MutableTrieDictionary *mutableDict = NULL;
    935     CompactTrieDictionary *compactDict = NULL;
    936     UnicodeSet            *breaks      = NULL;
    937     UChar                 *testFile    = NULL;
    938     StringEnumeration     *enumer1     = NULL;
    939     StringEnumeration     *enumer2     = NULL;
    940     MutableTrieDictionary *mutable2    = NULL;
    941     StringEnumeration     *cloneEnum   = NULL;
    942     CompactTrieDictionary *compact2    = NULL;
    943     NumberFormat          *nf           = NULL;
    944     UText *originalText = NULL, *cloneText = NULL;
    945 
    946     const UnicodeString *originalWord = NULL;
    947     const UnicodeString *cloneWord    = NULL;
    948     UChar *current;
    949     UChar *word;
    950     UChar uc;
    951     int32_t wordLen;
    952     int32_t wordCount;
    953     int32_t testCount;
    954     int32_t valueLen;
    955     int counter = 0;
    956 
    957     int    len;
    958     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    959     if (U_FAILURE(status)) {
    960         goto cleanup; /* something went wrong, error already output */
    961     }
    962 
    963     mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
    964     if (U_FAILURE(status)) {
    965         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    966         goto cleanup;
    967     }
    968 
    969     breaks = new UnicodeSet;
    970     breaks->add(0x000A);     // Line Feed
    971     breaks->add(0x000D);     // Carriage Return
    972     breaks->add(0x2028);     // Line Separator
    973     breaks->add(0x2029);     // Paragraph Separator
    974     breaks->add(0x0009);     // Tab character
    975 
    976     // Now add each non-comment line of the file as a word.
    977     current = testFile;
    978     word = current;
    979     uc = *current++;
    980     wordLen = 0;
    981     wordCount = 0;
    982     nf = NumberFormat::createInstance(status);
    983 
    984     while (uc) {
    985         UnicodeString ucharValue;
    986         valueLen = 0;
    987 
    988         if (uc == 0x0023) {     // #comment line, skip
    989             while (uc && !breaks->contains(uc)) {
    990                 uc = *current++;
    991             }
    992         }
    993         else{
    994             while (uc && !breaks->contains(uc)) {
    995                 ++wordLen;
    996                 uc = *current++;
    997             }
    998             if(uc == 0x0009){ //separator is a tab char, read in num after tab
    999                 uc = *current++;
   1000                 while (uc && !breaks->contains(uc)) {
   1001                     ucharValue.append(uc);
   1002                     uc = *current++;
   1003                 }
   1004             }
   1005         }
   1006         if (wordLen > 0) {
   1007             Formattable value((int32_t)0);
   1008             nf->parse(ucharValue.getTerminatedBuffer(), value, status);
   1009 
   1010             if(U_FAILURE(status)){
   1011                 errln("parsing of value failed when reading in dictionary\n");
   1012                 goto cleanup;
   1013             }
   1014             mutableDict->addWord(word, wordLen, status, value.getLong());
   1015             if (U_FAILURE(status)) {
   1016                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
   1017                 goto cleanup;
   1018             }
   1019             wordCount += 1;
   1020         }
   1021 
   1022         // Find beginning of next line
   1023         while (uc && breaks->contains(uc)) {
   1024             uc = *current++;
   1025         }
   1026         word = current-1;
   1027         wordLen = 0;
   1028     }
   1029 
   1030     if (wordCount < 50) {
   1031         errln("Word count (%d) unreasonably small\n", wordCount);
   1032         goto cleanup;
   1033     }
   1034 
   1035     enumer1 = mutableDict->openWords(status);
   1036     if (U_FAILURE(status)) {
   1037         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
   1038         goto cleanup;
   1039     }
   1040 
   1041     testCount = 0;
   1042     if (wordCount != (testCount = enumer1->count(status))) {
   1043         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   1044                 testCount, wordCount, u_errorName(status));
   1045         goto cleanup;
   1046     }
   1047 
   1048     // Now compact it
   1049     compactDict = new CompactTrieDictionary(*mutableDict, status);
   1050     if (U_FAILURE(status)) {
   1051         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
   1052         goto cleanup;
   1053     }
   1054 
   1055     enumer2 = compactDict->openWords(status);
   1056     if (U_FAILURE(status)) {
   1057         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
   1058         goto cleanup;
   1059     }
   1060 
   1061 
   1062     //delete later
   1063 //    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
   1064 //    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
   1065 
   1066     enumer1->reset(status);
   1067     enumer2->reset(status);
   1068 
   1069     originalWord = enumer1->snext(status);
   1070     cloneWord = enumer2->snext(status);
   1071     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   1072         if (*originalWord != *cloneWord) {
   1073             errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
   1074                     counter, originalWord->length(), cloneWord->length());
   1075             goto cleanup;
   1076         }
   1077 
   1078         // check if attached values of the same word in both dictionaries tally
   1079 #if 0
   1080         int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
   1081         uint16_t values1[originalWord->length()], values2[cloneWord->length()];
   1082 #endif
   1083         AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   1084         AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   1085         AutoBuffer<uint16_t, 20> values1(originalWord->length());
   1086         AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   1087 
   1088         originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   1089         cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   1090 
   1091         int count1, count2;
   1092         mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   1093         compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   1094 
   1095         if(values1[count1-1] != values2[count2-1]){
   1096             errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
   1097                   counter, values1[count1-1], values2[count2-1]);
   1098             goto cleanup;
   1099         }
   1100 
   1101         counter++;
   1102         originalWord = enumer1->snext(status);
   1103         cloneWord = enumer2->snext(status);
   1104     }
   1105     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
   1106         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
   1107     }
   1108 
   1109     delete enumer1;
   1110     enumer1 = NULL;
   1111     delete enumer2;
   1112     enumer2 = NULL;
   1113 
   1114     // Now un-compact it
   1115     mutable2 = compactDict->cloneMutable(status);
   1116     if (U_FAILURE(status)) {
   1117         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
   1118         goto cleanup;
   1119     }
   1120 
   1121     cloneEnum = mutable2->openWords(status);
   1122     if (U_FAILURE(status)) {
   1123         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
   1124         goto cleanup;
   1125     }
   1126 
   1127     if (wordCount != (testCount = cloneEnum->count(status))) {
   1128         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   1129                 testCount, wordCount, u_errorName(status));
   1130         goto cleanup;
   1131     }
   1132 
   1133     // Compact original dictionary to clone. Note that we can only compare the same kind of
   1134     // dictionary as the order of the enumerators is not guaranteed to be the same between
   1135     // different kinds
   1136     enumer1 = mutableDict->openWords(status);
   1137     if (U_FAILURE(status)) {
   1138         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
   1139         goto cleanup;
   1140     }
   1141 
   1142     counter = 0;
   1143     originalWord = enumer1->snext(status);
   1144     cloneWord = cloneEnum->snext(status);
   1145     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   1146         if (*originalWord != *cloneWord) {
   1147             errln("Original and cloned MutableTrieDictionary word mismatch\n");
   1148             goto cleanup;
   1149         }
   1150 
   1151         // check if attached values of the same word in both dictionaries tally
   1152         AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   1153         AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   1154         AutoBuffer<uint16_t, 20> values1(originalWord->length());
   1155         AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   1156         originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   1157         cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   1158 
   1159         int count1, count2;
   1160         mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   1161         mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   1162 
   1163         if(values1[count1-1] != values2[count2-1]){
   1164             errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
   1165                   counter, values1[count1-1], values2[count2-1]);
   1166             goto cleanup;
   1167         }
   1168 
   1169         counter++;
   1170 
   1171         originalWord = enumer1->snext(status);
   1172         cloneWord = cloneEnum->snext(status);
   1173     }
   1174 
   1175     if (U_FAILURE(status)) {
   1176         errln("Enumeration failed: %s\n", u_errorName(status));
   1177         goto cleanup;
   1178     }
   1179 
   1180     if (originalWord != cloneWord) {
   1181         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
   1182         goto cleanup;
   1183     }
   1184 
   1185     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
   1186     compact2 = new CompactTrieDictionary(compactDict->data(), status);
   1187     if (U_FAILURE(status)) {
   1188         errln("CompactTrieDictionary(const void *,...) failed\n");
   1189         goto cleanup;
   1190     }
   1191 
   1192     if (compact2->dataSize() == 0) {
   1193         errln("CompactTrieDictionary->dataSize() == 0\n");
   1194         goto cleanup;
   1195     }
   1196 
   1197     // Now count the words via the second dictionary
   1198     delete enumer1;
   1199     enumer1 = compact2->openWords(status);
   1200     if (U_FAILURE(status)) {
   1201         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
   1202         goto cleanup;
   1203     }
   1204 
   1205     if (wordCount != (testCount = enumer1->count(status))) {
   1206         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
   1207                 testCount, wordCount, u_errorName(status));
   1208         goto cleanup;
   1209     }
   1210 
   1211     cleanup:
   1212     delete compactDict;
   1213     delete mutableDict;
   1214     delete breaks;
   1215     delete[] testFile;
   1216     delete enumer1;
   1217     delete mutable2;
   1218     delete cloneEnum;
   1219     delete compact2;
   1220     utext_close(originalText);
   1221     utext_close(cloneText);
   1222 
   1223 
   1224 }
   1225 
   1226 //----------------------------------------------------------------------------
   1227 //
   1228 // generalIteratorTest      Given a break iterator and a set of test data,
   1229 //                          Run the tests and report the results.
   1230 //
   1231 //----------------------------------------------------------------------------
   1232 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
   1233 {
   1234 
   1235     bi.setText(td.fDataToBreak);
   1236 
   1237     testFirstAndNext(bi, td);
   1238 
   1239     testLastAndPrevious(bi, td);
   1240 
   1241     testFollowing(bi, td);
   1242     testPreceding(bi, td);
   1243     testIsBoundary(bi, td);
   1244     doMultipleSelectionTest(bi, td);
   1245 }
   1246 
   1247 
   1248 //
   1249 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
   1250 //                       kind of loop.
   1251 //
   1252 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
   1253 {
   1254     UErrorCode  status = U_ZERO_ERROR;
   1255     int32_t     p;
   1256     int32_t     lastP = -1;
   1257     int32_t     tag;
   1258 
   1259     logln("Test first and next");
   1260     bi.setText(td.fDataToBreak);
   1261     td.clearResults();
   1262 
   1263     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
   1264         td.fActualBreakPositions.addElement(p, status);  // Save result.
   1265         tag = bi.getRuleStatus();
   1266         td.fActualTags.addElement(tag, status);
   1267         if (p <= lastP) {
   1268             // If the iterator is not making forward progress, stop.
   1269             //  No need to raise an error here, it'll be detected in the normal check of results.
   1270             break;
   1271         }
   1272         lastP = p;
   1273     }
   1274     td.checkResults("testFirstAndNext", this);
   1275 }
   1276 
   1277 
   1278 //
   1279 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
   1280 //
   1281 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
   1282 {
   1283     UErrorCode  status = U_ZERO_ERROR;
   1284     int32_t     p;
   1285     int32_t     lastP  = 0x7ffffffe;
   1286     int32_t     tag;
   1287 
   1288     logln("Test last and previous");
   1289     bi.setText(td.fDataToBreak);
   1290     td.clearResults();
   1291 
   1292     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
   1293         // Save break position.  Insert it at start of vector of results, shoving
   1294         //    already-saved results further towards the end.
   1295         td.fActualBreakPositions.insertElementAt(p, 0, status);
   1296         // bi.previous();   // TODO:  Why does this fix things up????
   1297         // bi.next();
   1298         tag = bi.getRuleStatus();
   1299         td.fActualTags.insertElementAt(tag, 0, status);
   1300         if (p >= lastP) {
   1301             // If the iterator is not making progress, stop.
   1302             //  No need to raise an error here, it'll be detected in the normal check of results.
   1303             break;
   1304         }
   1305         lastP = p;
   1306     }
   1307     td.checkResults("testLastAndPrevious", this);
   1308 }
   1309 
   1310 
   1311 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
   1312 {
   1313     UErrorCode  status = U_ZERO_ERROR;
   1314     int32_t     p;
   1315     int32_t     tag;
   1316     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
   1317                                  //   cannot be -1; that is returned for DONE.
   1318     int         i;
   1319 
   1320     logln("testFollowing():");
   1321     bi.setText(td.fDataToBreak);
   1322     td.clearResults();
   1323 
   1324     // Save the starting point, since we won't get that out of following.
   1325     p = bi.first();
   1326     td.fActualBreakPositions.addElement(p, status);  // Save result.
   1327     tag = bi.getRuleStatus();
   1328     td.fActualTags.addElement(tag, status);
   1329 
   1330     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
   1331         p = bi.following(i);
   1332         if (p != lastP) {
   1333             if (p == RuleBasedBreakIterator::DONE) {
   1334                 break;
   1335             }
   1336             // We've reached a new break position.  Save it.
   1337             td.fActualBreakPositions.addElement(p, status);  // Save result.
   1338             tag = bi.getRuleStatus();
   1339             td.fActualTags.addElement(tag, status);
   1340             lastP = p;
   1341         }
   1342     }
   1343     // The loop normally exits by means of the break in the middle.
   1344     // Make sure that the index was at the correct position for the break iterator to have
   1345     //   returned DONE.
   1346     if (i != td.fDataToBreak.length()) {
   1347         errln("testFollowing():  iterator returned DONE prematurely.");
   1348     }
   1349 
   1350     // Full check of all results.
   1351     td.checkResults("testFollowing", this);
   1352 }
   1353 
   1354 
   1355 
   1356 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
   1357     UErrorCode  status = U_ZERO_ERROR;
   1358     int32_t     p;
   1359     int32_t     tag;
   1360     int32_t     lastP  = 0x7ffffffe;
   1361     int         i;
   1362 
   1363     logln("testPreceding():");
   1364     bi.setText(td.fDataToBreak);
   1365     td.clearResults();
   1366 
   1367     p = bi.last();
   1368     td.fActualBreakPositions.addElement(p, status);
   1369     tag = bi.getRuleStatus();
   1370     td.fActualTags.addElement(tag, status);
   1371 
   1372     for (i = td.fDataToBreak.length(); i>=-1; i--) {
   1373         p = bi.preceding(i);
   1374         if (p != lastP) {
   1375             if (p == RuleBasedBreakIterator::DONE) {
   1376                 break;
   1377             }
   1378             // We've reached a new break position.  Save it.
   1379             td.fActualBreakPositions.insertElementAt(p, 0, status);
   1380             lastP = p;
   1381             tag = bi.getRuleStatus();
   1382             td.fActualTags.insertElementAt(tag, 0, status);
   1383         }
   1384     }
   1385     // The loop normally exits by means of the break in the middle.
   1386     // Make sure that the index was at the correct position for the break iterator to have
   1387     //   returned DONE.
   1388     if (i != 0) {
   1389         errln("testPreceding():  iterator returned DONE prematurely.");
   1390     }
   1391 
   1392     // Full check of all results.
   1393     td.checkResults("testPreceding", this);
   1394 }
   1395 
   1396 
   1397 
   1398 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1399     UErrorCode  status = U_ZERO_ERROR;
   1400     int         i;
   1401     int32_t     tag;
   1402 
   1403     logln("testIsBoundary():");
   1404     bi.setText(td.fDataToBreak);
   1405     td.clearResults();
   1406 
   1407     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1408         if (bi.isBoundary(i)) {
   1409             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1410             tag = bi.getRuleStatus();
   1411             td.fActualTags.addElement(tag, status);
   1412         }
   1413     }
   1414     td.checkResults("testIsBoundary: ", this);
   1415 }
   1416 
   1417 
   1418 
   1419 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1420 {
   1421     iterator.setText(td.fDataToBreak);
   1422 
   1423     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1424     int32_t offset = iterator.first();
   1425     int32_t testOffset;
   1426     int32_t count = 0;
   1427 
   1428     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1429 
   1430     if (*testIterator != iterator)
   1431         errln("clone() or operator!= failed: two clones compared unequal");
   1432 
   1433     do {
   1434         testOffset = testIterator->first();
   1435         testOffset = testIterator->next(count);
   1436         if (offset != testOffset)
   1437             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1438 
   1439         if (offset != RuleBasedBreakIterator::DONE) {
   1440             count++;
   1441             offset = iterator.next();
   1442 
   1443             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1444                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1445                 if (count > 10000 || offset == -1) {
   1446                     errln("operator== failed too many times. Stopping test.");
   1447                     if (offset == -1) {
   1448                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1449                     }
   1450                     return;
   1451                 }
   1452             }
   1453         }
   1454     } while (offset != RuleBasedBreakIterator::DONE);
   1455 
   1456     // now do it backwards...
   1457     offset = iterator.last();
   1458     count = 0;
   1459 
   1460     do {
   1461         testOffset = testIterator->last();
   1462         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1463         if (offset != testOffset)
   1464             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1465 
   1466         if (offset != RuleBasedBreakIterator::DONE) {
   1467             count--;
   1468             offset = iterator.previous();
   1469         }
   1470     } while (offset != RuleBasedBreakIterator::DONE);
   1471 
   1472     delete testIterator;
   1473 }
   1474 
   1475 
   1476 //---------------------------------------------
   1477 //
   1478 //     other tests
   1479 //
   1480 //---------------------------------------------
   1481 void RBBITest::TestEmptyString()
   1482 {
   1483     UnicodeString text = "";
   1484     UErrorCode status = U_ZERO_ERROR;
   1485 
   1486     BITestData x(status);
   1487     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1488     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1489     if (U_FAILURE(status))
   1490     {
   1491         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1492         return;
   1493     }
   1494     generalIteratorTest(*bi, x);
   1495     delete bi;
   1496 }
   1497 
   1498 void RBBITest::TestGetAvailableLocales()
   1499 {
   1500     int32_t locCount = 0;
   1501     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1502 
   1503     if (locCount == 0)
   1504         dataerrln("getAvailableLocales() returned an empty list!");
   1505     // Just make sure that it's returning good memory.
   1506     int32_t i;
   1507     for (i = 0; i < locCount; ++i) {
   1508         logln(locList[i].getName());
   1509     }
   1510 }
   1511 
   1512 //Testing the BreakIterator::getDisplayName() function
   1513 void RBBITest::TestGetDisplayName()
   1514 {
   1515     UnicodeString   result;
   1516 
   1517     BreakIterator::getDisplayName(Locale::getUS(), result);
   1518     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1519         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1520                 + result);
   1521 
   1522     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1523     if (result != "French (France)")
   1524         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1525                 + result);
   1526 }
   1527 /**
   1528  * Test End Behaviour
   1529  * @bug 4068137
   1530  */
   1531 void RBBITest::TestEndBehaviour()
   1532 {
   1533     UErrorCode status = U_ZERO_ERROR;
   1534     UnicodeString testString("boo.");
   1535     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1536     if (U_FAILURE(status))
   1537     {
   1538         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1539         return;
   1540     }
   1541     wb->setText(testString);
   1542 
   1543     if (wb->first() != 0)
   1544         errln("Didn't get break at beginning of string.");
   1545     if (wb->next() != 3)
   1546         errln("Didn't get break before period in \"boo.\"");
   1547     if (wb->current() != 4 && wb->next() != 4)
   1548         errln("Didn't get break at end of string.");
   1549     delete wb;
   1550 }
   1551 /*
   1552  * @bug 4153072
   1553  */
   1554 void RBBITest::TestBug4153072() {
   1555     UErrorCode status = U_ZERO_ERROR;
   1556     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1557     if (U_FAILURE(status))
   1558     {
   1559         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1560         return;
   1561     }
   1562     UnicodeString str("...Hello, World!...");
   1563     int32_t begin = 3;
   1564     int32_t end = str.length() - 3;
   1565     UBool onBoundary;
   1566 
   1567     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1568     iter->adoptText(textIterator);
   1569     int index;
   1570     // Note: with the switch to UText, there is no way to restrict the
   1571     //       iteration range to begin at an index other than zero.
   1572     //       String character iterators created with a non-zero bound are
   1573     //         treated by RBBI as being empty.
   1574     for (index = -1; index < begin + 1; ++index) {
   1575         onBoundary = iter->isBoundary(index);
   1576         if (index == 0?  !onBoundary : onBoundary) {
   1577             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1578                             " and begin index = " + begin);
   1579         }
   1580     }
   1581     delete iter;
   1582 }
   1583 
   1584 
   1585 //
   1586 // Test for problem reported by Ashok Matoria on 9 July 2007
   1587 //    One.<kSoftHyphen><kSpace>Two.
   1588 //
   1589 //    Sentence break at start (0) and then on calling next() it breaks at
   1590 //   'T' of "Two". Now, at this point if I do next() and
   1591 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1592 //
   1593 void RBBITest::TestBug5775() {
   1594     UErrorCode status = U_ZERO_ERROR;
   1595     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1596     TEST_ASSERT_SUCCESS(status);
   1597     if (U_FAILURE(status)) {
   1598         return;
   1599     }
   1600 // Check for status first for better handling of no data errors.
   1601     TEST_ASSERT(bi != NULL);
   1602     if (bi == NULL) {
   1603         return;
   1604     }
   1605 
   1606     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1607     //               01234      56789
   1608     s = s.unescape();
   1609     bi->setText(s);
   1610     int pos = bi->next();
   1611     TEST_ASSERT(pos == 6);
   1612     pos = bi->next();
   1613     TEST_ASSERT(pos == 10);
   1614     pos = bi->previous();
   1615     TEST_ASSERT(pos == 6);
   1616     delete bi;
   1617 }
   1618 
   1619 
   1620 
   1621 /**
   1622  * Test Japanese Line Break
   1623  * @bug 4095322
   1624  */
   1625 void RBBITest::TestJapaneseLineBreak()
   1626 {
   1627 #if 0
   1628     // Test needs updating some more...   Dump it for now.
   1629 
   1630 
   1631     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
   1632     //        as opening and closing punctuation for line breaking.
   1633     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
   1634     //        from these tests.    6-13-2002
   1635     //
   1636     UErrorCode status = U_ZERO_ERROR;
   1637     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
   1638     UnicodeString precedingChars = CharsToUnicodeString(
   1639         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
   1640         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
   1641     UnicodeString followingChars = CharsToUnicodeString(
   1642         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
   1643         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
   1644         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
   1645         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
   1646         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
   1647     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
   1648 
   1649     int32_t i;
   1650     if (U_FAILURE(status))
   1651     {
   1652         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
   1653         return;
   1654     }
   1655 
   1656     for (i = 0; i < precedingChars.length(); i++) {
   1657         testString.setCharAt(1, precedingChars[i]);
   1658         iter->setText(testString);
   1659         int32_t j = iter->first();
   1660         if (j != 0)
   1661             errln("ja line break failure: failed to start at 0");
   1662         j = iter->next();
   1663         if (j != 1)
   1664             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
   1665                         + "' (" + ((int)(precedingChars[i])) + ")");
   1666         j = iter->next();
   1667         if (j != 3)
   1668             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
   1669                         + "' (" + ((int)(precedingChars[i])) + ")");
   1670     }
   1671 
   1672     for (i = 0; i < followingChars.length(); i++) {
   1673         testString.setCharAt(1, followingChars[i]);
   1674         iter->setText(testString);
   1675         int j = iter->first();
   1676         if (j != 0)
   1677             errln("ja line break failure: failed to start at 0");
   1678         j = iter->next();
   1679         if (j != 2)
   1680             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
   1681                         + "' (" + ((int)(followingChars[i])) + ")");
   1682         j = iter->next();
   1683         if (j != 3)
   1684             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
   1685                         + "' (" + ((int)(followingChars[i])) + ")");
   1686     }
   1687     delete iter;
   1688 #endif
   1689 }
   1690 
   1691 
   1692 //------------------------------------------------------------------------------
   1693 //
   1694 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1695 //
   1696 //------------------------------------------------------------------------------
   1697 
   1698 struct TestParams {
   1699     BreakIterator   *bi;
   1700     UnicodeString    dataToBreak;
   1701     UVector32       *expectedBreaks;
   1702     UVector32       *srcLine;
   1703     UVector32       *srcCol;
   1704 };
   1705 
   1706 void RBBITest::executeTest(TestParams *t) {
   1707     int32_t    bp;
   1708     int32_t    prevBP;
   1709     int32_t    i;
   1710 
   1711     if (t->bi == NULL) {
   1712         return;
   1713     }
   1714 
   1715     t->bi->setText(t->dataToBreak);
   1716     //
   1717     //  Run the iterator forward
   1718     //
   1719     prevBP = -1;
   1720     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1721         if (prevBP ==  bp) {
   1722             // Fail for lack of forward progress.
   1723             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1724                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1725             break;
   1726         }
   1727 
   1728         // Check that there were we didn't miss an expected break between the last one
   1729         //  and this one.
   1730         for (i=prevBP+1; i<bp; i++) {
   1731             if (t->expectedBreaks->elementAti(i) != 0) {
   1732                 int expected[] = {0, i};
   1733                 printStringBreaks(t->dataToBreak, expected, 2);
   1734                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1735                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1736             }
   1737         }
   1738 
   1739         // Check that the break we did find was expected
   1740         if (t->expectedBreaks->elementAti(bp) == 0) {
   1741             int expected[] = {0, bp};
   1742             printStringBreaks(t->dataToBreak, expected, 2);
   1743             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1744                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1745         } else {
   1746             // The break was expected.
   1747             //   Check that the {nnn} tag value is correct.
   1748             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1749             if (expectedTagVal == -1) {
   1750                 expectedTagVal = 0;
   1751             }
   1752             int32_t line = t->srcLine->elementAti(bp);
   1753             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1754             if (rs != expectedTagVal) {
   1755                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1756                       "          Actual, Expected status = %4d, %4d",
   1757                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1758             }
   1759         }
   1760 
   1761 
   1762         prevBP = bp;
   1763     }
   1764 
   1765     // Verify that there were no missed expected breaks after the last one found
   1766     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1767         if (t->expectedBreaks->elementAti(i) != 0) {
   1768             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1769                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1770         }
   1771     }
   1772 
   1773     //
   1774     //  Run the iterator backwards, verify that the same breaks are found.
   1775     //
   1776     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1777     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1778         if (prevBP ==  bp) {
   1779             // Fail for lack of progress.
   1780             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1781                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1782             break;
   1783         }
   1784 
   1785         // Check that there were we didn't miss an expected break between the last one
   1786         //  and this one.  (UVector returns zeros for index out of bounds.)
   1787         for (i=prevBP-1; i>bp; i--) {
   1788             if (t->expectedBreaks->elementAti(i) != 0) {
   1789                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1790                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1791             }
   1792         }
   1793 
   1794         // Check that the break we did find was expected
   1795         if (t->expectedBreaks->elementAti(bp) == 0) {
   1796             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1797                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1798         } else {
   1799             // The break was expected.
   1800             //   Check that the {nnn} tag value is correct.
   1801             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1802             if (expectedTagVal == -1) {
   1803                 expectedTagVal = 0;
   1804             }
   1805             int line = t->srcLine->elementAti(bp);
   1806             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1807             if (rs != expectedTagVal) {
   1808                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1809                       "          Actual, Expected status = %4d, %4d",
   1810                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1811             }
   1812         }
   1813 
   1814         prevBP = bp;
   1815     }
   1816 
   1817     // Verify that there were no missed breaks prior to the last one found
   1818     for (i=prevBP-1; i>=0; i--) {
   1819         if (t->expectedBreaks->elementAti(i) != 0) {
   1820             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1821                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1822         }
   1823     }
   1824 }
   1825 
   1826 
   1827 void RBBITest::TestExtended() {
   1828 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1829     UErrorCode      status  = U_ZERO_ERROR;
   1830     Locale          locale("");
   1831 
   1832     UnicodeString       rules;
   1833     TestParams          tp;
   1834     tp.bi             = NULL;
   1835     tp.expectedBreaks = new UVector32(status);
   1836     tp.srcLine        = new UVector32(status);
   1837     tp.srcCol         = new UVector32(status);
   1838 
   1839     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1840     if (U_FAILURE(status)) {
   1841         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1842     }
   1843 
   1844 
   1845     //
   1846     //  Open and read the test data file.
   1847     //
   1848     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1849     char testFileName[1000];
   1850     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1851         errln("Can't open test data.  Path too long.");
   1852         return;
   1853     }
   1854     strcpy(testFileName, testDataDirectory);
   1855     strcat(testFileName, "rbbitst.txt");
   1856 
   1857     int    len;
   1858     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1859     if (U_FAILURE(status)) {
   1860         return; /* something went wrong, error already output */
   1861     }
   1862 
   1863 
   1864 
   1865 
   1866     //
   1867     //  Put the test data into a UnicodeString
   1868     //
   1869     UnicodeString testString(FALSE, testFile, len);
   1870 
   1871     enum EParseState{
   1872         PARSE_COMMENT,
   1873         PARSE_TAG,
   1874         PARSE_DATA,
   1875         PARSE_NUM
   1876     }
   1877     parseState = PARSE_TAG;
   1878 
   1879     EParseState savedState = PARSE_TAG;
   1880 
   1881     static const UChar CH_LF        = 0x0a;
   1882     static const UChar CH_CR        = 0x0d;
   1883     static const UChar CH_HASH      = 0x23;
   1884     /*static const UChar CH_PERIOD    = 0x2e;*/
   1885     static const UChar CH_LT        = 0x3c;
   1886     static const UChar CH_GT        = 0x3e;
   1887     static const UChar CH_BACKSLASH = 0x5c;
   1888     static const UChar CH_BULLET    = 0x2022;
   1889 
   1890     int32_t    lineNum  = 1;
   1891     int32_t    colStart = 0;
   1892     int32_t    column   = 0;
   1893     int32_t    charIdx  = 0;
   1894 
   1895     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1896 
   1897     for (charIdx = 0; charIdx < len; ) {
   1898         status = U_ZERO_ERROR;
   1899         UChar  c = testString.charAt(charIdx);
   1900         charIdx++;
   1901         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1902             // treat CRLF as a unit
   1903             c = CH_LF;
   1904             charIdx++;
   1905         }
   1906         if (c == CH_LF || c == CH_CR) {
   1907             lineNum++;
   1908             colStart = charIdx;
   1909         }
   1910         column = charIdx - colStart + 1;
   1911 
   1912         switch (parseState) {
   1913         case PARSE_COMMENT:
   1914             if (c == 0x0a || c == 0x0d) {
   1915                 parseState = savedState;
   1916             }
   1917             break;
   1918 
   1919         case PARSE_TAG:
   1920             {
   1921             if (c == CH_HASH) {
   1922                 parseState = PARSE_COMMENT;
   1923                 savedState = PARSE_TAG;
   1924                 break;
   1925             }
   1926             if (u_isUWhiteSpace(c)) {
   1927                 break;
   1928             }
   1929             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1930                 delete tp.bi;
   1931                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1932                 charIdx += 5;
   1933                 break;
   1934             }
   1935             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1936                 delete tp.bi;
   1937                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1938                 charIdx += 5;
   1939                 break;
   1940             }
   1941             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1942                 delete tp.bi;
   1943                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1944                 charIdx += 5;
   1945                 break;
   1946             }
   1947             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1948                 delete tp.bi;
   1949                 tp.bi = NULL;
   1950                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1951                 charIdx += 5;
   1952                 break;
   1953             }
   1954             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1955                 delete tp.bi;
   1956                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1957                 charIdx += 6;
   1958                 break;
   1959             }
   1960 
   1961             // <locale  loc_name>
   1962             localeMatcher.reset(testString);
   1963             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1964                 UnicodeString localeName = localeMatcher.group(1, status);
   1965                 char localeName8[100];
   1966                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1967                 locale = Locale::createFromName(localeName8);
   1968                 charIdx += localeMatcher.group(0, status).length();
   1969                 TEST_ASSERT_SUCCESS(status);
   1970                 break;
   1971             }
   1972             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1973                 parseState = PARSE_DATA;
   1974                 charIdx += 5;
   1975                 tp.dataToBreak = "";
   1976                 tp.expectedBreaks->removeAllElements();
   1977                 tp.srcCol ->removeAllElements();
   1978                 tp.srcLine->removeAllElements();
   1979                 break;
   1980             }
   1981 
   1982             errln("line %d: Tag expected in test file.", lineNum);
   1983             parseState = PARSE_COMMENT;
   1984             savedState = PARSE_DATA;
   1985             goto end_test; // Stop the test.
   1986             }
   1987             break;
   1988 
   1989         case PARSE_DATA:
   1990             if (c == CH_BULLET) {
   1991                 int32_t  breakIdx = tp.dataToBreak.length();
   1992                 tp.expectedBreaks->setSize(breakIdx+1);
   1993                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1994                 tp.srcLine->setSize(breakIdx+1);
   1995                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1996                 tp.srcCol ->setSize(breakIdx+1);
   1997                 tp.srcCol ->setElementAt(column, breakIdx);
   1998                 break;
   1999             }
   2000 
   2001             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   2002                 // Add final entry to mappings from break location to source file position.
   2003                 //  Need one extra because last break position returned is after the
   2004                 //    last char in the data, not at the last char.
   2005                 tp.srcLine->addElement(lineNum, status);
   2006                 tp.srcCol ->addElement(column, status);
   2007 
   2008                 parseState = PARSE_TAG;
   2009                 charIdx += 6;
   2010 
   2011                 // RUN THE TEST!
   2012                 executeTest(&tp);
   2013                 break;
   2014             }
   2015 
   2016             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   2017                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   2018                 // Get the code point from the name and insert it into the test data.
   2019                 //   (Damn, no API takes names in Unicode  !!!
   2020                 //    we've got to take it back to char *)
   2021                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   2022                 int32_t nameLength = nameEndIdx - (charIdx+2);
   2023                 char charNameBuf[200];
   2024                 UChar32 theChar = -1;
   2025                 if (nameEndIdx != -1) {
   2026                     UErrorCode status = U_ZERO_ERROR;
   2027                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   2028                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   2029                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   2030                     if (U_FAILURE(status)) {
   2031                         theChar = -1;
   2032                     }
   2033                 }
   2034                 if (theChar == -1) {
   2035                     errln("Error in named character in test file at line %d, col %d",
   2036                         lineNum, column);
   2037                 } else {
   2038                     // Named code point was recognized.  Insert it
   2039                     //   into the test data.
   2040                     tp.dataToBreak.append(theChar);
   2041                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   2042                         tp.srcLine->addElement(lineNum, status);
   2043                         tp.srcCol ->addElement(column, status);
   2044                     }
   2045                 }
   2046                 if (nameEndIdx > charIdx) {
   2047                     charIdx = nameEndIdx+1;
   2048 
   2049                 }
   2050                 break;
   2051             }
   2052 
   2053 
   2054 
   2055 
   2056             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   2057                 charIdx++;
   2058                 int32_t  breakIdx = tp.dataToBreak.length();
   2059                 tp.expectedBreaks->setSize(breakIdx+1);
   2060                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   2061                 tp.srcLine->setSize(breakIdx+1);
   2062                 tp.srcLine->setElementAt(lineNum, breakIdx);
   2063                 tp.srcCol ->setSize(breakIdx+1);
   2064                 tp.srcCol ->setElementAt(column, breakIdx);
   2065                 break;
   2066             }
   2067 
   2068             if (c == CH_LT) {
   2069                 tagValue   = 0;
   2070                 parseState = PARSE_NUM;
   2071                 break;
   2072             }
   2073 
   2074             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   2075                 parseState = PARSE_COMMENT;
   2076                 savedState = PARSE_DATA;
   2077                 break;
   2078             }
   2079 
   2080             if (c == CH_BACKSLASH) {
   2081                 // Check for \ at end of line, a line continuation.
   2082                 //     Advance over (discard) the newline
   2083                 UChar32 cp = testString.char32At(charIdx);
   2084                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   2085                     // We have a CR LF
   2086                     //  Need an extra increment of the input ptr to move over both of them
   2087                     charIdx++;
   2088                 }
   2089                 if (cp == CH_LF || cp == CH_CR) {
   2090                     lineNum++;
   2091                     colStart = charIdx;
   2092                     charIdx++;
   2093                     break;
   2094                 }
   2095 
   2096                 // Let unescape handle the back slash.
   2097                 cp = testString.unescapeAt(charIdx);
   2098                 if (cp != -1) {
   2099                     // Escape sequence was recognized.  Insert the char
   2100                     //   into the test data.
   2101                     tp.dataToBreak.append(cp);
   2102                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   2103                         tp.srcLine->addElement(lineNum, status);
   2104                         tp.srcCol ->addElement(column, status);
   2105                     }
   2106                     break;
   2107                 }
   2108 
   2109 
   2110                 // Not a recognized backslash escape sequence.
   2111                 // Take the next char as a literal.
   2112                 //  TODO:  Should this be an error?
   2113                 c = testString.charAt(charIdx);
   2114                 charIdx = testString.moveIndex32(charIdx, 1);
   2115             }
   2116 
   2117             // Normal, non-escaped data char.
   2118             tp.dataToBreak.append(c);
   2119 
   2120             // Save the mapping from offset in the data to line/column numbers in
   2121             //   the original input file.  Will be used for better error messages only.
   2122             //   If there's an expected break before this char, the slot in the mapping
   2123             //     vector will already be set for this char; don't overwrite it.
   2124             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   2125                 tp.srcLine->addElement(lineNum, status);
   2126                 tp.srcCol ->addElement(column, status);
   2127             }
   2128             break;
   2129 
   2130 
   2131         case PARSE_NUM:
   2132             // We are parsing an expected numeric tag value, like <1234>,
   2133             //   within a chunk of data.
   2134             if (u_isUWhiteSpace(c)) {
   2135                 break;
   2136             }
   2137 
   2138             if (c == CH_GT) {
   2139                 // Finished the number.  Add the info to the expected break data,
   2140                 //   and switch parse state back to doing plain data.
   2141                 parseState = PARSE_DATA;
   2142                 if (tagValue == 0) {
   2143                     tagValue = -1;
   2144                 }
   2145                 int32_t  breakIdx = tp.dataToBreak.length();
   2146                 tp.expectedBreaks->setSize(breakIdx+1);
   2147                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   2148                 tp.srcLine->setSize(breakIdx+1);
   2149                 tp.srcLine->setElementAt(lineNum, breakIdx);
   2150                 tp.srcCol ->setSize(breakIdx+1);
   2151                 tp.srcCol ->setElementAt(column, breakIdx);
   2152                 break;
   2153             }
   2154 
   2155             if (u_isdigit(c)) {
   2156                 tagValue = tagValue*10 + u_charDigitValue(c);
   2157                 break;
   2158             }
   2159 
   2160             errln("Syntax Error in test file at line %d, col %d",
   2161                 lineNum, column);
   2162             parseState = PARSE_COMMENT;
   2163             goto end_test; // Stop the test
   2164             break;
   2165         }
   2166 
   2167 
   2168         if (U_FAILURE(status)) {
   2169             errln("ICU Error %s while parsing test file at line %d.",
   2170                 u_errorName(status), lineNum);
   2171             status = U_ZERO_ERROR;
   2172             goto end_test; // Stop the test
   2173         }
   2174 
   2175     }
   2176 
   2177 end_test:
   2178     delete tp.bi;
   2179     delete tp.expectedBreaks;
   2180     delete tp.srcLine;
   2181     delete tp.srcCol;
   2182     delete [] testFile;
   2183 #endif
   2184 }
   2185 
   2186 void RBBITest::TestThaiBreaks() {
   2187     UErrorCode status=U_ZERO_ERROR;
   2188     BreakIterator* b;
   2189     Locale locale = Locale("th");
   2190     int32_t p, index;
   2191     UChar c[]= {
   2192             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
   2193             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
   2194             0x0E16, 0x0E49, 0x0E33, 0x0000
   2195     };
   2196     int32_t expectedWordResult[] = {
   2197             2, 3, 6, 10, 11, 15, 17, 20, 22
   2198     };
   2199     int32_t expectedLineResult[] = {
   2200             3, 6, 11, 15, 17, 20, 22
   2201     };
   2202 
   2203     int32_t size = u_strlen(c);
   2204     UnicodeString text=UnicodeString(c);
   2205 
   2206     b = BreakIterator::createWordInstance(locale, status);
   2207     if (U_FAILURE(status)) {
   2208         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
   2209         return;
   2210     }
   2211     b->setText(text);
   2212     p = index = 0;
   2213     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   2214         if (p != expectedWordResult[index++]) {
   2215             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
   2216         }
   2217     }
   2218     delete b;
   2219 
   2220     b = BreakIterator::createLineInstance(locale, status);
   2221     if (U_FAILURE(status)) {
   2222         printf("Unable to create thai line break iterator.\n");
   2223         return;
   2224     }
   2225     b->setText(text);
   2226     p = index = 0;
   2227     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   2228         if (p != expectedLineResult[index++]) {
   2229             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
   2230         }
   2231     }
   2232 
   2233     delete b;
   2234 }
   2235 
// Test data for TestTailoredBreaks().  For each case:
//    xxxText      the text to break; \uXXXX escapes are expanded with
//                 CharsToUnicodeString() in TBTest().
//    xxxTOffsets  break offsets expected from the locale-tailored iterator.
//    xxxROffsets  break offsets expected from the root locale iterator.
// The arrays are tied together in the tbItems table.

   2236 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
   2237 // Words don't include colon or period (cldrbug #1969).
   2238 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
   2239 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
   2240 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
   2241
   2242 // UBreakIteratorType UBRK_WORD, Locale "ja"
   2243 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   2244 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   2245                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
// Disabled expectations in which the tailored and root results differ; the
// active definitions follow the #endif.
   2246 #if 0
   2247 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   2248 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   2249 #endif
   2250 // There's no separate Japanese word break iterator. Root is the same as Japanese.
   2251 // Our dictionary-based iterator has to be tweaked to better handle U+3005,
   2252 // U+3007, U+303B and some other cases.
   2253 static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
   2254 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
   2255
   2256 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   2257 // Add break after Greek question mark (cldrbug #2069).
   2258 static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
   2259                                         "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
   2260 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
   2261 static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
   2262
   2263 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
   2264 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
   2265 static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
   2266                                         "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
   2267                                         "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
   2268 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
   2269                                           12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
   2270                                           29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
   2271 static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
   2272                                           12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
   2273                                           29,     32, 33, 35, 37, 38,     40, 41 };
   2274 
   2275 typedef struct {
   2276     UBreakIteratorType  type;
   2277     const char *        locale;
   2278     const char *        escapedText;
   2279     const int32_t *     tailoredOffsets;
   2280     int32_t             tailoredOffsetsCount;
   2281     const int32_t *     rootOffsets;
   2282     int32_t             rootOffsetsCount;
   2283 } TailoredBreakItem;
   2284 
   2285 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
   2286 
// Table of test cases walked by TestTailoredBreaks().  Each row pairs one text
// with the offsets expected from the tailored iterator and from the root
// iterator.  The walk stops at the first row whose escapedText is NULL, so the
// terminator row must stay last.
   2287 static const TailoredBreakItem tbItems[] = {
   2288     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
   2289     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
   2290     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
   2291     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
   2292     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
   2293 };
   2294 
   2295 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   2296     while (count-- > 0) {
   2297         int writeCount;
   2298         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   2299         buffer += writeCount;
   2300         buflen -= writeCount;
   2301     }
   2302 }
   2303 
   2304 enum { kMaxOffsetCount = 128 };
   2305 
   2306 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   2307     brkitr->setText( CharsToUnicodeString(escapedText) );
   2308     int32_t foundOffsets[kMaxOffsetCount];
   2309     int32_t offset, foundOffsetsCount = 0;
   2310     // do forwards iteration test
   2311     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   2312         foundOffsets[foundOffsetsCount++] = offset;
   2313     }
   2314     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   2315         // log error for forwards test
   2316         char formatExpect[512], formatFound[512];
   2317         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   2318         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   2319         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   2320                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   2321     } else {
   2322         // do backwards iteration test
   2323         --foundOffsetsCount; // back off one from the end offset
   2324         while ( foundOffsetsCount > 0 ) {
   2325             offset = brkitr->previous();
   2326             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   2327                 // log error for backwards test
   2328                 char formatExpect[512];
   2329                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   2330                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   2331                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   2332                 break;
   2333             }
   2334         }
   2335     }
   2336 }
   2337 
   2338 void RBBITest::TestTailoredBreaks() {
   2339     const TailoredBreakItem * tbItemPtr;
   2340     Locale rootLocale = Locale("root");
   2341     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   2342         Locale testLocale = Locale(tbItemPtr->locale);
   2343         BreakIterator * tailoredBrkiter = NULL;
   2344         BreakIterator * rootBrkiter = NULL;
   2345         UErrorCode status = U_ZERO_ERROR;
   2346         switch (tbItemPtr->type) {
   2347             case UBRK_CHARACTER:
   2348                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   2349                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   2350                 break;
   2351             case UBRK_WORD:
   2352                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   2353                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   2354                 break;
   2355             case UBRK_LINE:
   2356                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   2357                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   2358                 break;
   2359             case UBRK_SENTENCE:
   2360                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   2361                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   2362                 break;
   2363             default:
   2364                 status = U_UNSUPPORTED_ERROR;
   2365                 break;
   2366         }
   2367         if (U_FAILURE(status)) {
   2368             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   2369             continue;
   2370         }
   2371         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   2372         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   2373 
   2374         delete rootBrkiter;
   2375         delete tailoredBrkiter;
   2376     }
   2377 }
   2378 
   2379 
   2380 //-------------------------------------------------------------------------------
   2381 //
   2382 //  TestDictRules   create a break iterator from source rules that includes a
   2383 //                  dictionary range.   Regression for bug #7130.  Source rules
   2384 //                  do not declare a break iterator type (word, line, sentence, etc.
   2385 //                  but the dictionary code, without a type, would loop.
   2386 //
   2387 //-------------------------------------------------------------------------------
   2388 void RBBITest::TestDictRules() {
   2389     const char *rules =  "$dictionary = [a-z]; \n"
   2390                          "!!forward; \n"
   2391                          "$dictionary $dictionary; \n"
   2392                          "!!reverse; \n"
   2393                          "$dictionary $dictionary; \n";
   2394     const char *text = "aa";
   2395     UErrorCode status = U_ZERO_ERROR;
   2396     UParseError parseError;
   2397 
   2398     RuleBasedBreakIterator bi(rules, parseError, status);
   2399     if (U_SUCCESS(status)) {
   2400         UnicodeString utext = text;
   2401         bi.setText(utext);
   2402         int32_t position;
   2403         int32_t loops;
   2404         for (loops = 0; loops<10; loops++) {
   2405             position = bi.next();
   2406             if (position == RuleBasedBreakIterator::DONE) {
   2407                 break;
   2408             }
   2409         }
   2410         TEST_ASSERT(loops == 1);
   2411     } else {
   2412         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   2413     }
   2414 }
   2415 
   2416 
   2417 
   2418 //-------------------------------------------------------------------------------
   2419 //
   2420 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   2421 //    return the datain one big UChar * buffer, which the caller must delete.
   2422 //
   2423 //    parameters:
   2424 //          fileName:   the name of the file, with no directory part.  The test data directory
   2425 //                      is assumed.
   2426 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   2427 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   2428 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   2429 //                      Pass NULL for the system default encoding.
   2430 //          status
   2431 //    returns:
   2432 //                      The file data, converted to UChar.
   2433 //                      The caller must delete this when done with
   2434 //                           delete [] theBuffer;
   2435 //
   2436 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   2437 //           Move this function to some common place.
   2438 //
   2439 //--------------------------------------------------------------------------------
   2440 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2441     UChar       *retPtr  = NULL;
   2442     char        *fileBuf = NULL;
   2443     UConverter* conv     = NULL;
   2444     FILE        *f       = NULL;
   2445 
   2446     ulen = 0;
   2447     if (U_FAILURE(status)) {
   2448         return retPtr;
   2449     }
   2450 
   2451     //
   2452     //  Open the file.
   2453     //
   2454     f = fopen(fileName, "rb");
   2455     if (f == 0) {
   2456         dataerrln("Error opening test data file %s\n", fileName);
   2457         status = U_FILE_ACCESS_ERROR;
   2458         return NULL;
   2459     }
   2460     //
   2461     //  Read it in
   2462     //
   2463     int   fileSize;
   2464     int   amt_read;
   2465 
   2466     fseek( f, 0, SEEK_END);
   2467     fileSize = ftell(f);
   2468     fileBuf = new char[fileSize];
   2469     fseek(f, 0, SEEK_SET);
   2470     amt_read = fread(fileBuf, 1, fileSize, f);
   2471     if (amt_read != fileSize || fileSize <= 0) {
   2472         errln("Error reading test data file.");
   2473         goto cleanUpAndReturn;
   2474     }
   2475 
   2476     //
   2477     // Look for a Unicode Signature (BOM) on the data just read
   2478     //
   2479     int32_t        signatureLength;
   2480     const char *   fileBufC;
   2481     const char*    bomEncoding;
   2482 
   2483     fileBufC = fileBuf;
   2484     bomEncoding = ucnv_detectUnicodeSignature(
   2485         fileBuf, fileSize, &signatureLength, &status);
   2486     if(bomEncoding!=NULL ){
   2487         fileBufC  += signatureLength;
   2488         fileSize  -= signatureLength;
   2489         encoding = bomEncoding;
   2490     }
   2491 
   2492     //
   2493     // Open a converter to take the rule file to UTF-16
   2494     //
   2495     conv = ucnv_open(encoding, &status);
   2496     if (U_FAILURE(status)) {
   2497         goto cleanUpAndReturn;
   2498     }
   2499 
   2500     //
   2501     // Convert the rules to UChar.
   2502     //  Preflight first to determine required buffer size.
   2503     //
   2504     ulen = ucnv_toUChars(conv,
   2505         NULL,           //  dest,
   2506         0,              //  destCapacity,
   2507         fileBufC,
   2508         fileSize,
   2509         &status);
   2510     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2511         // Buffer Overflow is expected from the preflight operation.
   2512         status = U_ZERO_ERROR;
   2513 
   2514         retPtr = new UChar[ulen+1];
   2515         ucnv_toUChars(conv,
   2516             retPtr,       //  dest,
   2517             ulen+1,
   2518             fileBufC,
   2519             fileSize,
   2520             &status);
   2521     }
   2522 
   2523 cleanUpAndReturn:
   2524     fclose(f);
   2525     delete []fileBuf;
   2526     ucnv_close(conv);
   2527     if (U_FAILURE(status)) {
   2528         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2529         delete retPtr;
   2530         retPtr = 0;
   2531         ulen   = 0;
   2532     };
   2533     return retPtr;
   2534 }
   2535 
   2536 
   2537 
   2538 //--------------------------------------------------------------------------------------------
   2539 //
   2540 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2541 //
   2542 //-------------------------------------------------------------------------------------------
   2543 void RBBITest::TestUnicodeFiles() {
   2544     RuleBasedBreakIterator  *bi;
   2545     UErrorCode               status = U_ZERO_ERROR;
   2546 
   2547     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2548     TEST_ASSERT_SUCCESS(status);
   2549     if (U_SUCCESS(status)) {
   2550         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2551     }
   2552     delete bi;
   2553 
   2554     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   2555     TEST_ASSERT_SUCCESS(status);
   2556     if (U_SUCCESS(status)) {
   2557         runUnicodeTestData("WordBreakTest.txt", bi);
   2558     }
   2559     delete bi;
   2560 
   2561     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   2562     TEST_ASSERT_SUCCESS(status);
   2563     if (U_SUCCESS(status)) {
   2564         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2565     }
   2566     delete bi;
   2567 
   2568     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   2569     TEST_ASSERT_SUCCESS(status);
   2570     if (U_SUCCESS(status)) {
   2571         runUnicodeTestData("LineBreakTest.txt", bi);
   2572     }
   2573     delete bi;
   2574 }
   2575 
   2576 
   2577 //--------------------------------------------------------------------------------------------
   2578 //
   2579 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2580 //
   2581 //-------------------------------------------------------------------------------------------
   2582 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2583 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2584 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
   2585   UVersionInfo icu4601 = { 4, 6, 0, 1 };
   2586 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601);
   2587 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   2588     UErrorCode  status = U_ZERO_ERROR;
   2589 
   2590     //
   2591     //  Open and read the test data file, put it into a UnicodeString.
   2592     //
   2593     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2594     char testFileName[1000];
   2595     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2596         dataerrln("Can't open test data.  Path too long.");
   2597         return;
   2598     }
   2599     strcpy(testFileName, testDataDirectory);
   2600     strcat(testFileName, fileName);
   2601 
   2602     logln("Opening data file %s\n", fileName);
   2603 
   2604     int    len;
   2605     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2606     if (status != U_FILE_ACCESS_ERROR) {
   2607         TEST_ASSERT_SUCCESS(status);
   2608         TEST_ASSERT(testFile != NULL);
   2609     }
   2610     if (U_FAILURE(status) || testFile == NULL) {
   2611         return; /* something went wrong, error already output */
   2612     }
   2613     UnicodeString testFileAsString(TRUE, testFile, len);
   2614 
   2615     //
   2616     //  Parse the test data file using a regular expression.
   2617     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2618     //     is identified by which group had a match.
   2619     //
   2620     //    Caputure Group #                  1          2            3            4           5
   2621     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2622     //
   2623     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2624     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2625     UnicodeString   testString;
   2626     UVector32       breakPositions(status);
   2627     int             lineNumber = 1;
   2628     TEST_ASSERT_SUCCESS(status);
   2629     if (U_FAILURE(status)) {
   2630         return;
   2631     }
   2632 
   2633     //
   2634     //  Scan through each test case, building up the string to be broken in testString,
   2635     //   and the positions that should be boundaries in the breakPositions vector.
   2636     //
   2637     int spin = 0;
   2638     while (tokenMatcher.find()) {
   2639       	if(tokenMatcher.hitEnd()) {
   2640           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   2641              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   2642              and caused an infinite loop here on EBCDIC systems!
   2643           */
   2644           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   2645           //	   return;
   2646       	}
   2647         if (tokenMatcher.start(1, status) >= 0) {
   2648             // Scanned a divide sign, indicating a break position in the test data.
   2649             if (testString.length()>0) {
   2650                 breakPositions.addElement(testString.length(), status);
   2651             }
   2652         }
   2653         else if (tokenMatcher.start(2, status) >= 0) {
   2654             // Scanned an 'x', meaning no break at this position in the test data
   2655             //   Nothing to be done here.
   2656             }
   2657         else if (tokenMatcher.start(3, status) >= 0) {
   2658             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2659             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2660             int length = hexNumber.length();
   2661             if (length<=8) {
   2662                 char buf[10];
   2663                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2664                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2665                 if (c<=0x10ffff) {
   2666                     testString.append(c);
   2667                 } else {
   2668                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2669                        fileName, lineNumber);
   2670                 }
   2671             } else {
   2672                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2673                        fileName, lineNumber);
   2674              }
   2675         }
   2676         else if (tokenMatcher.start(4, status) >= 0) {
   2677             // Scanned to end of a line, possibly skipping over a comment in the process.
   2678             //   If the line from the file contained test data, run the test now.
   2679             //
   2680             if (testString.length() > 0) {
   2681 // TODO(andy): Remove this time bomb code.
   2682 if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) {
   2683                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2684 }
   2685             }
   2686 
   2687             // Clear out this test case.
   2688             //    The string and breakPositions vector will be refilled as the next
   2689             //       test case is parsed.
   2690             testString.remove();
   2691             breakPositions.removeAllElements();
   2692             lineNumber++;
   2693         } else {
   2694             // Scanner catchall.  Something unrecognized appeared on the line.
   2695             char token[16];
   2696             UnicodeString uToken = tokenMatcher.group(0, status);
   2697             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2698             token[sizeof(token)-1] = 0;
   2699             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2700 
   2701             // Clean up, in preparation for continuing with the next line.
   2702             testString.remove();
   2703             breakPositions.removeAllElements();
   2704             lineNumber++;
   2705         }
   2706         TEST_ASSERT_SUCCESS(status);
   2707         if (U_FAILURE(status)) {
   2708             break;
   2709         }
   2710     }
   2711 
   2712     delete [] testFile;
   2713  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2714 }
   2715 
   2716 //--------------------------------------------------------------------------------------------
   2717 //
   2718 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2719 //                            test data files.  Do only a simple, forward-only check -
   2720 //                            this test is mostly to check that ICU and the Unicode
   2721 //                            data agree with each other.
   2722 //
   2723 //--------------------------------------------------------------------------------------------
   2724 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2725                          const UnicodeString &testString,   // Text data to be broken
   2726                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2727                          RuleBasedBreakIterator *bi) {
   2728     int32_t pos;                 // Break Position in the test string
   2729     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2730     int32_t expectedPos;         // Expected break position (index into test string)
   2731 
   2732     bi->setText(testString);
   2733     pos = bi->first();
   2734     pos = bi->next();
   2735 
   2736     while (pos != BreakIterator::DONE) {
   2737         if (expectedI >= breakPositions->size()) {
   2738             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2739                 testFileName, lineNumber, pos);
   2740             break;
   2741         }
   2742         expectedPos = breakPositions->elementAti(expectedI);
   2743         if (pos < expectedPos) {
   2744             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2745                 testFileName, lineNumber, pos);
   2746             break;
   2747         }
   2748         if (pos > expectedPos) {
   2749             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2750                 testFileName, lineNumber, expectedPos);
   2751             break;
   2752         }
   2753         pos = bi->next();
   2754         expectedI++;
   2755     }
   2756 
   2757     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2758         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2759             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2760     }
   2761 }
   2762 
   2763 
   2764 
   2765 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2766 //---------------------------------------------------------------------------------------
   2767 //
   2768 //   classs RBBIMonkeyKind
   2769 //
   2770 //      Monkey Test for Break Iteration
   2771 //      Abstract interface class.   Concrete derived classes independently
   2772 //      implement the break rules for different iterator types.
   2773 //
   2774 //      The Monkey Test itself uses doesn't know which type of break iterator it is
   2775 //      testing, but works purely in terms of the interface defined here.
   2776 //
   2777 //---------------------------------------------------------------------------------------
   2778 class RBBIMonkeyKind {
   2779 public:
   2780     // Return a UVector of UnicodeSets, representing the character classes used
   2781     //   for this type of iterator.
   2782     virtual  UVector  *charClasses() = 0;
   2783 
   2784     // Set the test text on which subsequent calls to next() will operate
   2785     virtual  void      setText(const UnicodeString &s) = 0;
   2786 
   2787     // Find the next break postion, starting from the prev break position, or from zero.
   2788     // Return -1 after reaching end of string.
   2789     virtual  int32_t   next(int32_t i) = 0;
   2790 
   2791     virtual ~RBBIMonkeyKind();
   2792     UErrorCode       deferredStatus;
   2793 
   2794 
   2795 protected:
   2796     RBBIMonkeyKind();
   2797 
   2798 private:
   2799 };
   2800 
   2801 RBBIMonkeyKind::RBBIMonkeyKind() {
   2802     deferredStatus = U_ZERO_ERROR;
   2803 }
   2804 
   2805 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2806 }
   2807 
   2808 
   2809 //----------------------------------------------------------------------------------------
   2810 //
   2811 //   Random Numbers.  Similar to standard lib rand() and srand()
   2812 //                    Not using library to
   2813 //                      1.  Get same results on all platforms.
   2814 //                      2.  Get access to current seed, to more easily reproduce failures.
   2815 //
   2816 //---------------------------------------------------------------------------------------
   2817 static uint32_t m_seed = 1;
   2818 
   2819 static uint32_t m_rand()
   2820 {
   2821     m_seed = m_seed * 1103515245 + 12345;
   2822     return (uint32_t)(m_seed/65536) % 32768;
   2823 }
   2824 
   2825 
   2826 //------------------------------------------------------------------------------------------
   2827 //
   2828 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2829 //                             of RBBIMonkeyKind.
   2830 //
   2831 //------------------------------------------------------------------------------------------
   2832 class RBBICharMonkey: public RBBIMonkeyKind {
   2833 public:
   2834     RBBICharMonkey();
   2835     virtual          ~RBBICharMonkey();
   2836     virtual  UVector *charClasses();
   2837     virtual  void     setText(const UnicodeString &s);
   2838     virtual  int32_t  next(int32_t i);
   2839 private:
   2840     UVector   *fSets;
   2841 
   2842     UnicodeSet  *fCRLFSet;
   2843     UnicodeSet  *fControlSet;
   2844     UnicodeSet  *fExtendSet;
   2845     UnicodeSet  *fPrependSet;
   2846     UnicodeSet  *fSpacingSet;
   2847     UnicodeSet  *fLSet;
   2848     UnicodeSet  *fVSet;
   2849     UnicodeSet  *fTSet;
   2850     UnicodeSet  *fLVSet;
   2851     UnicodeSet  *fLVTSet;
   2852     UnicodeSet  *fHangulSet;
   2853     UnicodeSet  *fAnySet;
   2854 
   2855     const UnicodeString *fText;
   2856 };
   2857 
   2858 
   2859 RBBICharMonkey::RBBICharMonkey() {
   2860     UErrorCode  status = U_ZERO_ERROR;
   2861 
   2862     fText = NULL;
   2863 
   2864     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2865     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2866     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2867     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2868     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2869     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2870     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2871     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2872     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2873     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2874     fHangulSet  = new UnicodeSet();
   2875     fHangulSet->addAll(*fLSet);
   2876     fHangulSet->addAll(*fVSet);
   2877     fHangulSet->addAll(*fTSet);
   2878     fHangulSet->addAll(*fLVSet);
   2879     fHangulSet->addAll(*fLVTSet);
   2880     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2881 
   2882     fSets       = new UVector(status);
   2883     fSets->addElement(fCRLFSet,    status);
   2884     fSets->addElement(fControlSet, status);
   2885     fSets->addElement(fExtendSet,  status);
   2886     fSets->addElement(fPrependSet, status);
   2887     fSets->addElement(fSpacingSet, status);
   2888     fSets->addElement(fHangulSet,  status);
   2889     fSets->addElement(fAnySet,     status);
   2890     if (U_FAILURE(status)) {
   2891         deferredStatus = status;
   2892     }
   2893 }
   2894 
   2895 
   2896 void RBBICharMonkey::setText(const UnicodeString &s) {
   2897     fText = &s;
   2898 }
   2899 
   2900 
   2901 
   2902 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2903     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2904                               //   break position being tested.  The candidate break
   2905                               //   location is before p2.
   2906 
   2907     int     breakPos = -1;
   2908 
   2909     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2910 
   2911     if (U_FAILURE(deferredStatus)) {
   2912         return -1;
   2913     }
   2914 
   2915     // Previous break at end of string.  return DONE.
   2916     if (prevPos >= fText->length()) {
   2917         return -1;
   2918     }
   2919     p0 = p1 = p2 = p3 = prevPos;
   2920     c3 =  fText->char32At(prevPos);
   2921     c0 = c1 = c2 = 0;
   2922 
   2923     // Loop runs once per "significant" character position in the input text.
   2924     for (;;) {
   2925         // Move all of the positions forward in the input string.
   2926         p0 = p1;  c0 = c1;
   2927         p1 = p2;  c1 = c2;
   2928         p2 = p3;  c2 = c3;
   2929 
   2930         // Advancd p3 by one codepoint
   2931         p3 = fText->moveIndex32(p3, 1);
   2932         c3 = fText->char32At(p3);
   2933 
   2934         if (p1 == p2) {
   2935             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2936             continue;
   2937         }
   2938         if (p2 == fText->length()) {
   2939             // Reached end of string.  Always a break position.
   2940             break;
   2941         }
   2942 
   2943         // Rule  GB3   CR x LF
   2944         //     No Extend or Format characters may appear between the CR and LF,
   2945         //     which requires the additional check for p2 immediately following p1.
   2946         //
   2947         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2948             continue;
   2949         }
   2950 
   2951         // Rule (GB4).   ( Control | CR | LF ) <break>
   2952         if (fControlSet->contains(c1) ||
   2953             c1 == 0x0D ||
   2954             c1 == 0x0A)  {
   2955             break;
   2956         }
   2957 
   2958         // Rule (GB5)    <break>  ( Control | CR | LF )
   2959         //
   2960         if (fControlSet->contains(c2) ||
   2961             c2 == 0x0D ||
   2962             c2 == 0x0A)  {
   2963             break;
   2964         }
   2965 
   2966 
   2967         // Rule (GB6)  L x ( L | V | LV | LVT )
   2968         if (fLSet->contains(c1) &&
   2969                (fLSet->contains(c2)  ||
   2970                 fVSet->contains(c2)  ||
   2971                 fLVSet->contains(c2) ||
   2972                 fLVTSet->contains(c2))) {
   2973             continue;
   2974         }
   2975 
   2976         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2977         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2978             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2979             continue;
   2980         }
   2981 
   2982         // Rule (GB8)    ( LVT | T)  x T
   2983         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2984             fTSet->contains(c2))  {
   2985             continue;
   2986         }
   2987 
   2988         // Rule (GB9)    Numeric x ALetter
   2989         if (fExtendSet->contains(c2))  {
   2990             continue;
   2991         }
   2992 
   2993         // Rule (GB9a)   x  SpacingMark
   2994         if (fSpacingSet->contains(c2)) {
   2995             continue;
   2996         }
   2997 
   2998         // Rule (GB9b)   Prepend x
   2999         if (fPrependSet->contains(c1)) {
   3000             continue;
   3001         }
   3002 
   3003         // Rule (GB10)  Any  <break>  Any
   3004         break;
   3005     }
   3006 
   3007     breakPos = p2;
   3008     return breakPos;
   3009 }
   3010 
   3011 
   3012 
   3013 UVector  *RBBICharMonkey::charClasses() {
   3014     return fSets;
   3015 }
   3016 
   3017 
   3018 RBBICharMonkey::~RBBICharMonkey() {
   3019     delete fSets;
   3020     delete fCRLFSet;
   3021     delete fControlSet;
   3022     delete fExtendSet;
   3023     delete fPrependSet;
   3024     delete fSpacingSet;
   3025     delete fLSet;
   3026     delete fVSet;
   3027     delete fTSet;
   3028     delete fLVSet;
   3029     delete fLVTSet;
   3030     delete fHangulSet;
   3031     delete fAnySet;
   3032 }
   3033 
   3034 //------------------------------------------------------------------------------------------
   3035 //
   3036 //   class RBBIWordMonkey      Word Break specific implementation
   3037 //                             of RBBIMonkeyKind.
   3038 //
   3039 //------------------------------------------------------------------------------------------
   3040 class RBBIWordMonkey: public RBBIMonkeyKind {
   3041 public:
   3042     RBBIWordMonkey();
   3043     virtual          ~RBBIWordMonkey();
   3044     virtual  UVector *charClasses();
   3045     virtual  void     setText(const UnicodeString &s);
   3046     virtual int32_t   next(int32_t i);
   3047 private:
   3048     UVector      *fSets;
   3049 
   3050     UnicodeSet  *fCRSet;
   3051     UnicodeSet  *fLFSet;
   3052     UnicodeSet  *fNewlineSet;
   3053     UnicodeSet  *fKatakanaSet;
   3054     UnicodeSet  *fALetterSet;
   3055     // TODO(jungshik): Do we still need this change?
   3056     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   3057     UnicodeSet  *fMidNumLetSet;
   3058     UnicodeSet  *fMidLetterSet;
   3059     UnicodeSet  *fMidNumSet;
   3060     UnicodeSet  *fNumericSet;
   3061     UnicodeSet  *fFormatSet;
   3062     UnicodeSet  *fOtherSet;
   3063     UnicodeSet  *fExtendSet;
   3064     UnicodeSet  *fExtendNumLetSet;
   3065     UnicodeSet  *fDictionaryCjkSet;
   3066 
   3067     RegexMatcher  *fMatcher;
   3068 
   3069     const UnicodeString  *fText;
   3070 };
   3071 
   3072 
   3073 RBBIWordMonkey::RBBIWordMonkey()
   3074 {
   3075     UErrorCode  status = U_ZERO_ERROR;
   3076 
   3077     fSets            = new UVector(status);
   3078 
   3079     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   3080     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   3081     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   3082     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
   3083     // Exclude Hangul syllables from ALetterSet during testing.
   3084     // Leave CJK dictionary characters out from the monkey tests!
   3085 #if 0
   3086     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   3087                                       "[\\p{Line_Break = Complex_Context}"
   3088                                       "-\\p{Grapheme_Cluster_Break = Extend}"
   3089                                       "-\\p{Grapheme_Cluster_Break = Control}"
   3090                                       "]]",
   3091                                       status);
   3092 #endif
   3093     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   3094     fALetterSet->removeAll(*fDictionaryCjkSet);
   3095     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   3096     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   3097     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   3098     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   3099     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"),      status);
   3100     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   3101     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   3102     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   3103 
   3104     fOtherSet        = new UnicodeSet();
   3105     if(U_FAILURE(status)) {
   3106       deferredStatus = status;
   3107       return;
   3108     }
   3109 
   3110     fOtherSet->complement();
   3111     fOtherSet->removeAll(*fCRSet);
   3112     fOtherSet->removeAll(*fLFSet);
   3113     fOtherSet->removeAll(*fNewlineSet);
   3114     fOtherSet->removeAll(*fKatakanaSet);
   3115     fOtherSet->removeAll(*fALetterSet);
   3116     fOtherSet->removeAll(*fMidLetterSet);
   3117     fOtherSet->removeAll(*fMidNumSet);
   3118     fOtherSet->removeAll(*fNumericSet);
   3119     fOtherSet->removeAll(*fExtendNumLetSet);
   3120     fOtherSet->removeAll(*fFormatSet);
   3121     fOtherSet->removeAll(*fExtendSet);
   3122     // Inhibit dictionary characters from being tested at all.
   3123     fOtherSet->removeAll(*fDictionaryCjkSet);
   3124     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   3125 
   3126     fSets->addElement(fCRSet,        status);
   3127     fSets->addElement(fLFSet,        status);
   3128     fSets->addElement(fNewlineSet,   status);
   3129     fSets->addElement(fALetterSet,   status);
   3130     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
   3131     fSets->addElement(fMidLetterSet, status);
   3132     fSets->addElement(fMidNumLetSet, status);
   3133     fSets->addElement(fMidNumSet,    status);
   3134     fSets->addElement(fNumericSet,   status);
   3135     fSets->addElement(fFormatSet,    status);
   3136     fSets->addElement(fExtendSet,    status);
   3137     fSets->addElement(fOtherSet,     status);
   3138     fSets->addElement(fExtendNumLetSet, status);
   3139 
   3140     if (U_FAILURE(status)) {
   3141         deferredStatus = status;
   3142     }
   3143 }
   3144 
   3145 void RBBIWordMonkey::setText(const UnicodeString &s) {
   3146     fText       = &s;
   3147 }
   3148 
   3149 
   3150 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   3151     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3152                               //   break position being tested.  The candidate break
   3153                               //   location is before p2.
   3154 
   3155     int     breakPos = -1;
   3156 
   3157     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3158 
   3159     if (U_FAILURE(deferredStatus)) {
   3160         return -1;
   3161     }
   3162 
   3163     // Prev break at end of string.  return DONE.
   3164     if (prevPos >= fText->length()) {
   3165         return -1;
   3166     }
   3167     p0 = p1 = p2 = p3 = prevPos;
   3168     c3 =  fText->char32At(prevPos);
   3169     c0 = c1 = c2 = 0;
   3170 
   3171     // Loop runs once per "significant" character position in the input text.
   3172     for (;;) {
   3173         // Move all of the positions forward in the input string.
   3174         p0 = p1;  c0 = c1;
   3175         p1 = p2;  c1 = c2;
   3176         p2 = p3;  c2 = c3;
   3177 
   3178         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3179         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   3180         do {
   3181             p3 = fText->moveIndex32(p3, 1);
   3182             c3 = fText->char32At(p3);
   3183             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   3184                break;
   3185             };
   3186         }
   3187         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   3188 
   3189 
   3190         if (p1 == p2) {
   3191             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3192             continue;
   3193         }
   3194         if (p2 == fText->length()) {
   3195             // Reached end of string.  Always a break position.
   3196             break;
   3197         }
   3198 
   3199         // Rule  (3)   CR x LF
   3200         //     No Extend or Format characters may appear between the CR and LF,
   3201         //     which requires the additional check for p2 immediately following p1.
   3202         //
   3203         if (c1==0x0D && c2==0x0A) {
   3204             continue;
   3205         }
   3206 
   3207         // Rule (3a)  Break before and after newlines (including CR and LF)
   3208         //
   3209         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   3210             break;
   3211         };
   3212         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   3213             break;
   3214         };
   3215 
   3216         // Rule (5).   ALetter x ALetter
   3217         if (fALetterSet->contains(c1) &&
   3218             fALetterSet->contains(c2))  {
   3219             continue;
   3220         }
   3221 
   3222         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   3223         //
   3224         if ( fALetterSet->contains(c1)   &&
   3225              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   3226              fALetterSet->contains(c3)) {
   3227             continue;
   3228         }
   3229 
   3230 
   3231         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   3232         if (fALetterSet->contains(c0) &&
   3233             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   3234             fALetterSet->contains(c2)) {
   3235             continue;
   3236         }
   3237 
   3238         // Rule (8)    Numeric x Numeric
   3239         if (fNumericSet->contains(c1) &&
   3240             fNumericSet->contains(c2))  {
   3241             continue;
   3242         }
   3243 
   3244         // Rule (9)    ALetter x Numeric
   3245         if (fALetterSet->contains(c1) &&
   3246             fNumericSet->contains(c2))  {
   3247             continue;
   3248         }
   3249 
   3250         // Rule (10)    Numeric x ALetter
   3251         if (fNumericSet->contains(c1) &&
   3252             fALetterSet->contains(c2))  {
   3253             continue;
   3254         }
   3255 
   3256         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   3257         if (fNumericSet->contains(c0) &&
   3258             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   3259             fNumericSet->contains(c2)) {
   3260             continue;
   3261         }
   3262 
   3263         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   3264         if (fNumericSet->contains(c1) &&
   3265             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   3266             fNumericSet->contains(c3)) {
   3267             continue;
   3268         }
   3269 
   3270         // Rule (13)  Katakana x Katakana
   3271         if (fKatakanaSet->contains(c1) &&
   3272             fKatakanaSet->contains(c2))  {
   3273             continue;
   3274         }
   3275 
   3276         // Rule 13a
   3277         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   3278              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   3279              fExtendNumLetSet->contains(c2)) {
   3280                 continue;
   3281              }
   3282 
   3283         // Rule 13b
   3284         if (fExtendNumLetSet->contains(c1) &&
   3285                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   3286                 fKatakanaSet->contains(c2)))  {
   3287                 continue;
   3288              }
   3289 
   3290         // Rule 14.  Break found here.
   3291         break;
   3292     }
   3293 
   3294     breakPos = p2;
   3295     return breakPos;
   3296 }
   3297 
   3298 
   3299 UVector  *RBBIWordMonkey::charClasses() {
   3300     return fSets;
   3301 }
   3302 
   3303 
   3304 RBBIWordMonkey::~RBBIWordMonkey() {
   3305     delete fSets;
   3306     delete fCRSet;
   3307     delete fLFSet;
   3308     delete fNewlineSet;
   3309     delete fKatakanaSet;
   3310     delete fALetterSet;
   3311     delete fMidNumLetSet;
   3312     delete fMidLetterSet;
   3313     delete fMidNumSet;
   3314     delete fNumericSet;
   3315     delete fFormatSet;
   3316     delete fExtendSet;
   3317     delete fExtendNumLetSet;
   3318     delete fOtherSet;
   3319 }
   3320 
   3321 
   3322 
   3323 
   3324 //------------------------------------------------------------------------------------------
   3325 //
   3326 //   class RBBISentMonkey      Sentence Break specific implementation
   3327 //                             of RBBIMonkeyKind.
   3328 //
   3329 //------------------------------------------------------------------------------------------
   3330 class RBBISentMonkey: public RBBIMonkeyKind {
   3331 public:
   3332     RBBISentMonkey();
   3333     virtual          ~RBBISentMonkey();
   3334     virtual  UVector *charClasses();
   3335     virtual  void     setText(const UnicodeString &s);
   3336     virtual int32_t   next(int32_t i);
   3337 private:
   3338     int               moveBack(int posFrom);
   3339     int               moveForward(int posFrom);
   3340     UChar32           cAt(int pos);
   3341 
   3342     UVector      *fSets;
   3343 
   3344     UnicodeSet  *fSepSet;
   3345     UnicodeSet  *fFormatSet;
   3346     UnicodeSet  *fSpSet;
   3347     UnicodeSet  *fLowerSet;
   3348     UnicodeSet  *fUpperSet;
   3349     UnicodeSet  *fOLetterSet;
   3350     UnicodeSet  *fNumericSet;
   3351     UnicodeSet  *fATermSet;
   3352     UnicodeSet  *fSContinueSet;
   3353     UnicodeSet  *fSTermSet;
   3354     UnicodeSet  *fCloseSet;
   3355     UnicodeSet  *fOtherSet;
   3356     UnicodeSet  *fExtendSet;
   3357 
   3358     const UnicodeString  *fText;
   3359 
   3360 };
   3361 
   3362 RBBISentMonkey::RBBISentMonkey()
   3363 {
   3364     UErrorCode  status = U_ZERO_ERROR;
   3365 
   3366     fSets            = new UVector(status);
   3367 
   3368     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   3369     //                       set and made into character classes of their own.  For the monkey impl,
   3370     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   3371     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   3372     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   3373     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   3374     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   3375     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   3376     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   3377     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   3378     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   3379     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   3380     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   3381     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   3382     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   3383     fOtherSet        = new UnicodeSet();
   3384 
   3385     if(U_FAILURE(status)) {
   3386       deferredStatus = status;
   3387       return;
   3388     }
   3389 
   3390     fOtherSet->complement();
   3391     fOtherSet->removeAll(*fSepSet);
   3392     fOtherSet->removeAll(*fFormatSet);
   3393     fOtherSet->removeAll(*fSpSet);
   3394     fOtherSet->removeAll(*fLowerSet);
   3395     fOtherSet->removeAll(*fUpperSet);
   3396     fOtherSet->removeAll(*fOLetterSet);
   3397     fOtherSet->removeAll(*fNumericSet);
   3398     fOtherSet->removeAll(*fATermSet);
   3399     fOtherSet->removeAll(*fSContinueSet);
   3400     fOtherSet->removeAll(*fSTermSet);
   3401     fOtherSet->removeAll(*fCloseSet);
   3402     fOtherSet->removeAll(*fExtendSet);
   3403 
   3404     fSets->addElement(fSepSet,       status);
   3405     fSets->addElement(fFormatSet,    status);
   3406     fSets->addElement(fSpSet,        status);
   3407     fSets->addElement(fLowerSet,     status);
   3408     fSets->addElement(fUpperSet,     status);
   3409     fSets->addElement(fOLetterSet,   status);
   3410     fSets->addElement(fNumericSet,   status);
   3411     fSets->addElement(fATermSet,     status);
   3412     fSets->addElement(fSContinueSet, status);
   3413     fSets->addElement(fSTermSet,     status);
   3414     fSets->addElement(fCloseSet,     status);
   3415     fSets->addElement(fOtherSet,     status);
   3416     fSets->addElement(fExtendSet,    status);
   3417 
   3418     if (U_FAILURE(status)) {
   3419         deferredStatus = status;
   3420     }
   3421 }
   3422 
   3423 
   3424 
   3425 void RBBISentMonkey::setText(const UnicodeString &s) {
   3426     fText       = &s;
   3427 }
   3428 
   3429 UVector  *RBBISentMonkey::charClasses() {
   3430     return fSets;
   3431 }
   3432 
   3433 
   3434 //  moveBack()   Find the "significant" code point preceding the index i.
   3435 //               Skips over ($Extend | $Format)* .
   3436 //
   3437 int RBBISentMonkey::moveBack(int i) {
   3438     if (i <= 0) {
   3439         return -1;
   3440     }
   3441     UChar32   c;
   3442     int32_t   j = i;
   3443     do {
   3444         j = fText->moveIndex32(j, -1);
   3445         c = fText->char32At(j);
   3446     }
   3447     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   3448     return j;
   3449 
   3450  }
   3451 
   3452 
   3453 int RBBISentMonkey::moveForward(int i) {
   3454     if (i>=fText->length()) {
   3455         return fText->length();
   3456     }
   3457     UChar32   c;
   3458     int32_t   j = i;
   3459     do {
   3460         j = fText->moveIndex32(j, 1);
   3461         c = cAt(j);
   3462     }
   3463     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   3464     return j;
   3465 }
   3466 
   3467 UChar32 RBBISentMonkey::cAt(int pos) {
   3468     if (pos<0 || pos>=fText->length()) {
   3469         return -1;
   3470     } else {
   3471         return fText->char32At(pos);
   3472     }
   3473 }
   3474 
   3475 int32_t RBBISentMonkey::next(int32_t prevPos) {
   3476     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3477                               //   break position being tested.  The candidate break
   3478                               //   location is before p2.
   3479 
   3480     int     breakPos = -1;
   3481 
   3482     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3483     UChar32 c;
   3484 
   3485     if (U_FAILURE(deferredStatus)) {
   3486         return -1;
   3487     }
   3488 
   3489     // Prev break at end of string.  return DONE.
   3490     if (prevPos >= fText->length()) {
   3491         return -1;
   3492     }
   3493     p0 = p1 = p2 = p3 = prevPos;
   3494     c3 =  fText->char32At(prevPos);
   3495     c0 = c1 = c2 = 0;
   3496 
   3497     // Loop runs once per "significant" character position in the input text.
   3498     for (;;) {
   3499         // Move all of the positions forward in the input string.
   3500         p0 = p1;  c0 = c1;
   3501         p1 = p2;  c1 = c2;
   3502         p2 = p3;  c2 = c3;
   3503 
   3504         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3505         p3 = moveForward(p3);
   3506         c3 = cAt(p3);
   3507 
   3508         // Rule (3)  CR x LF
   3509         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   3510             continue;
   3511         }
   3512 
   3513         // Rule (4).   Sep  <break>
   3514         if (fSepSet->contains(c1)) {
   3515             p2 = p1+1;   // Separators don't combine with Extend or Format.
   3516             break;
   3517         }
   3518 
   3519         if (p2 >= fText->length()) {
   3520             // Reached end of string.  Always a break position.
   3521             break;
   3522         }
   3523 
   3524         if (p2 == prevPos) {
   3525             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3526             continue;
   3527         }
   3528 
   3529         // Rule (6).   ATerm x Numeric
   3530         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   3531             continue;
   3532         }
   3533 
   3534         // Rule (7).  Upper ATerm  x  Uppper
   3535         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   3536             continue;
   3537         }
   3538 
   3539         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   3540         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   3541         //                  note to the Unicode 5.0 documents.
   3542         int p8 = p1;
   3543         while (fSpSet->contains(cAt(p8))) {
   3544             p8 = moveBack(p8);
   3545         }
   3546         while (fCloseSet->contains(cAt(p8))) {
   3547             p8 = moveBack(p8);
   3548         }
   3549         if (fATermSet->contains(cAt(p8))) {
   3550             p8=p2;
   3551             for (;;) {
   3552                 c = cAt(p8);
   3553                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   3554                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   3555                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   3556                     break;
   3557                 }
   3558                 p8 = moveForward(p8);
   3559             }
   3560             if (fLowerSet->contains(cAt(p8))) {
   3561                 continue;
   3562             }
   3563         }
   3564 
   3565         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   3566         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   3567             p8 = p1;
   3568             while (fSpSet->contains(cAt(p8))) {
   3569                 p8 = moveBack(p8);
   3570             }
   3571             while (fCloseSet->contains(cAt(p8))) {
   3572                 p8 = moveBack(p8);
   3573             }
   3574             c = cAt(p8);
   3575             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   3576                 continue;
   3577             }
   3578         }
   3579 
   3580         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   3581         int p9 = p1;
   3582         while (fCloseSet->contains(cAt(p9))) {
   3583             p9 = moveBack(p9);
   3584         }
   3585         c = cAt(p9);
   3586         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   3587             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3588                 continue;
   3589             }
   3590         }
   3591 
   3592         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   3593         int p10 = p1;
   3594         while (fSpSet->contains(cAt(p10))) {
   3595             p10 = moveBack(p10);
   3596         }
   3597         while (fCloseSet->contains(cAt(p10))) {
   3598             p10 = moveBack(p10);
   3599         }
   3600         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   3601             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3602                 continue;
   3603             }
   3604         }
   3605 
   3606         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   3607         int p11 = p1;
   3608         if (fSepSet->contains(cAt(p11))) {
   3609             p11 = moveBack(p11);
   3610         }
   3611         while (fSpSet->contains(cAt(p11))) {
   3612             p11 = moveBack(p11);
   3613         }
   3614         while (fCloseSet->contains(cAt(p11))) {
   3615             p11 = moveBack(p11);
   3616         }
   3617         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   3618             break;
   3619         }
   3620 
   3621         //  Rule (12)  Any x Any
   3622         continue;
   3623     }
   3624     breakPos = p2;
   3625     return breakPos;
   3626 }
   3627 
   3628 RBBISentMonkey::~RBBISentMonkey() {
   3629     delete fSets;
   3630     delete fSepSet;
   3631     delete fFormatSet;
   3632     delete fSpSet;
   3633     delete fLowerSet;
   3634     delete fUpperSet;
   3635     delete fOLetterSet;
   3636     delete fNumericSet;
   3637     delete fATermSet;
   3638     delete fSContinueSet;
   3639     delete fSTermSet;
   3640     delete fCloseSet;
   3641     delete fOtherSet;
   3642     delete fExtendSet;
   3643 }
   3644 
   3645 
   3646 
   3647 //-------------------------------------------------------------------------------------------
   3648 //
   3649 //  RBBILineMonkey
   3650 //
   3651 //-------------------------------------------------------------------------------------------
   3652 
   3653 class RBBILineMonkey: public RBBIMonkeyKind {
   3654 public:
   3655     RBBILineMonkey();
   3656     virtual          ~RBBILineMonkey();
   3657     virtual  UVector *charClasses();
   3658     virtual  void     setText(const UnicodeString &s);
   3659     virtual  int32_t  next(int32_t i);
   3660     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   3661 private:
   3662     UVector      *fSets;
   3663 
   3664     UnicodeSet  *fBK;
   3665     UnicodeSet  *fCR;
   3666     UnicodeSet  *fLF;
   3667     UnicodeSet  *fCM;
   3668     UnicodeSet  *fNL;
   3669     UnicodeSet  *fSG;
   3670     UnicodeSet  *fWJ;
   3671     UnicodeSet  *fZW;
   3672     UnicodeSet  *fGL;
   3673     UnicodeSet  *fCB;
   3674     UnicodeSet  *fSP;
   3675     UnicodeSet  *fB2;
   3676     UnicodeSet  *fBA;
   3677     UnicodeSet  *fBB;
   3678     UnicodeSet  *fHY;
   3679     UnicodeSet  *fH2;
   3680     UnicodeSet  *fH3;
   3681     UnicodeSet  *fCL;
   3682     UnicodeSet  *fCP;
   3683     UnicodeSet  *fEX;
   3684     UnicodeSet  *fIN;
   3685     UnicodeSet  *fJL;
   3686     UnicodeSet  *fJV;
   3687     UnicodeSet  *fJT;
   3688     UnicodeSet  *fNS;
   3689     UnicodeSet  *fOP;
   3690     UnicodeSet  *fQU;
   3691     UnicodeSet  *fIS;
   3692     UnicodeSet  *fNU;
   3693     UnicodeSet  *fPO;
   3694     UnicodeSet  *fPR;
   3695     UnicodeSet  *fSY;
   3696     UnicodeSet  *fAI;
   3697     UnicodeSet  *fAL;
   3698     UnicodeSet  *fID;
   3699     UnicodeSet  *fSA;
   3700     UnicodeSet  *fXX;
   3701 
   3702     BreakIterator  *fCharBI;
   3703 
   3704     const UnicodeString  *fText;
   3705     int32_t              *fOrigPositions;
   3706 
   3707     RegexMatcher         *fNumberMatcher;
   3708     RegexMatcher         *fLB11Matcher;
   3709 };
   3710 
   3711 
   3712 RBBILineMonkey::RBBILineMonkey()
   3713 {
   3714     UErrorCode  status = U_ZERO_ERROR;
   3715 
   3716     fSets  = new UVector(status);
   3717 
   3718     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3719     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3720     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3721     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3722     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3723     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3724     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3725     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3726     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3727     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3728     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3729     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3730     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3731     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3732     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3733     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3734     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3735     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3736     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3737     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3738     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3739     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3740     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3741     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3742     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3743     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3744     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3745     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3746     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3747     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3748     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3749     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3750     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3751     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3752     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3753     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3754     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3755 
   3756     if (U_FAILURE(status)) {
   3757         deferredStatus = status;
   3758         fCharBI = NULL;
   3759         fNumberMatcher = NULL;
   3760         return;
   3761     }
   3762 
   3763     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3764     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3765     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3766     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3767 
   3768     fSets->addElement(fBK, status);
   3769     fSets->addElement(fCR, status);
   3770     fSets->addElement(fLF, status);
   3771     fSets->addElement(fCM, status);
   3772     fSets->addElement(fNL, status);
   3773     fSets->addElement(fWJ, status);
   3774     fSets->addElement(fZW, status);
   3775     fSets->addElement(fGL, status);
   3776     fSets->addElement(fCB, status);
   3777     fSets->addElement(fSP, status);
   3778     fSets->addElement(fB2, status);
   3779     fSets->addElement(fBA, status);
   3780     fSets->addElement(fBB, status);
   3781     fSets->addElement(fHY, status);
   3782     fSets->addElement(fH2, status);
   3783     fSets->addElement(fH3, status);
   3784     fSets->addElement(fCL, status);
   3785     fSets->addElement(fCP, status);
   3786     fSets->addElement(fEX, status);
   3787     fSets->addElement(fIN, status);
   3788     fSets->addElement(fJL, status);
   3789     fSets->addElement(fJT, status);
   3790     fSets->addElement(fJV, status);
   3791     fSets->addElement(fNS, status);
   3792     fSets->addElement(fOP, status);
   3793     fSets->addElement(fQU, status);
   3794     fSets->addElement(fIS, status);
   3795     fSets->addElement(fNU, status);
   3796     fSets->addElement(fPO, status);
   3797     fSets->addElement(fPR, status);
   3798     fSets->addElement(fSY, status);
   3799     fSets->addElement(fAI, status);
   3800     fSets->addElement(fAL, status);
   3801     fSets->addElement(fID, status);
   3802     fSets->addElement(fWJ, status);
   3803     fSets->addElement(fSA, status);
   3804     fSets->addElement(fSG, status);
   3805 
   3806     const char *rules =
   3807             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3808             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3809             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3810             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3811             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3812             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3813 
   3814     fNumberMatcher = new RegexMatcher(
   3815         UnicodeString(rules, -1, US_INV), 0, status);
   3816 
   3817     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3818 
   3819     if (U_FAILURE(status)) {
   3820         deferredStatus = status;
   3821     }
   3822 }
   3823 
   3824 
   3825 void RBBILineMonkey::setText(const UnicodeString &s) {
   3826     fText       = &s;
   3827     fCharBI->setText(s);
   3828     fNumberMatcher->reset(s);
   3829 }
   3830 
   3831 //
   3832 //  rule9Adjust
   3833 //     Line Break TR rules 9 and 10 implementation.
   3834 //     This deals with combining marks and other sequences that
   3835 //     that must be treated as if they were something other than what they actually are.
   3836 //
   3837 //     This is factored out into a separate function because it must be applied twice for
   3838 //     each potential break, once to the chars before the position being checked, then
   3839 //     again to the text following the possible break.
   3840 //
   3841 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3842     if (pos == -1) {
   3843         // Invalid initial position.  Happens during the warmup iteration of the
   3844         //   main loop in next().
   3845         return;
   3846     }
   3847 
   3848     int32_t  nPos = *nextPos;
   3849 
   3850     // LB 9  Keep combining sequences together.
   3851     //  advance over any CM class chars.  Note that Line Break CM is different
   3852     //  from the normal Grapheme Extend property.
   3853     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3854           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3855         for (;;) {
   3856             *nextChar = fText->char32At(nPos);
   3857             if (!fCM->contains(*nextChar)) {
   3858                 break;
   3859             }
   3860             nPos = fText->moveIndex32(nPos, 1);
   3861         }
   3862     }
   3863 
   3864 
   3865     // LB 9 Treat X CM* as if it were x.
   3866     //       No explicit action required.
   3867 
   3868     // LB 10  Treat any remaining combining mark as AL
   3869     if (fCM->contains(*posChar)) {
   3870         *posChar = 0x41;   // thisChar = 'A';
   3871     }
   3872 
   3873     // Push the updated nextPos and nextChar back to our caller.
   3874     // This only makes a difference if posChar got bigger by consuming a
   3875     // combining sequence.
   3876     *nextPos  = nPos;
   3877     *nextChar = fText->char32At(nPos);
   3878 }
   3879 
   3880 
   3881 
   3882 int32_t RBBILineMonkey::next(int32_t startPos) {
   3883     UErrorCode status = U_ZERO_ERROR;
   3884     int32_t    pos;       //  Index of the char following a potential break position
   3885     UChar32    thisChar;  //  Character at above position "pos"
   3886 
   3887     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3888     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3889                           //   and thisChar may not be adjacent because combining
   3890                           //   characters between them will be ignored.
   3891 
   3892     int32_t    nextPos;   //  Index of the next character following pos.
   3893                           //     Usually skips over combining marks.
   3894     int32_t    nextCPPos; //  Index of the code point following "pos."
   3895                           //     May point to a combining mark.
   3896     int32_t    tPos;      //  temp value.
   3897     UChar32    c;
   3898 
   3899     if (U_FAILURE(deferredStatus)) {
   3900         return -1;
   3901     }
   3902 
   3903     if (startPos >= fText->length()) {
   3904         return -1;
   3905     }
   3906 
   3907 
   3908     // Initial values for loop.  Loop will run the first time without finding breaks,
   3909     //                           while the invalid values shift out and the "this" and
   3910     //                           "prev" positions are filled in with good values.
   3911     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3912     thisChar = prevChar  = 0;
   3913     nextPos  = nextCPPos = startPos;
   3914 
   3915 
   3916     // Loop runs once per position in the test text, until a break position
   3917     //  is found.
   3918     for (;;) {
   3919         prevPos   = pos;
   3920         prevChar  = thisChar;
   3921 
   3922         pos       = nextPos;
   3923         thisChar  = fText->char32At(pos);
   3924 
   3925         nextCPPos = fText->moveIndex32(pos, 1);
   3926         nextPos   = nextCPPos;
   3927 
   3928         // Rule LB2 - Break at end of text.
   3929         if (pos >= fText->length()) {
   3930             break;
   3931         }
   3932 
   3933         // Rule LB 9 - adjust for combining sequences.
   3934         //             We do this one out-of-order because the adjustment does not change anything
   3935         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3936         //             be applied.
   3937         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3938         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3939         c = fText->char32At(nextPos);
   3940         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3941 
   3942         // If the loop is still warming up - if we haven't shifted the initial
   3943         //   -1 positions out of prevPos yet - loop back to advance the
   3944         //    position in the input without any further looking for breaks.
   3945         if (prevPos == -1) {
   3946             continue;
   3947         }
   3948 
   3949         // LB 4  Always break after hard line breaks,
   3950         if (fBK->contains(prevChar)) {
   3951             break;
   3952         }
   3953 
   3954         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3955         if (prevChar == 0x0d && thisChar == 0x0a) {
   3956             continue;
   3957         }
   3958         if (prevChar == 0x0d ||
   3959             prevChar == 0x0a ||
   3960             prevChar == 0x85)  {
   3961             break;
   3962         }
   3963 
   3964         // LB 6  Don't break before hard line breaks
   3965         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3966             fBK->contains(thisChar)) {
   3967                 continue;
   3968         }
   3969 
   3970 
   3971         // LB 7  Don't break before spaces or zero-width space.
   3972         if (fSP->contains(thisChar)) {
   3973             continue;
   3974         }
   3975 
   3976         if (fZW->contains(thisChar)) {
   3977             continue;
   3978         }
   3979 
   3980         // LB 8  Break after zero width space
   3981         if (fZW->contains(prevChar)) {
   3982             break;
   3983         }
   3984 
   3985         // LB 9, 10  Already done, at top of loop.
   3986         //
   3987 
   3988 
   3989         // LB 11  Do not break before or after WORD JOINER and related characters.
   3990         //    x  WJ
   3991         //    WJ  x
   3992         //
   3993         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3994             continue;
   3995         }
   3996 
   3997         // LB 12
   3998         //    GL  x
   3999         if (fGL->contains(prevChar)) {
   4000             continue;
   4001         }
   4002 
   4003         // LB 12a
   4004         //    [^SP BA HY] x GL
   4005         if (!(fSP->contains(prevChar) ||
   4006               fBA->contains(prevChar) ||
   4007               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   4008             continue;
   4009         }
   4010 
   4011 
   4012 
   4013         // LB 13  Don't break before closings.
   4014         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   4015         //        fall into LB 17 and the more general number regular expression.
   4016         //
   4017         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   4018             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   4019                                          fEX->contains(thisChar)  ||
   4020             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   4021             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   4022             continue;
   4023         }
   4024 
   4025         // LB 14 Don't break after OP SP*
   4026         //       Scan backwards, checking for this sequence.
   4027         //       The OP char could include combining marks, so we actually check for
   4028         //           OP CM* SP*
   4029         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   4030         //       sequence into a ID char, so before scanning back through spaces,
   4031         //       verify that prevChar is indeed a space.  The prevChar variable
   4032         //       may differ from fText[prevPos]
   4033         tPos = prevPos;
   4034         if (fSP->contains(prevChar)) {
   4035             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   4036                 tPos=fText->moveIndex32(tPos, -1);
   4037             }
   4038         }
   4039         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   4040             tPos=fText->moveIndex32(tPos, -1);
   4041         }
   4042         if (fOP->contains(fText->char32At(tPos))) {
   4043             continue;
   4044         }
   4045 
   4046 
   4047         // LB 15    QU SP* x OP
   4048         if (fOP->contains(thisChar)) {
   4049             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   4050             int tPos = prevPos;
   4051             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   4052                 tPos = fText->moveIndex32(tPos, -1);
   4053             }
   4054             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   4055                 tPos = fText->moveIndex32(tPos, -1);
   4056             }
   4057             if (fQU->contains(fText->char32At(tPos))) {
   4058                 continue;
   4059             }
   4060         }
   4061 
   4062 
   4063 
   4064         // LB 16   (CL | CP) SP* x NS
   4065         //    Scan backwards for SP* CM* (CL | CP)
   4066         if (fNS->contains(thisChar)) {
   4067             int tPos = prevPos;
   4068             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   4069                 tPos = fText->moveIndex32(tPos, -1);
   4070             }
   4071             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   4072                 tPos = fText->moveIndex32(tPos, -1);
   4073             }
   4074             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   4075                 continue;
   4076             }
   4077         }
   4078 
   4079 
   4080         // LB 17        B2 SP* x B2
   4081         if (fB2->contains(thisChar)) {
   4082             //  Scan backwards, checking for the B2 CM* SP* sequence.
   4083             tPos = prevPos;
   4084             if (fSP->contains(prevChar)) {
   4085                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   4086                     tPos=fText->moveIndex32(tPos, -1);
   4087                 }
   4088             }
   4089             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   4090                 tPos=fText->moveIndex32(tPos, -1);
   4091             }
   4092             if (fB2->contains(fText->char32At(tPos))) {
   4093                 continue;
   4094             }
   4095         }
   4096 
   4097 
   4098         // LB 18    break after space
   4099         if (fSP->contains(prevChar)) {
   4100             break;
   4101         }
   4102 
   4103         // LB 19
   4104         //    x   QU
   4105         //    QU  x
   4106         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   4107             continue;
   4108         }
   4109 
   4110         // LB 20  Break around a CB
   4111         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   4112             break;
   4113         }
   4114 
   4115         // LB 21
   4116         if (fBA->contains(thisChar) ||
   4117             fHY->contains(thisChar) ||
   4118             fNS->contains(thisChar) ||
   4119             fBB->contains(prevChar) )   {
   4120             continue;
   4121         }
   4122 
   4123         // LB 22
   4124         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   4125             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   4126             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   4127             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   4128             continue;
   4129         }
   4130 
   4131 
   4132         // LB 23    ID x PO
   4133         //          AL x NU
   4134         //          NU x AL
   4135         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   4136             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   4137             (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
   4138             continue;
   4139         }
   4140 
   4141         // LB 24  Do not break between prefix and letters or ideographs.
   4142         //        PR x ID
   4143         //        PR x AL
   4144         //        PO x AL
   4145         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   4146             (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
   4147             (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
   4148             continue;
   4149         }
   4150 
   4151 
   4152 
   4153         // LB 25    Numbers
   4154         if (fNumberMatcher->lookingAt(prevPos, status)) {
   4155             if (U_FAILURE(status)) {
   4156                 break;
   4157             }
   4158             // Matched a number.  But could have been just a single digit, which would
   4159             //    not represent a "no break here" between prevChar and thisChar
   4160             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   4161             if (numEndIdx > pos) {
   4162                 // Number match includes at least our two chars being checked
   4163                 if (numEndIdx > nextPos) {
   4164                     // Number match includes additional chars.  Update pos and nextPos
   4165                     //   so that next loop iteration will continue at the end of the number,
   4166                     //   checking for breaks between last char in number & whatever follows.
   4167                     pos = nextPos = numEndIdx;
   4168                     do {
   4169                         pos = fText->moveIndex32(pos, -1);
   4170                         thisChar = fText->char32At(pos);
   4171                     } while (fCM->contains(thisChar));
   4172                 }
   4173                 continue;
   4174             }
   4175         }
   4176 
   4177 
   4178         // LB 26 Do not break a Korean syllable.
   4179         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   4180                                         fJV->contains(thisChar) ||
   4181                                         fH2->contains(thisChar) ||
   4182                                         fH3->contains(thisChar))) {
   4183                                             continue;
   4184                                         }
   4185 
   4186         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   4187             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   4188                 continue;
   4189         }
   4190 
   4191         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   4192             fJT->contains(thisChar)) {
   4193                 continue;
   4194         }
   4195 
   4196         // LB 27 Treat a Korean Syllable Block the same as ID.
   4197         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   4198             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   4199             fIN->contains(thisChar)) {
   4200                 continue;
   4201             }
   4202         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   4203             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   4204             fPO->contains(thisChar)) {
   4205                 continue;
   4206             }
   4207         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   4208             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   4209                 continue;
   4210             }
   4211 
   4212 
   4213 
   4214         // LB 28  Do not break between alphabetics ("at").
   4215         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   4216             continue;
   4217         }
   4218 
   4219         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   4220         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   4221             continue;
   4222         }
   4223 
   4224         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   4225         //          (AL | NU) x OP
   4226         //          CP x (AL | NU)
   4227         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   4228             continue;
   4229         }
   4230         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
   4231             continue;
   4232         }
   4233 
   4234         // LB 31    Break everywhere else
   4235         break;
   4236 
   4237     }
   4238 
   4239     return pos;
   4240 }
   4241 
   4242 
   4243 UVector  *RBBILineMonkey::charClasses() {
   4244     return fSets;
   4245 }
   4246 
   4247 
   4248 RBBILineMonkey::~RBBILineMonkey() {
   4249     delete fSets;
   4250 
   4251     delete fBK;
   4252     delete fCR;
   4253     delete fLF;
   4254     delete fCM;
   4255     delete fNL;
   4256     delete fWJ;
   4257     delete fZW;
   4258     delete fGL;
   4259     delete fCB;
   4260     delete fSP;
   4261     delete fB2;
   4262     delete fBA;
   4263     delete fBB;
   4264     delete fHY;
   4265     delete fH2;
   4266     delete fH3;
   4267     delete fCL;
   4268     delete fCP;
   4269     delete fEX;
   4270     delete fIN;
   4271     delete fJL;
   4272     delete fJV;
   4273     delete fJT;
   4274     delete fNS;
   4275     delete fOP;
   4276     delete fQU;
   4277     delete fIS;
   4278     delete fNU;
   4279     delete fPO;
   4280     delete fPR;
   4281     delete fSY;
   4282     delete fAI;
   4283     delete fAL;
   4284     delete fID;
   4285     delete fSA;
   4286     delete fSG;
   4287     delete fXX;
   4288 
   4289     delete fCharBI;
   4290     delete fNumberMatcher;
   4291 }
   4292 
   4293 
   4294 //-------------------------------------------------------------------------------------------
   4295 //
   4296 //   TestMonkey
   4297 //
   4298 //     params
   4299 //       seed=nnnnn        Random number starting seed.
   4300 //                         Setting the seed allows errors to be reproduced.
   4301 //       loop=nnn          Looping count.  Controls running time.
   4302 //                         -1:  run forever.
   4303 //                          0 or greater:  run length.
   4304 //
   4305 //       type = char | word | line | sent | title
   4306 //
   4307 //-------------------------------------------------------------------------------------------
   4308 
   4309 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   4310     int32_t val = defaultVal;
   4311     name.append(" *= *(-?\\d+)");
   4312     UErrorCode status = U_ZERO_ERROR;
   4313     RegexMatcher m(name, params, 0, status);
   4314     if (m.find()) {
   4315         // The param exists.  Convert the string to an int.
   4316         char valString[100];
   4317         int32_t paramLength = m.end(1, status) - m.start(1, status);
   4318         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   4319             paramLength = (int32_t)(sizeof(valString)-2);
   4320         }
   4321         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   4322         val = strtol(valString,  NULL, 10);
   4323 
   4324         // Delete this parameter from the params string.
   4325         m.reset();
   4326         params = m.replaceFirst("", status);
   4327     }
   4328     U_ASSERT(U_SUCCESS(status));
   4329     return val;
   4330 }
   4331 #endif
   4332 
   4333 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   4334                                     BreakIterator *bi,
   4335                                     int expected[],
   4336                                     int expectedcount)
   4337 {
   4338     int count = 0;
   4339     int i = 0;
   4340     int forward[50];
   4341     bi->setText(ustr);
   4342     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4343         forward[count] = i;
   4344         if (count < expectedcount && expected[count] != i) {
   4345             test->errln("break forward test failed: expected %d but got %d",
   4346                         expected[count], i);
   4347             break;
   4348         }
   4349         count ++;
   4350     }
   4351     if (count != expectedcount) {
   4352         printStringBreaks(ustr, expected, expectedcount);
   4353         test->errln("break forward test failed: missed %d match",
   4354                     expectedcount - count);
   4355         return;
   4356     }
   4357     // testing boundaries
   4358     for (i = 1; i < expectedcount; i ++) {
   4359         int j = expected[i - 1];
   4360         if (!bi->isBoundary(j)) {
   4361             printStringBreaks(ustr, expected, expectedcount);
   4362             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   4363             return;
   4364         }
   4365         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   4366             if (bi->isBoundary(j)) {
   4367                 printStringBreaks(ustr, expected, expectedcount);
   4368                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   4369                 return;
   4370             }
   4371         }
   4372     }
   4373 
   4374     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   4375         count --;
   4376         if (forward[count] != i) {
   4377             printStringBreaks(ustr, expected, expectedcount);
   4378             test->errln("happy break test previous() failed: expected %d but got %d",
   4379                         forward[count], i);
   4380             break;
   4381         }
   4382     }
   4383     if (count != 0) {
   4384         printStringBreaks(ustr, expected, expectedcount);
   4385         test->errln("break test previous() failed: missed a match");
   4386         return;
   4387     }
   4388 
   4389     // testing preceding
   4390     for (i = 0; i < expectedcount - 1; i ++) {
   4391         // int j = expected[i] + 1;
   4392         int j = ustr.moveIndex32(expected[i], 1);
   4393         for (; j <= expected[i + 1]; j ++) {
   4394             if (bi->preceding(j) != expected[i]) {
   4395                 printStringBreaks(ustr, expected, expectedcount);
   4396                 test->errln("preceding(): Not expecting boundary at position %d", j);
   4397                 return;
   4398             }
   4399         }
   4400     }
   4401 }
   4402 
   4403 void RBBITest::TestWordBreaks(void)
   4404 {
   4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4406 
   4407     Locale        locale("en");
   4408     UErrorCode    status = U_ZERO_ERROR;
   4409     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4410     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4411     // Replaced any C+J characters in a row with a random sequence of characters
   4412     // of the same length to make our C+J segmentation not get in the way.
   4413     static const char *strlist[] =
   4414     {
   4415     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   4416     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   4417     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   4418     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   4419     "\\uac00\\u3588\\u009c\\u0953\\u194b",
   4420     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4421     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   4422     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   4423     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4424     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4425     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4426     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4427     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4428     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4429     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   4430     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4431     "\\u0027\\u11af\\U000e0057\\u0602",
   4432     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4433     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4434     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4435     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4436     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4437     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4438     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4439     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4440     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4441     "\\u18f4\\U000e0049\\u20e7\\u2027",
   4442     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4443     "\\ua183\\u102d\\u0bec\\u003a",
   4444     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4445     "\\u003a\\u0e57\\u0fad\\u002e",
   4446     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4447     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4448     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   4449     "\\u003a\\u0664\\u00b7\\u1fba",
   4450     "\\u003b\\u0027\\u00b7\\u47a3",
   4451     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   4452     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   4453     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   4454     };
   4455     int loop;
   4456     if (U_FAILURE(status)) {
   4457         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4458         return;
   4459     }
   4460     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4461         // printf("looping %d\n", loop);
   4462         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   4463         // RBBICharMonkey monkey;
   4464         RBBIWordMonkey monkey;
   4465 
   4466         int expected[50];
   4467         int expectedcount = 0;
   4468 
   4469         monkey.setText(ustr);
   4470         int i;
   4471         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4472             expected[expectedcount ++] = i;
   4473         }
   4474 
   4475         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4476     }
   4477     delete bi;
   4478 #endif
   4479 }
   4480 
   4481 void RBBITest::TestWordBoundary(void)
   4482 {
   4483     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   4484     Locale        locale("en");
   4485     UErrorCode    status = U_ZERO_ERROR;
   4486     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4487     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4488     UChar         str[50];
   4489     static const char *strlist[] =
   4490     {
   4491     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4492     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4493     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4494     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4495     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4496     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4497     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4498     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4499     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4500     "\\u0027\\u11af\\U000e0057\\u0602",
   4501     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4502     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4503     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4504     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4505     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4506     "\\U000e0065\\u302c\\u09ee\\U000e0068",
   4507     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4508     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4509     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4510     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4511     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4512     "\\ua183\\u102d\\u0bec\\u003a",
   4513     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4514     "\\u003a\\u0e57\\u0fad\\u002e",
   4515     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4516     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4517     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4518     "\\u003a\\u0664\\u00b7\\u1fba",
   4519     "\\u003b\\u0027\\u00b7\\u47a3",
   4520     };
   4521     int loop;
   4522     if (U_FAILURE(status)) {
   4523         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4524         return;
   4525     }
   4526     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4527         // printf("looping %d\n", loop);
   4528         u_unescape(strlist[loop], str, 20);
   4529         UnicodeString ustr(str);
   4530         int forward[50];
   4531         int count = 0;
   4532 
   4533         bi->setText(ustr);
   4534         int prev = 0;
   4535         int i;
   4536         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4537             forward[count ++] = i;
   4538             if (i > prev) {
   4539                 int j;
   4540                 for (j = prev + 1; j < i; j ++) {
   4541                     if (bi->isBoundary(j)) {
   4542                         printStringBreaks(ustr, forward, count);
   4543                         errln("happy boundary test failed: expected %d not a boundary",
   4544                                j);
   4545                         return;
   4546                     }
   4547                 }
   4548             }
   4549             if (!bi->isBoundary(i)) {
   4550                 printStringBreaks(ustr, forward, count);
   4551                 errln("happy boundary test failed: expected %d a boundary",
   4552                        i);
   4553                 return;
   4554             }
   4555             prev = i;
   4556         }
   4557     }
   4558     delete bi;
   4559 }
   4560 
   4561 void RBBITest::TestLineBreaks(void)
   4562 {
   4563 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4564     Locale        locale("en");
   4565     UErrorCode    status = U_ZERO_ERROR;
   4566     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4567     const int32_t  STRSIZE = 50;
   4568     UChar         str[STRSIZE];
   4569     static const char *strlist[] =
   4570     {
   4571      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4572      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4573              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4574      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4575              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4576      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4577      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4578      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4579      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4580      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4581      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4582      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4583      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4584      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4585      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4586      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4587      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4588      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4589      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4590      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4591      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4592      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4593      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4594      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4595      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4596      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4597      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4598      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4599      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4600      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4601      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4602      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4603      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4604      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4605      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4606      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4607      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4608      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4609      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4610      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4611      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4612      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4613          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4614          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4615          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4616      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4617          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4618     };
   4619     int loop;
   4620     TEST_ASSERT_SUCCESS(status);
   4621     if (U_FAILURE(status)) {
   4622         return;
   4623     }
   4624     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4625         // printf("looping %d\n", loop);
   4626         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4627         if (t >= STRSIZE) {
   4628             TEST_ASSERT(FALSE);
   4629             continue;
   4630         }
   4631 
   4632 
   4633         UnicodeString ustr(str);
   4634         RBBILineMonkey monkey;
   4635         if (U_FAILURE(monkey.deferredStatus)) {
   4636             continue;
   4637         }
   4638 
   4639         const int EXPECTEDSIZE = 50;
   4640         int expected[EXPECTEDSIZE];
   4641         int expectedcount = 0;
   4642 
   4643         monkey.setText(ustr);
   4644         int i;
   4645         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4646             if (expectedcount >= EXPECTEDSIZE) {
   4647                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4648                 return;
   4649             }
   4650             expected[expectedcount ++] = i;
   4651         }
   4652 
   4653         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4654     }
   4655     delete bi;
   4656 #endif
   4657 }
   4658 
   4659 void RBBITest::TestSentBreaks(void)
   4660 {
   4661 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4662     Locale        locale("en");
   4663     UErrorCode    status = U_ZERO_ERROR;
   4664     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4665     UChar         str[200];
   4666     static const char *strlist[] =
   4667     {
   4668      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4669      "This\n",
   4670      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4671      "\"Sentence ending with a quote.\" Bye.",
   4672      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4673      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4674      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4675      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4676      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4677      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4678      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4679              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4680              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4681              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4682      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4683              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4684              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4685              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4686              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4687              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4688     };
   4689     int loop;
   4690     if (U_FAILURE(status)) {
   4691         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4692         return;
   4693     }
   4694     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4695         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4696         UnicodeString ustr(str);
   4697 
   4698         RBBISentMonkey monkey;
   4699         if (U_FAILURE(monkey.deferredStatus)) {
   4700             continue;
   4701         }
   4702 
   4703         const int EXPECTEDSIZE = 50;
   4704         int expected[EXPECTEDSIZE];
   4705         int expectedcount = 0;
   4706 
   4707         monkey.setText(ustr);
   4708         int i;
   4709         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4710             if (expectedcount >= EXPECTEDSIZE) {
   4711                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4712                 return;
   4713             }
   4714             expected[expectedcount ++] = i;
   4715         }
   4716 
   4717         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4718     }
   4719     delete bi;
   4720 #endif
   4721 }
   4722 
   4723 void RBBITest::TestMonkey(char *params) {
   4724 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4725 
   4726     UErrorCode     status    = U_ZERO_ERROR;
   4727     int32_t        loopCount = 500;
   4728     int32_t        seed      = 1;
   4729     UnicodeString  breakType = "all";
   4730     Locale         locale("en");
   4731     UBool          useUText  = FALSE;
   4732 
   4733     if (quick == FALSE) {
   4734         loopCount = 10000;
   4735     }
   4736 
   4737     if (params) {
   4738         UnicodeString p(params);
   4739         loopCount = getIntParam("loop", p, loopCount);
   4740         seed      = getIntParam("seed", p, seed);
   4741 
   4742         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4743         if (m.find()) {
   4744             breakType = m.group(1, status);
   4745             m.reset();
   4746             p = m.replaceFirst("", status);
   4747         }
   4748 
   4749         RegexMatcher u(" *utext", p, 0, status);
   4750         if (u.find()) {
   4751             useUText = TRUE;
   4752             u.reset();
   4753             p = u.replaceFirst("", status);
   4754         }
   4755 
   4756 
   4757         // m.reset(p);
   4758         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4759             // Each option is stripped out of the option string as it is processed.
   4760             // All options have been checked.  The option string should have been completely emptied..
   4761             char buf[100];
   4762             p.extract(buf, sizeof(buf), NULL, status);
   4763             buf[sizeof(buf)-1] = 0;
   4764             errln("Unrecognized or extra parameter:  %s\n", buf);
   4765             return;
   4766         }
   4767 
   4768     }
   4769 
   4770     if (breakType == "char" || breakType == "all") {
   4771         RBBICharMonkey  m;
   4772         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4773         if (U_SUCCESS(status)) {
   4774             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4775             if (breakType == "all" && useUText==FALSE) {
   4776                 // Also run a quick test with UText when "all" is specified
   4777                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4778             }
   4779         }
   4780         else {
   4781             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4782         }
   4783         delete bi;
   4784     }
   4785 
   4786     if (breakType == "word" || breakType == "all") {
   4787         logln("Word Break Monkey Test");
   4788         RBBIWordMonkey  m;
   4789         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4790         if (U_SUCCESS(status)) {
   4791             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4792         }
   4793         else {
   4794             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4795         }
   4796         delete bi;
   4797     }
   4798 
   4799     if (breakType == "line" || breakType == "all") {
   4800         logln("Line Break Monkey Test");
   4801         RBBILineMonkey  m;
   4802         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4803         if (loopCount >= 10) {
   4804             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4805         }
   4806         if (U_SUCCESS(status)) {
   4807             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4808         }
   4809         else {
   4810             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4811         }
   4812         delete bi;
   4813     }
   4814 
   4815     if (breakType == "sent" || breakType == "all"  ) {
   4816         logln("Sentence Break Monkey Test");
   4817         RBBISentMonkey  m;
   4818         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4819         if (loopCount >= 10) {
   4820             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4821         }
   4822         if (U_SUCCESS(status)) {
   4823             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4824         }
   4825         else {
   4826             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4827         }
   4828         delete bi;
   4829     }
   4830 
   4831 #endif
   4832 }
   4833 
   4834 //
   4835 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4836 //    Parameters:
   4837 //       bi      - the break iterator to use
   4838 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4839 //       name    - Name of test (char, word, etc.) for use in error messages
   4840 //       seed    - Seed for starting random number generator (parameter from user)
   4841 //       numIterations
   4842 //
   4843 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4844                          int32_t numIterations, UBool useUText) {
   4845 
   4846 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4847 
   4848     const int32_t    TESTSTRINGLEN = 500;
   4849     UnicodeString    testText;
   4850     int32_t          numCharClasses;
   4851     UVector          *chClasses;
   4852     int              expected[TESTSTRINGLEN*2 + 1];
   4853     int              expectedCount = 0;
   4854     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4855     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4856     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4857     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4858     char             followingBreaks[TESTSTRINGLEN*2+1];
   4859     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4860     int              i;
   4861     int              loopCount = 0;
   4862 
   4863     m_seed = seed;
   4864 
   4865     numCharClasses = mk.charClasses()->size();
   4866     chClasses      = mk.charClasses();
   4867 
   4868     // Check for errors that occured during the construction of the MonkeyKind object.
   4869     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4870     //  and is not visible outside of RBBITest :-(
   4871     if (U_FAILURE(mk.deferredStatus)) {
   4872         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4873         return;
   4874     }
   4875 
   4876     // Verify that the character classes all have at least one member.
   4877     for (i=0; i<numCharClasses; i++) {
   4878         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4879         if (s == NULL || s->size() == 0) {
   4880             errln("Character Class #%d is null or of zero size.", i);
   4881             return;
   4882         }
   4883     }
   4884 
   4885     while (loopCount < numIterations || numIterations == -1) {
   4886         if (numIterations == -1 && loopCount % 10 == 0) {
   4887             // If test is running in an infinite loop, display a periodic tic so
   4888             //   we can tell that it is making progress.
   4889             fprintf(stderr, ".");
   4890         }
   4891         // Save current random number seed, so that we can recreate the random numbers
   4892         //   for this loop iteration in event of an error.
   4893         seed = m_seed;
   4894 
   4895         // Populate a test string with data.
   4896         testText.truncate(0);
   4897         for (i=0; i<TESTSTRINGLEN; i++) {
   4898             int32_t  aClassNum = m_rand() % numCharClasses;
   4899             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4900             int32_t   charIdx = m_rand() % classSet->size();
   4901             UChar32   c = classSet->charAt(charIdx);
   4902             if (c < 0) {   // TODO:  deal with sets containing strings.
   4903                 errln("c < 0");
   4904                 break;
   4905             }
   4906             testText.append(c);
   4907         }
   4908 
   4909         // Calculate the expected results for this test string.
   4910         mk.setText(testText);
   4911         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4912         expectedBreaks[0] = 1;
   4913         int32_t breakPos = 0;
   4914         expectedCount = 0;
   4915         for (;;) {
   4916             breakPos = mk.next(breakPos);
   4917             if (breakPos == -1) {
   4918                 break;
   4919             }
   4920             if (breakPos > testText.length()) {
   4921                 errln("breakPos > testText.length()");
   4922             }
   4923             expectedBreaks[breakPos] = 1;
   4924             U_ASSERT(expectedCount<testText.length());
   4925             expected[expectedCount ++] = breakPos;
   4926         }
   4927 
   4928         // Find the break positions using forward iteration
   4929         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4930         if (useUText) {
   4931             UErrorCode status = U_ZERO_ERROR;
   4932             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4933             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4934             bi->setText(testUText, status);
   4935             TEST_ASSERT_SUCCESS(status);
   4936             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4937                                       //  This UText can be closed immediately, so long as the
   4938                                       //  testText string continues to exist.
   4939         } else {
   4940             bi->setText(testText);
   4941         }
   4942 
   4943         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4944             if (i < 0 || i > testText.length()) {
   4945                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4946                 break;
   4947             }
   4948             forwardBreaks[i] = 1;
   4949         }
   4950 
   4951         // Find the break positions using reverse iteration
   4952         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4953         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4954             if (i < 0 || i > testText.length()) {
   4955                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4956                 break;
   4957             }
   4958             reverseBreaks[i] = 1;
   4959         }
   4960 
   4961         // Find the break positions using isBoundary() tests.
   4962         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4963         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4964         for (i=0; i<=testText.length(); i++) {
   4965             isBoundaryBreaks[i] = bi->isBoundary(i);
   4966         }
   4967 
   4968 
   4969         // Find the break positions using the following() function.
   4970         // printf(".");
   4971         memset(followingBreaks, 0, sizeof(followingBreaks));
   4972         int32_t   lastBreakPos = 0;
   4973         followingBreaks[0] = 1;
   4974         for (i=0; i<testText.length(); i++) {
   4975             breakPos = bi->following(i);
   4976             if (breakPos <= i ||
   4977                 breakPos < lastBreakPos ||
   4978                 breakPos > testText.length() ||
   4979                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4980                 errln("%s break monkey test: "
   4981                     "Out of range value returned by BreakIterator::following().\n"
   4982                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4983                          name, seed, i, breakPos, lastBreakPos);
   4984                 break;
   4985             }
   4986             followingBreaks[breakPos] = 1;
   4987             lastBreakPos = breakPos;
   4988         }
   4989 
   4990         // Find the break positions using the preceding() function.
   4991         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4992         lastBreakPos = testText.length();
   4993         precedingBreaks[testText.length()] = 1;
   4994         for (i=testText.length(); i>0; i--) {
   4995             breakPos = bi->preceding(i);
   4996             if (breakPos >= i ||
   4997                 breakPos > lastBreakPos ||
   4998                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4999                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   5000                 errln("%s break monkey test: "
   5001                     "Out of range value returned by BreakIterator::preceding().\n"
   5002                     "index=%d;  prev returned %d; lastBreak=%d" ,
   5003                     name,  i, breakPos, lastBreakPos);
   5004                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   5005                     precedingBreaks[i] = 2;   // Forces an error.
   5006                 }
   5007             } else {
   5008                 if (breakPos >= 0) {
   5009                     precedingBreaks[breakPos] = 1;
   5010                 }
   5011                 lastBreakPos = breakPos;
   5012             }
   5013         }
   5014 
   5015         // Compare the expected and actual results.
   5016         for (i=0; i<=testText.length(); i++) {
   5017             const char *errorType = NULL;
   5018             if  (forwardBreaks[i] != expectedBreaks[i]) {
   5019                 errorType = "next()";
   5020             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   5021                 errorType = "previous()";
   5022             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   5023                 errorType = "isBoundary()";
   5024             } else if (followingBreaks[i] != expectedBreaks[i]) {
   5025                 errorType = "following()";
   5026             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   5027                 errorType = "preceding()";
   5028             }
   5029 
   5030 
   5031             if (errorType != NULL) {
   5032                 // Format a range of the test text that includes the failure as
   5033                 //  a data item that can be included in the rbbi test data file.
   5034 
   5035                 // Start of the range is the last point where expected and actual results
   5036                 //   both agreed that there was a break position.
   5037                 int startContext = i;
   5038                 int32_t count = 0;
   5039                 for (;;) {
   5040                     if (startContext==0) { break; }
   5041                     startContext --;
   5042                     if (expectedBreaks[startContext] != 0) {
   5043                         if (count == 2) break;
   5044                         count ++;
   5045                     }
   5046                 }
   5047 
   5048                 // End of range is two expected breaks past the start position.
   5049                 int endContext = i + 1;
   5050                 int ci;
   5051                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   5052                     for (;;) {
   5053                         if (endContext >= testText.length()) {break;}
   5054                         if (expectedBreaks[endContext-1] != 0) {
   5055                             if (count == 0) break;
   5056                             count --;
   5057                         }
   5058                         endContext ++;
   5059                     }
   5060                 }
   5061 
   5062                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   5063                 UnicodeString errorText = "<data>";
   5064                 /***if (strcmp(errorType, "next()") == 0) {
   5065                     startContext = 0;
   5066                     endContext = testText.length();
   5067 
   5068                     printStringBreaks(testText, expected, expectedCount);
   5069                 }***/
   5070 
   5071                 for (ci=startContext; ci<endContext;) {
   5072                     UnicodeString hexChars("0123456789abcdef");
   5073                     UChar32  c;
   5074                     int      bn;
   5075                     c = testText.char32At(ci);
   5076                     if (ci == i) {
   5077                         // This is the location of the error.
   5078                         errorText.append("<?>");
   5079                     } else if (expectedBreaks[ci] != 0) {
   5080                         // This a non-error expected break position.
   5081                         errorText.append("\\");
   5082                     }
   5083                     if (c < 0x10000) {
   5084                         errorText.append("\\u");
   5085                         for (bn=12; bn>=0; bn-=4) {
   5086                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   5087                         }
   5088                     } else {
   5089                         errorText.append("\\U");
   5090                         for (bn=28; bn>=0; bn-=4) {
   5091                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   5092                         }
   5093                     }
   5094                     ci = testText.moveIndex32(ci, 1);
   5095                 }
   5096                 errorText.append("\\");
   5097                 errorText.append("</data>\n");
   5098 
   5099                 // Output the error
   5100                 char  charErrorTxt[500];
   5101                 UErrorCode status = U_ZERO_ERROR;
   5102                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   5103                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   5104                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   5105                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   5106                     errorType, seed, i, charErrorTxt);
   5107                 break;
   5108             }
   5109         }
   5110 
   5111         loopCount++;
   5112     }
   5113 #endif
   5114 }
   5115 
   5116 
   5117 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   5118 //             This test checks the initial patch,
   5119 //             which is to just keep it from crashing.  Correct word boundaries
   5120 //             await a proper fix to the dictionary code.
   5121 //
   5122 void RBBITest::TestBug5532(void)  {
   5123    // Text includes a mixture of Thai and Latin.
   5124    const unsigned char utf8Data[] = {
   5125            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   5126            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   5127            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   5128            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   5129            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   5130            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   5131            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   5132            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   5133            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   5134            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   5135            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   5136 
   5137     UErrorCode status = U_ZERO_ERROR;
   5138     UText utext=UTEXT_INITIALIZER;
   5139     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   5140     TEST_ASSERT_SUCCESS(status);
   5141 
   5142     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   5143     TEST_ASSERT_SUCCESS(status);
   5144     if (U_SUCCESS(status)) {
   5145         bi->setText(&utext, status);
   5146         TEST_ASSERT_SUCCESS(status);
   5147 
   5148         int32_t breakCount = 0;
   5149         int32_t previousBreak = -1;
   5150         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   5151             // For now, just make sure that the break iterator doesn't hang.
   5152             TEST_ASSERT(previousBreak < bi->current());
   5153             previousBreak = bi->current();
   5154         }
   5155         TEST_ASSERT(breakCount > 0);
   5156     }
   5157     delete bi;
   5158     utext_close(&utext);
   5159 }
   5160 
   5161 
   5162 //
   5163 //  TestDebug    -  A place-holder test for debugging purposes.
   5164 //                  For putting in fragments of other tests that can be invoked
   5165 //                  for tracing  without a lot of unwanted extra stuff happening.
   5166 //
   5167 void RBBITest::TestDebug(void) {
   5168 #if 0
   5169     UErrorCode   status = U_ZERO_ERROR;
   5170     int pos = 0;
   5171     int ruleStatus = 0;
   5172 
   5173     RuleBasedBreakIterator* bi =
   5174        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   5175        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   5176        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   5177     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   5178     // UnicodeString s("Aaa.  Bcd");
   5179     s = s.unescape();
   5180     bi->setText(s);
   5181     UBool r = bi->isBoundary(8);
   5182     printf("%s", r?"true":"false");
   5183     return;
   5184     pos = bi->last();
   5185     do {
   5186         // ruleStatus = bi->getRuleStatus();
   5187         printf("%d\t%d\n", pos, ruleStatus);
   5188         pos = bi->previous();
   5189     } while (pos != BreakIterator::DONE);
   5190 #endif
   5191 }
   5192 
   5193 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   5194