Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include <typeinfo>  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     27 #include "unicode/ustring.h"
     28 #include "unicode/utext.h"
     29 #include "intltest.h"
     30 #include "rbbitst.h"
     31 #include <string.h>
     32 #include "uvector.h"
     33 #include "uvectr32.h"
     34 #include "triedict.h"
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include <stdlib.h>
     38 
     39 #define TEST_ASSERT(x) {if (!(x)) { \
     40     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     41 
     42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     43     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     44 
     45 
     46 //---------------------------------------------
     47 // runIndexedTest
     48 //---------------------------------------------
     49 
     50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     51 {
     52     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     53 
     54     switch (index) {
     55 #if !UCONFIG_NO_FILE_IO
     56         case 0: name = "TestBug4153072";
     57             if(exec) TestBug4153072();                         break;
     58 #else
     59         case 0: name = "skip";
     60             break;
     61 #endif
     62 
     63         case 1: name = "TestJapaneseLineBreak";
     64             if(exec) TestJapaneseLineBreak();                  break;
     65         case 2: name = "TestStatusReturn";
     66             if(exec) TestStatusReturn();                       break;
     67 
     68 #if !UCONFIG_NO_FILE_IO
     69         case 3: name = "TestUnicodeFiles";
     70             if(exec) TestUnicodeFiles();                       break;
     71         case 4: name = "TestEmptyString";
     72             if(exec) TestEmptyString();                        break;
     73 #else
     74         case 3: case 4: name = "skip";
     75             break;
     76 #endif
     77 
     78         case 5: name = "TestGetAvailableLocales";
     79             if(exec) TestGetAvailableLocales();                break;
     80 
     81         case 6: name = "TestGetDisplayName";
     82             if(exec) TestGetDisplayName();                     break;
     83 
     84 #if !UCONFIG_NO_FILE_IO
     85         case 7: name = "TestEndBehaviour";
     86             if(exec) TestEndBehaviour();                       break;
     87         case 8: name = "TestMixedThaiLineBreak";
     88              if(exec) TestMixedThaiLineBreak();                break;
     89         case 9: name = "TestThaiLineBreak";
     90              if(exec) TestThaiLineBreak();                     break;
     91         case 10: name = "TestMaiyamok";
     92              if(exec) TestMaiyamok();                          break;
     93         case 11: name = "TestWordBreaks";
     94              if(exec) TestWordBreaks();                        break;
     95         case 12: name = "TestWordBoundary";
     96              if(exec) TestWordBoundary();                      break;
     97         case 13: name = "TestLineBreaks";
     98              if(exec) TestLineBreaks();                        break;
     99         case 14: name = "TestSentBreaks";
    100              if(exec) TestSentBreaks();                        break;
    101         case 15: name = "TestExtended";
    102              if(exec) TestExtended();                          break;
    103 #else
    104         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    105              break;
    106 #endif
    107 
    108         case 16:
    109              if(exec) {
    110  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    111                name = "TestMonkey";
    112                TestMonkey(params);
    113  #else
    114                name = "skip";
    115  #endif
    116              }
    117                                                                break;
    118 
    119 #if !UCONFIG_NO_FILE_IO
    120         case 17: name = "TestBug3818";
    121             if(exec) TestBug3818();                            break;
    122         case 18: name = "TestJapaneseWordBreak";
    123             if(exec) TestJapaneseWordBreak();                  break;
    124 #else
    125         case 17: case 18: name = "skip";
    126             break;
    127 #endif
    128 
    129         case 19: name = "TestDebug";
    130             if(exec) TestDebug();                              break;
    131         case 20: name = "TestTrieDict";
    132             if(exec) TestTrieDict();                           break;
    133 
    134 #if !UCONFIG_NO_FILE_IO
    135         case 21: name = "TestBug5775";
    136             if (exec) TestBug5775();                           break;
    137         case 22: name = "TestThaiBreaks";
    138             if (exec) TestThaiBreaks();                        break;
    139         case 23: name = "TestTailoredBreaks";
    140             if (exec) TestTailoredBreaks();                    break;
    141 #else
    142         case 21: case 22: case 23: name = "skip";
    143             break;
    144 #endif
    145         case 24: name = "TestDictRules";
    146             if (exec) TestDictRules();                         break;
    147         case 25: name = "TestBug5532";
    148             if (exec) TestBug5532();                           break;
    149         default: name = ""; break; //needed to end loop
    150     }
    151 }
    152 
    153 
    154 //---------------------------------------------------------------------------
    155 //
    156 //   class BITestData   Holds a set of Break iterator test data and results
    157 //                      Includes
    158 //                         - the string data to be broken
    159 //                         - a vector of the expected break positions.
    160 //                         - a vector of source line numbers for the data,
    161 //                               (to help see where errors occured.)
    162 //                         - The expected break tag values.
    163 //                         - Vectors of actual break positions and tag values.
    164 //                         - Functions for comparing actual with expected and
    165 //                            reporting errors.
    166 //
    167 //----------------------------------------------------------------------------
    168 class BITestData {
    169 public:
    170     UnicodeString    fDataToBreak;
    171     UVector          fExpectedBreakPositions;
    172     UVector          fExpectedTags;
    173     UVector          fLineNum;
    174     UVector          fActualBreakPositions;   // Test Results.
    175     UVector          fActualTags;
    176 
    177     BITestData(UErrorCode &status);
    178     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    179     void             checkResults(const char *heading, RBBITest *test);
    180     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    181     void             clearResults();
    182 };
    183 
    184 //
    185 // Constructor.
    186 //
    187 BITestData::BITestData(UErrorCode &status)
    188 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    189   fActualTags(status)
    190 {
    191 }
    192 
    193 //
    194 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    195 //                 The macro form collects the line number, which is helpful
    196 //                 when tracking down failures.
    197 //
    198 //                 A null data item is inserted at the start of each test's data
    199 //                  to put the starting zero into the data list.  The position saved for
    200 //                  each non-null item is its ending position.
    201 //
    202 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    203 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    204     if (U_FAILURE(status)) {return;}
    205     if (data != NULL) {
    206         fDataToBreak.append(CharsToUnicodeString(data));
    207     }
    208     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    209     fExpectedTags.addElement(tag, status);
    210     fLineNum.addElement(lineNum, status);
    211 }
    212 
    213 
    214 //
    215 //  checkResults.   Compare the actual and expected break positions, report any differences.
    216 //
    217 void BITestData::checkResults(const char *heading, RBBITest *test) {
    218     int32_t   expectedIndex = 0;
    219     int32_t   actualIndex = 0;
    220 
    221     for (;;) {
    222         // If we've run through both the expected and actual results vectors, we're done.
    223         //   break out of the loop.
    224         if (expectedIndex >= fExpectedBreakPositions.size() &&
    225             actualIndex   >= fActualBreakPositions.size()) {
    226             break;
    227         }
    228 
    229 
    230         if (expectedIndex >= fExpectedBreakPositions.size()) {
    231             err(heading, test, expectedIndex-1, actualIndex);
    232             actualIndex++;
    233             continue;
    234         }
    235 
    236         if (actualIndex >= fActualBreakPositions.size()) {
    237             err(heading, test, expectedIndex, actualIndex-1);
    238             expectedIndex++;
    239             continue;
    240         }
    241 
    242         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    243             err(heading, test, expectedIndex, actualIndex);
    244             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    245             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    246                 actualIndex++;
    247             } else {
    248                 expectedIndex++;
    249             }
    250             continue;
    251         }
    252 
    253         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    254             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    255                 heading, fLineNum.elementAt(expectedIndex),
    256                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    257         }
    258 
    259         actualIndex++;
    260         expectedIndex++;
    261     }
    262 }
    263 
    264 //
    265 //  err   -  An error was found.  Report it, along with information about where the
    266 //                                incorrectly broken test data appeared in the source file.
    267 //
    268 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    269 {
    270     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    271     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    272     int32_t   o        = 0;
    273     int32_t   line     = fLineNum.elementAti(expectedIdx);
    274     if (expectedIdx > 0) {
    275         // The line numbers are off by one because a premature break occurs somewhere
    276         //    within the previous item, rather than at the start of the current (expected) item.
    277         //    We want to report the offset of the unexpected break from the start of
    278         //      this previous item.
    279         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    280     }
    281     if (actual < expected) {
    282         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    283     } else {
    284         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    285     }
    286 }
    287 
    288 
    289 void BITestData::clearResults() {
    290     fActualBreakPositions.removeAllElements();
    291     fActualTags.removeAllElements();
    292 }
    293 
    294 
//-----------------------------------------------------------------------------------
//
//    Canned Test Characters
//
//    cannedTestArray is a zero-terminated list of code units.  The RBBITest
//    constructor copies it, preceded by one extra U+0000, into the heap
//    allocated string cannedTestChars, and the destructor deletes that string.
//
//-----------------------------------------------------------------------------------

static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// Owned by the RBBITest object: allocated in its constructor, deleted in its destructor.
static UnicodeString* cannedTestChars = 0;
    313 
// Devanagari text fragments, written as backslash-u escapes inside ordinary
// char strings (presumably expanded by an unescaping step such as
// CharsToUnicodeString before use - confirm at the point of use).
//   half*  : consonant + virama (U+094D) + zero width joiner (U+200D)
//   deadTA : consonant TA (U+0924) + virama (U+094D)
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
    319 
    320 //--------------------------------------------------------------------------------------
    321 //
    322 //    RBBITest    constructor and destructor
    323 //
    324 //--------------------------------------------------------------------------------------
    325 
    326 RBBITest::RBBITest() {
    327     UnicodeString temp(cannedTestArray);
    328     cannedTestChars = new UnicodeString();
    329     *cannedTestChars += (UChar)0x0000;
    330     *cannedTestChars += temp;
    331 }
    332 
    333 
    334 RBBITest::~RBBITest() {
    335     delete cannedTestChars;
    336 }
    337 
    338 
// Named tag values, one per category of text.
// NOTE(review): presumably these mirror the rule status values reported by the
//               word break iterator (TestJapaneseWordBreak expects the tags
//               400 and 300) - confirm against the break rules.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;
    343 
    344 
    345 
    346 
    347 
    348 
//--------------------------------------------------------------------
//Testing the BreakIterator for devanagari script
//
//  Text fragments for the Devanagari tests, written as backslash-u escapes
//  inside ordinary char strings.  A "dead" form is a consonant followed by
//  the virama, U+094D.
//--------------------------------------------------------------------

#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"
#define deadPA   "\\u092a\\u094d"
#define deadSA   "\\u0938\\u094d"
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    359 
    360 
    361 
    362 
    363 
    364 
    365 //-----------------------------------------------------------------------------------
    366 //
    367 //   Test for status {tag} return value from break rules.
    368 //        TODO:  a more thorough test.
    369 //
    370 //-----------------------------------------------------------------------------------
    371 void RBBITest::TestStatusReturn() {
    372      UnicodeString rulesString1("$Letters = [:L:];\n"
    373                                   "$Numbers = [:N:];\n"
    374                                   "$Letters+{1};\n"
    375                                   "$Numbers+{2};\n"
    376                                   "Help\\ {4}/me\\!;\n"
    377                                   "[^$Letters $Numbers];\n"
    378                                   "!.*;\n", -1, US_INV);
    379      UnicodeString testString1  = "abc123..abc Help me Help me!";
    380                                 // 01234567890123456789012345678
    381      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    382      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    383 
    384      UErrorCode status=U_ZERO_ERROR;
    385      UParseError    parseError;
    386 
    387      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    388      if(U_FAILURE(status)) {
    389          dataerrln("FAIL : in construction - %s", u_errorName(status));
    390      } else {
    391          int32_t  pos;
    392          int32_t  i = 0;
    393          bi->setText(testString1);
    394          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    395              if (pos != bounds1[i]) {
    396                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    397                  break;
    398              }
    399 
    400              int tag = bi->getRuleStatus();
    401              if (tag != brkStatus[i]) {
    402                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    403                  break;
    404              }
    405              i++;
    406          }
    407      }
    408      delete bi;
    409 }
    410 
    411 
    412 static void printStringBreaks(UnicodeString ustr, int expected[],
    413                               int expectedcount)
    414 {
    415     UErrorCode status = U_ZERO_ERROR;
    416     char name[100];
    417     printf("code    alpha extend alphanum type word sent line name\n");
    418     int j;
    419     for (j = 0; j < ustr.length(); j ++) {
    420         if (expectedcount > 0) {
    421             int k;
    422             for (k = 0; k < expectedcount; k ++) {
    423                 if (j == expected[k]) {
    424                     printf("------------------------------------------------ %d\n",
    425                            j);
    426                 }
    427             }
    428         }
    429         UChar32 c = ustr.char32At(j);
    430         if (c > 0xffff) {
    431             j ++;
    432         }
    433         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    434         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    435                            u_isUAlphabetic(c),
    436                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    437                            u_isalnum(c),
    438                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    439                                                   u_charType(c),
    440                                                   U_SHORT_PROPERTY_NAME),
    441                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    442                                                   u_getIntPropertyValue(c,
    443                                                           UCHAR_WORD_BREAK),
    444                                                   U_SHORT_PROPERTY_NAME),
    445                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    446                                    u_getIntPropertyValue(c,
    447                                            UCHAR_SENTENCE_BREAK),
    448                                    U_SHORT_PROPERTY_NAME),
    449                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    450                                    u_getIntPropertyValue(c,
    451                                            UCHAR_LINE_BREAK),
    452                                    U_SHORT_PROPERTY_NAME),
    453                            name);
    454     }
    455 }
    456 
    457 void RBBITest::TestThaiLineBreak() {
    458     UErrorCode status = U_ZERO_ERROR;
    459     BITestData thaiLineSelection(status);
    460 
    461     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    462     // represents elided letters at the end of a long word.  It should be bound to
    463     // the end of the word and not treated as an independent punctuation mark.
    464 
    465 
    466     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    467     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    468     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    469     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    470     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    471 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    472 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    473     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    474     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    475     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    476     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    477     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    478     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    479     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    480     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    481 
    482     // the one time where the paiyannoi occurs somewhere other than at the end
    483     // of a word is in the Thai abbrevation for "etc.", which both begins and
    484     // ends with a paiyannoi
    485     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    486     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    487     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    488 
    489     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    490         Locale("th"), status);
    491     if (U_FAILURE(status))
    492     {
    493         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    494         return;
    495     }
    496 
    497     generalIteratorTest(*e, thaiLineSelection);
    498     delete e;
    499 }
    500 
    501 
    502 
    503 void RBBITest::TestMixedThaiLineBreak()
    504 {
    505     UErrorCode   status = U_ZERO_ERROR;
    506     BITestData   thaiLineSelection(status);
    507 
    508     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    509 
    510 
    511     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    512     // start
    513 
    514     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    515     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    516     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    517     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    518     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    519     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    520     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    521     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    522     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    523     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    524     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    525     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    526     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    527     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    528     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    529     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    530 
    531     // @suwit - end of changes
    532 
    533 
    534     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    535     if (U_FAILURE(status))
    536     {
    537         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    538         return;
    539     }
    540 
    541 
    542     generalIteratorTest(*e, thaiLineSelection);
    543     delete e;
    544 }
    545 
    546 
    547 void RBBITest::TestMaiyamok()
    548 {
    549     UErrorCode status = U_ZERO_ERROR;
    550     BITestData   thaiLineSelection(status);
    551     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    552     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    553     // word".  Instead of appearing as a word unto itself, however, it's kept together
    554     // with the word before it
    555     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    556     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    557     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    558     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    559     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    560     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    561     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    562     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    563     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    564 
    565     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    566         Locale("th"), status);
    567 
    568     if (U_FAILURE(status))
    569     {
    570         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    571         return;
    572     }
    573     generalIteratorTest(*e, thaiLineSelection);
    574     delete e;
    575 }
    576 
    577 
    578 
    579 void RBBITest::TestBug3818() {
    580     UErrorCode  status = U_ZERO_ERROR;
    581 
    582     // Four Thai words...
    583     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    584                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    585     UnicodeString  thaiStr(thaiWordData);
    586 
    587     RuleBasedBreakIterator* bi =
    588         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    589     if (U_FAILURE(status) || bi == NULL) {
    590         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    591         return;
    592     }
    593     bi->setText(thaiStr);
    594 
    595     int32_t  startOfSecondWord = bi->following(1);
    596     if (startOfSecondWord != 4) {
    597         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    598             __FILE__, __LINE__, startOfSecondWord);
    599     }
    600     startOfSecondWord = bi->following(0);
    601     if (startOfSecondWord != 4) {
    602         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    603             __FILE__, __LINE__, startOfSecondWord);
    604     }
    605     delete bi;
    606 }
    607 
    608 
    609 void RBBITest::TestJapaneseWordBreak() {
    610     UErrorCode status = U_ZERO_ERROR;
    611     BITestData   japaneseWordSelection(status);
    612 
    613     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    614     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    615     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    616     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    617     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    618     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    619     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
    620 
    621     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
    622         Locale("ja"), status);
    623     if (U_FAILURE(status))
    624     {
    625         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
    626         return;
    627     }
    628 
    629     generalIteratorTest(*e, japaneseWordSelection);
    630     delete e;
    631 }
    632 
    633 void RBBITest::TestTrieDict() {
    634     UErrorCode      status  = U_ZERO_ERROR;
    635 
    636     //
    637     //  Open and read the test data file.
    638     //
    639     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    640     char testFileName[1000];
    641     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    642         errln("Can't open test data.  Path too long.");
    643         return;
    644     }
    645     strcpy(testFileName, testDataDirectory);
    646     strcat(testFileName, "riwords.txt");
    647 
    648     // Items needing deleting at the end
    649     MutableTrieDictionary *mutableDict = NULL;
    650     CompactTrieDictionary *compactDict = NULL;
    651     UnicodeSet            *breaks      = NULL;
    652     UChar                 *testFile    = NULL;
    653     StringEnumeration     *enumer1     = NULL;
    654     StringEnumeration     *enumer2     = NULL;
    655     MutableTrieDictionary *mutable2    = NULL;
    656     StringEnumeration     *cloneEnum   = NULL;
    657     CompactTrieDictionary *compact2    = NULL;
    658 
    659 
    660     const UnicodeString *originalWord = NULL;
    661     const UnicodeString *cloneWord    = NULL;
    662     UChar *current;
    663     UChar *word;
    664     UChar uc;
    665     int32_t wordLen;
    666     int32_t wordCount;
    667     int32_t testCount;
    668 
    669     int    len;
    670     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    671     if (U_FAILURE(status)) {
    672         goto cleanup; /* something went wrong, error already output */
    673     }
    674 
    675     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    676     if (U_FAILURE(status)) {
    677         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    678         goto cleanup;
    679     }
    680 
    681     breaks = new UnicodeSet;
    682     breaks->add(0x000A);     // Line Feed
    683     breaks->add(0x000D);     // Carriage Return
    684     breaks->add(0x2028);     // Line Separator
    685     breaks->add(0x2029);     // Paragraph Separator
    686 
    687     // Now add each non-comment line of the file as a word.
    688     current = testFile;
    689     word = current;
    690     uc = *current++;
    691     wordLen = 0;
    692     wordCount = 0;
    693 
    694     while (uc) {
    695         if (uc == 0x0023) {     // #comment line, skip
    696             while (uc && !breaks->contains(uc)) {
    697                 uc = *current++;
    698             }
    699         }
    700         else while (uc && !breaks->contains(uc)) {
    701             ++wordLen;
    702             uc = *current++;
    703         }
    704         if (wordLen > 0) {
    705             mutableDict->addWord(word, wordLen, status);
    706             if (U_FAILURE(status)) {
    707                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    708                 goto cleanup;
    709             }
    710             wordCount += 1;
    711         }
    712 
    713         // Find beginning of next line
    714         while (uc && breaks->contains(uc)) {
    715             uc = *current++;
    716         }
    717         word = current-1;
    718         wordLen = 0;
    719     }
    720 
    721     if (wordCount < 50) {
    722         errln("Word count (%d) unreasonably small\n", wordCount);
    723         goto cleanup;
    724     }
    725 
    726     enumer1 = mutableDict->openWords(status);
    727     if (U_FAILURE(status)) {
    728         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    729         goto cleanup;
    730     }
    731 
    732     testCount = 0;
    733     if (wordCount != (testCount = enumer1->count(status))) {
    734         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    735             testCount, wordCount, u_errorName(status));
    736         goto cleanup;
    737     }
    738 
    739     // Now compact it
    740     compactDict = new CompactTrieDictionary(*mutableDict, status);
    741     if (U_FAILURE(status)) {
    742         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    743         goto cleanup;
    744     }
    745 
    746     enumer2 = compactDict->openWords(status);
    747     if (U_FAILURE(status)) {
    748         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    749         goto cleanup;
    750     }
    751 
    752     if (wordCount != (testCount = enumer2->count(status))) {
    753         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    754             testCount, wordCount, u_errorName(status));
    755         goto cleanup;
    756     }
    757 
    758     if (typeid(*enumer1) == typeid(*enumer2)) {
    759         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
    760     }
    761     delete enumer1;
    762     enumer1 = NULL;
    763     delete enumer2;
    764     enumer2 = NULL;
    765 
    766     // Now un-compact it
    767     mutable2 = compactDict->cloneMutable(status);
    768     if (U_FAILURE(status)) {
    769         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    770         goto cleanup;
    771     }
    772 
    773     cloneEnum = mutable2->openWords(status);
    774     if (U_FAILURE(status)) {
    775         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    776         goto cleanup;
    777     }
    778 
    779     if (wordCount != (testCount = cloneEnum->count(status))) {
    780         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    781             testCount, wordCount, u_errorName(status));
    782         goto cleanup;
    783     }
    784 
    785     // Compact original dictionary to clone. Note that we can only compare the same kind of
    786     // dictionary as the order of the enumerators is not guaranteed to be the same between
    787     // different kinds
    788     enumer1 = mutableDict->openWords(status);
    789     if (U_FAILURE(status)) {
    790         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    791         goto cleanup;
    792      }
    793 
    794     originalWord = enumer1->snext(status);
    795     cloneWord = cloneEnum->snext(status);
    796     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    797         if (*originalWord != *cloneWord) {
    798             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    799             goto cleanup;
    800         }
    801         originalWord = enumer1->snext(status);
    802         cloneWord = cloneEnum->snext(status);
    803     }
    804 
    805     if (U_FAILURE(status)) {
    806         errln("Enumeration failed: %s\n", u_errorName(status));
    807         goto cleanup;
    808     }
    809 
    810     if (originalWord != cloneWord) {
    811         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    812         goto cleanup;
    813     }
    814 
    815     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    816     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    817     if (U_FAILURE(status)) {
    818         errln("CompactTrieDictionary(const void *,...) failed\n");
    819         goto cleanup;
    820     }
    821 
    822     if (compact2->dataSize() == 0) {
    823         errln("CompactTrieDictionary->dataSize() == 0\n");
    824         goto cleanup;
    825     }
    826 
    827     // Now count the words via the second dictionary
    828     delete enumer1;
    829     enumer1 = compact2->openWords(status);
    830     if (U_FAILURE(status)) {
    831         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    832         goto cleanup;
    833     }
    834 
    835     if (wordCount != (testCount = enumer1->count(status))) {
    836         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    837             testCount, wordCount, u_errorName(status));
    838         goto cleanup;
    839     }
    840 
    841 cleanup:
    842     delete compactDict;
    843     delete mutableDict;
    844     delete breaks;
    845     delete[] testFile;
    846     delete enumer1;
    847     delete mutable2;
    848     delete cloneEnum;
    849     delete compact2;
    850 }
    851 
    852 
    853 //----------------------------------------------------------------------------
    854 //
    855 // generalIteratorTest      Given a break iterator and a set of test data,
    856 //                          Run the tests and report the results.
    857 //
    858 //----------------------------------------------------------------------------
    859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    860 {
    861 
    862     bi.setText(td.fDataToBreak);
    863 
    864     testFirstAndNext(bi, td);
    865 
    866     testLastAndPrevious(bi, td);
    867 
    868     testFollowing(bi, td);
    869     testPreceding(bi, td);
    870     testIsBoundary(bi, td);
    871     doMultipleSelectionTest(bi, td);
    872 }
    873 
    874 
    875 //
    876 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    877 //                       kind of loop.
    878 //
    879 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    880 {
    881     UErrorCode  status = U_ZERO_ERROR;
    882     int32_t     p;
    883     int32_t     lastP = -1;
    884     int32_t     tag;
    885 
    886     logln("Test first and next");
    887     bi.setText(td.fDataToBreak);
    888     td.clearResults();
    889 
    890     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    891         td.fActualBreakPositions.addElement(p, status);  // Save result.
    892         tag = bi.getRuleStatus();
    893         td.fActualTags.addElement(tag, status);
    894         if (p <= lastP) {
    895             // If the iterator is not making forward progress, stop.
    896             //  No need to raise an error here, it'll be detected in the normal check of results.
    897             break;
    898         }
    899         lastP = p;
    900     }
    901     td.checkResults("testFirstAndNext", this);
    902 }
    903 
    904 
    905 //
    906 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    907 //
    908 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    909 {
    910     UErrorCode  status = U_ZERO_ERROR;
    911     int32_t     p;
    912     int32_t     lastP  = 0x7ffffffe;
    913     int32_t     tag;
    914 
    915     logln("Test last and previous");
    916     bi.setText(td.fDataToBreak);
    917     td.clearResults();
    918 
    919     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    920         // Save break position.  Insert it at start of vector of results, shoving
    921         //    already-saved results further towards the end.
    922         td.fActualBreakPositions.insertElementAt(p, 0, status);
    923         // bi.previous();   // TODO:  Why does this fix things up????
    924         // bi.next();
    925         tag = bi.getRuleStatus();
    926         td.fActualTags.insertElementAt(tag, 0, status);
    927         if (p >= lastP) {
    928             // If the iterator is not making progress, stop.
    929             //  No need to raise an error here, it'll be detected in the normal check of results.
    930             break;
    931         }
    932         lastP = p;
    933     }
    934     td.checkResults("testLastAndPrevious", this);
    935 }
    936 
    937 
    938 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    939 {
    940     UErrorCode  status = U_ZERO_ERROR;
    941     int32_t     p;
    942     int32_t     tag;
    943     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    944                                  //   cannot be -1; that is returned for DONE.
    945     int         i;
    946 
    947     logln("testFollowing():");
    948     bi.setText(td.fDataToBreak);
    949     td.clearResults();
    950 
    951     // Save the starting point, since we won't get that out of following.
    952     p = bi.first();
    953     td.fActualBreakPositions.addElement(p, status);  // Save result.
    954     tag = bi.getRuleStatus();
    955     td.fActualTags.addElement(tag, status);
    956 
    957     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    958         p = bi.following(i);
    959         if (p != lastP) {
    960             if (p == RuleBasedBreakIterator::DONE) {
    961                 break;
    962             }
    963             // We've reached a new break position.  Save it.
    964             td.fActualBreakPositions.addElement(p, status);  // Save result.
    965             tag = bi.getRuleStatus();
    966             td.fActualTags.addElement(tag, status);
    967             lastP = p;
    968         }
    969     }
    970     // The loop normally exits by means of the break in the middle.
    971     // Make sure that the index was at the correct position for the break iterator to have
    972     //   returned DONE.
    973     if (i != td.fDataToBreak.length()) {
    974         errln("testFollowing():  iterator returned DONE prematurely.");
    975     }
    976 
    977     // Full check of all results.
    978     td.checkResults("testFollowing", this);
    979 }
    980 
    981 
    982 
    983 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    984     UErrorCode  status = U_ZERO_ERROR;
    985     int32_t     p;
    986     int32_t     tag;
    987     int32_t     lastP  = 0x7ffffffe;
    988     int         i;
    989 
    990     logln("testPreceding():");
    991     bi.setText(td.fDataToBreak);
    992     td.clearResults();
    993 
    994     p = bi.last();
    995     td.fActualBreakPositions.addElement(p, status);
    996     tag = bi.getRuleStatus();
    997     td.fActualTags.addElement(tag, status);
    998 
    999     for (i = td.fDataToBreak.length(); i>=-1; i--) {
   1000         p = bi.preceding(i);
   1001         if (p != lastP) {
   1002             if (p == RuleBasedBreakIterator::DONE) {
   1003                 break;
   1004             }
   1005             // We've reached a new break position.  Save it.
   1006             td.fActualBreakPositions.insertElementAt(p, 0, status);
   1007             lastP = p;
   1008             tag = bi.getRuleStatus();
   1009             td.fActualTags.insertElementAt(tag, 0, status);
   1010         }
   1011     }
   1012     // The loop normally exits by means of the break in the middle.
   1013     // Make sure that the index was at the correct position for the break iterator to have
   1014     //   returned DONE.
   1015     if (i != 0) {
   1016         errln("testPreceding():  iterator returned DONE prematurely.");
   1017     }
   1018 
   1019     // Full check of all results.
   1020     td.checkResults("testPreceding", this);
   1021 }
   1022 
   1023 
   1024 
   1025 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1026     UErrorCode  status = U_ZERO_ERROR;
   1027     int         i;
   1028     int32_t     tag;
   1029 
   1030     logln("testIsBoundary():");
   1031     bi.setText(td.fDataToBreak);
   1032     td.clearResults();
   1033 
   1034     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1035         if (bi.isBoundary(i)) {
   1036             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1037             tag = bi.getRuleStatus();
   1038             td.fActualTags.addElement(tag, status);
   1039         }
   1040     }
   1041     td.checkResults("testIsBoundary: ", this);
   1042 }
   1043 
   1044 
   1045 
   1046 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1047 {
   1048     iterator.setText(td.fDataToBreak);
   1049 
   1050     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1051     int32_t offset = iterator.first();
   1052     int32_t testOffset;
   1053     int32_t count = 0;
   1054 
   1055     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1056 
   1057     if (*testIterator != iterator)
   1058         errln("clone() or operator!= failed: two clones compared unequal");
   1059 
   1060     do {
   1061         testOffset = testIterator->first();
   1062         testOffset = testIterator->next(count);
   1063         if (offset != testOffset)
   1064             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1065 
   1066         if (offset != RuleBasedBreakIterator::DONE) {
   1067             count++;
   1068             offset = iterator.next();
   1069 
   1070             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1071                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1072                 if (count > 10000 || offset == -1) {
   1073                     errln("operator== failed too many times. Stopping test.");
   1074                     if (offset == -1) {
   1075                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1076                     }
   1077                     return;
   1078                 }
   1079             }
   1080         }
   1081     } while (offset != RuleBasedBreakIterator::DONE);
   1082 
   1083     // now do it backwards...
   1084     offset = iterator.last();
   1085     count = 0;
   1086 
   1087     do {
   1088         testOffset = testIterator->last();
   1089         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1090         if (offset != testOffset)
   1091             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1092 
   1093         if (offset != RuleBasedBreakIterator::DONE) {
   1094             count--;
   1095             offset = iterator.previous();
   1096         }
   1097     } while (offset != RuleBasedBreakIterator::DONE);
   1098 
   1099     delete testIterator;
   1100 }
   1101 
   1102 
   1103 //---------------------------------------------
   1104 //
   1105 //     other tests
   1106 //
   1107 //---------------------------------------------
   1108 void RBBITest::TestEmptyString()
   1109 {
   1110     UnicodeString text = "";
   1111     UErrorCode status = U_ZERO_ERROR;
   1112 
   1113     BITestData x(status);
   1114     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1115     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1116     if (U_FAILURE(status))
   1117     {
   1118         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1119         return;
   1120     }
   1121     generalIteratorTest(*bi, x);
   1122     delete bi;
   1123 }
   1124 
   1125 void RBBITest::TestGetAvailableLocales()
   1126 {
   1127     int32_t locCount = 0;
   1128     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1129 
   1130     if (locCount == 0)
   1131         dataerrln("getAvailableLocales() returned an empty list!");
   1132     // Just make sure that it's returning good memory.
   1133     int32_t i;
   1134     for (i = 0; i < locCount; ++i) {
   1135         logln(locList[i].getName());
   1136     }
   1137 }
   1138 
   1139 //Testing the BreakIterator::getDisplayName() function
   1140 void RBBITest::TestGetDisplayName()
   1141 {
   1142     UnicodeString   result;
   1143 
   1144     BreakIterator::getDisplayName(Locale::getUS(), result);
   1145     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1146         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1147                 + result);
   1148 
   1149     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1150     if (result != "French (France)")
   1151         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1152                 + result);
   1153 }
   1154 /**
   1155  * Test End Behaviour
   1156  * @bug 4068137
   1157  */
   1158 void RBBITest::TestEndBehaviour()
   1159 {
   1160     UErrorCode status = U_ZERO_ERROR;
   1161     UnicodeString testString("boo.");
   1162     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1163     if (U_FAILURE(status))
   1164     {
   1165         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1166         return;
   1167     }
   1168     wb->setText(testString);
   1169 
   1170     if (wb->first() != 0)
   1171         errln("Didn't get break at beginning of string.");
   1172     if (wb->next() != 3)
   1173         errln("Didn't get break before period in \"boo.\"");
   1174     if (wb->current() != 4 && wb->next() != 4)
   1175         errln("Didn't get break at end of string.");
   1176     delete wb;
   1177 }
   1178 /*
   1179  * @bug 4153072
   1180  */
   1181 void RBBITest::TestBug4153072() {
   1182     UErrorCode status = U_ZERO_ERROR;
   1183     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1184     if (U_FAILURE(status))
   1185     {
   1186         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1187         return;
   1188     }
   1189     UnicodeString str("...Hello, World!...");
   1190     int32_t begin = 3;
   1191     int32_t end = str.length() - 3;
   1192     UBool onBoundary;
   1193 
   1194     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1195     iter->adoptText(textIterator);
   1196     int index;
   1197     // Note: with the switch to UText, there is no way to restrict the
   1198     //       iteration range to begin at an index other than zero.
   1199     //       String character iterators created with a non-zero bound are
   1200     //         treated by RBBI as being empty.
   1201     for (index = -1; index < begin + 1; ++index) {
   1202         onBoundary = iter->isBoundary(index);
   1203         if (index == 0?  !onBoundary : onBoundary) {
   1204             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1205                             " and begin index = " + begin);
   1206         }
   1207     }
   1208     delete iter;
   1209 }
   1210 
   1211 
   1212 //
   1213 // Test for problem reported by Ashok Matoria on 9 July 2007
   1214 //    One.<kSoftHyphen><kSpace>Two.
   1215 //
   1216 //    Sentence break at start (0) and then on calling next() it breaks at
   1217 //   'T' of "Two". Now, at this point if I do next() and
   1218 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1219 //
   1220 void RBBITest::TestBug5775() {
   1221     UErrorCode status = U_ZERO_ERROR;
   1222     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1223     TEST_ASSERT_SUCCESS(status);
   1224     if (U_FAILURE(status)) {
   1225         return;
   1226     }
   1227 // Check for status first for better handling of no data errors.
   1228     TEST_ASSERT(bi != NULL);
   1229     if (bi == NULL) {
   1230         return;
   1231     }
   1232 
   1233     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1234     //               01234      56789
   1235     s = s.unescape();
   1236     bi->setText(s);
   1237     int pos = bi->next();
   1238     TEST_ASSERT(pos == 6);
   1239     pos = bi->next();
   1240     TEST_ASSERT(pos == 10);
   1241     pos = bi->previous();
   1242     TEST_ASSERT(pos == 6);
   1243     delete bi;
   1244 }
   1245 
   1246 
   1247 
   1248 /**
   1249  * Test Japanese Line Break
   1250  * @bug 4095322
   1251  */
   1252 void RBBITest::TestJapaneseLineBreak()
   1253 {
   1254 #if 0
   1255     // Test needs updating some more...   Dump it for now.
   1256 
   1257 
   1258     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
   1259     //        as opening and closing punctuation for line breaking.
   1260     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
   1261     //        from these tests.    6-13-2002
   1262     //
   1263     UErrorCode status = U_ZERO_ERROR;
   1264     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
   1265     UnicodeString precedingChars = CharsToUnicodeString(
   1266         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
   1267         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
   1268     UnicodeString followingChars = CharsToUnicodeString(
   1269         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
   1270         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
   1271         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
   1272         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
   1273         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
   1274     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
   1275 
   1276     int32_t i;
   1277     if (U_FAILURE(status))
   1278     {
   1279         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
   1280         return;
   1281     }
   1282 
   1283     for (i = 0; i < precedingChars.length(); i++) {
   1284         testString.setCharAt(1, precedingChars[i]);
   1285         iter->setText(testString);
   1286         int32_t j = iter->first();
   1287         if (j != 0)
   1288             errln("ja line break failure: failed to start at 0");
   1289         j = iter->next();
   1290         if (j != 1)
   1291             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
   1292                         + "' (" + ((int)(precedingChars[i])) + ")");
   1293         j = iter->next();
   1294         if (j != 3)
   1295             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
   1296                         + "' (" + ((int)(precedingChars[i])) + ")");
   1297     }
   1298 
   1299     for (i = 0; i < followingChars.length(); i++) {
   1300         testString.setCharAt(1, followingChars[i]);
   1301         iter->setText(testString);
   1302         int j = iter->first();
   1303         if (j != 0)
   1304             errln("ja line break failure: failed to start at 0");
   1305         j = iter->next();
   1306         if (j != 2)
   1307             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
   1308                         + "' (" + ((int)(followingChars[i])) + ")");
   1309         j = iter->next();
   1310         if (j != 3)
   1311             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
   1312                         + "' (" + ((int)(followingChars[i])) + ")");
   1313     }
   1314     delete iter;
   1315 #endif
   1316 }
   1317 
   1318 
   1319 //------------------------------------------------------------------------------
   1320 //
   1321 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1322 //
   1323 //------------------------------------------------------------------------------
   1324 
   1325 struct TestParams {
   1326     BreakIterator   *bi;
   1327     UnicodeString    dataToBreak;
   1328     UVector32       *expectedBreaks;
   1329     UVector32       *srcLine;
   1330     UVector32       *srcCol;
   1331 };
   1332 
   1333 void RBBITest::executeTest(TestParams *t) {
   1334     int32_t    bp;
   1335     int32_t    prevBP;
   1336     int32_t    i;
   1337 
   1338     if (t->bi == NULL) {
   1339         return;
   1340     }
   1341 
   1342     t->bi->setText(t->dataToBreak);
   1343     //
   1344     //  Run the iterator forward
   1345     //
   1346     prevBP = -1;
   1347     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1348         if (prevBP ==  bp) {
   1349             // Fail for lack of forward progress.
   1350             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1351                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1352             break;
   1353         }
   1354 
   1355         // Check that there were we didn't miss an expected break between the last one
   1356         //  and this one.
   1357         for (i=prevBP+1; i<bp; i++) {
   1358             if (t->expectedBreaks->elementAti(i) != 0) {
   1359                 int expected[] = {0, i};
   1360                 printStringBreaks(t->dataToBreak, expected, 2);
   1361                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1362                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1363             }
   1364         }
   1365 
   1366         // Check that the break we did find was expected
   1367         if (t->expectedBreaks->elementAti(bp) == 0) {
   1368             int expected[] = {0, bp};
   1369             printStringBreaks(t->dataToBreak, expected, 2);
   1370             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1371                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1372         } else {
   1373             // The break was expected.
   1374             //   Check that the {nnn} tag value is correct.
   1375             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1376             if (expectedTagVal == -1) {
   1377                 expectedTagVal = 0;
   1378             }
   1379             int32_t line = t->srcLine->elementAti(bp);
   1380             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1381             if (rs != expectedTagVal) {
   1382                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1383                       "          Actual, Expected status = %4d, %4d",
   1384                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1385             }
   1386         }
   1387 
   1388 
   1389         prevBP = bp;
   1390     }
   1391 
   1392     // Verify that there were no missed expected breaks after the last one found
   1393     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1394         if (t->expectedBreaks->elementAti(i) != 0) {
   1395             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1396                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1397         }
   1398     }
   1399 
   1400     //
   1401     //  Run the iterator backwards, verify that the same breaks are found.
   1402     //
   1403     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1404     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1405         if (prevBP ==  bp) {
   1406             // Fail for lack of progress.
   1407             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1408                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1409             break;
   1410         }
   1411 
   1412         // Check that there were we didn't miss an expected break between the last one
   1413         //  and this one.  (UVector returns zeros for index out of bounds.)
   1414         for (i=prevBP-1; i>bp; i--) {
   1415             if (t->expectedBreaks->elementAti(i) != 0) {
   1416                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1417                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1418             }
   1419         }
   1420 
   1421         // Check that the break we did find was expected
   1422         if (t->expectedBreaks->elementAti(bp) == 0) {
   1423             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1424                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1425         } else {
   1426             // The break was expected.
   1427             //   Check that the {nnn} tag value is correct.
   1428             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1429             if (expectedTagVal == -1) {
   1430                 expectedTagVal = 0;
   1431             }
   1432             int line = t->srcLine->elementAti(bp);
   1433             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1434             if (rs != expectedTagVal) {
   1435                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1436                       "          Actual, Expected status = %4d, %4d",
   1437                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1438             }
   1439         }
   1440 
   1441         prevBP = bp;
   1442     }
   1443 
   1444     // Verify that there were no missed breaks prior to the last one found
   1445     for (i=prevBP-1; i>=0; i--) {
   1446         if (t->expectedBreaks->elementAti(i) != 0) {
   1447             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1448                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1449         }
   1450     }
   1451 }
   1452 
   1453 
   1454 void RBBITest::TestExtended() {
   1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1456     UErrorCode      status  = U_ZERO_ERROR;
   1457     Locale          locale("");
   1458 
   1459     UnicodeString       rules;
   1460     TestParams          tp;
   1461     tp.bi             = NULL;
   1462     tp.expectedBreaks = new UVector32(status);
   1463     tp.srcLine        = new UVector32(status);
   1464     tp.srcCol         = new UVector32(status);
   1465 
   1466     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1467     if (U_FAILURE(status)) {
   1468         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1469     }
   1470 
   1471 
   1472     //
   1473     //  Open and read the test data file.
   1474     //
   1475     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1476     char testFileName[1000];
   1477     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1478         errln("Can't open test data.  Path too long.");
   1479         return;
   1480     }
   1481     strcpy(testFileName, testDataDirectory);
   1482     strcat(testFileName, "rbbitst.txt");
   1483 
   1484     int    len;
   1485     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1486     if (U_FAILURE(status)) {
   1487         return; /* something went wrong, error already output */
   1488     }
   1489 
   1490 
   1491 
   1492 
   1493     //
   1494     //  Put the test data into a UnicodeString
   1495     //
   1496     UnicodeString testString(FALSE, testFile, len);
   1497 
   1498     enum EParseState{
   1499         PARSE_COMMENT,
   1500         PARSE_TAG,
   1501         PARSE_DATA,
   1502         PARSE_NUM
   1503     }
   1504     parseState = PARSE_TAG;
   1505 
   1506     EParseState savedState = PARSE_TAG;
   1507 
   1508     static const UChar CH_LF        = 0x0a;
   1509     static const UChar CH_CR        = 0x0d;
   1510     static const UChar CH_HASH      = 0x23;
   1511     /*static const UChar CH_PERIOD    = 0x2e;*/
   1512     static const UChar CH_LT        = 0x3c;
   1513     static const UChar CH_GT        = 0x3e;
   1514     static const UChar CH_BACKSLASH = 0x5c;
   1515     static const UChar CH_BULLET    = 0x2022;
   1516 
   1517     int32_t    lineNum  = 1;
   1518     int32_t    colStart = 0;
   1519     int32_t    column   = 0;
   1520     int32_t    charIdx  = 0;
   1521 
   1522     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1523 
   1524     for (charIdx = 0; charIdx < len; ) {
   1525         status = U_ZERO_ERROR;
   1526         UChar  c = testString.charAt(charIdx);
   1527         charIdx++;
   1528         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1529             // treat CRLF as a unit
   1530             c = CH_LF;
   1531             charIdx++;
   1532         }
   1533         if (c == CH_LF || c == CH_CR) {
   1534             lineNum++;
   1535             colStart = charIdx;
   1536         }
   1537         column = charIdx - colStart + 1;
   1538 
   1539         switch (parseState) {
   1540         case PARSE_COMMENT:
   1541             if (c == 0x0a || c == 0x0d) {
   1542                 parseState = savedState;
   1543             }
   1544             break;
   1545 
   1546         case PARSE_TAG:
   1547             {
   1548             if (c == CH_HASH) {
   1549                 parseState = PARSE_COMMENT;
   1550                 savedState = PARSE_TAG;
   1551                 break;
   1552             }
   1553             if (u_isUWhiteSpace(c)) {
   1554                 break;
   1555             }
   1556             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1557                 delete tp.bi;
   1558                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1559                 charIdx += 5;
   1560                 break;
   1561             }
   1562             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1563                 delete tp.bi;
   1564                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1565                 charIdx += 5;
   1566                 break;
   1567             }
   1568             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1569                 delete tp.bi;
   1570                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1571                 charIdx += 5;
   1572                 break;
   1573             }
   1574             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1575                 delete tp.bi;
   1576                 tp.bi = NULL;
   1577                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1578                 charIdx += 5;
   1579                 break;
   1580             }
   1581             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1582                 delete tp.bi;
   1583                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1584                 charIdx += 6;
   1585                 break;
   1586             }
   1587 
   1588             // <locale  loc_name>
   1589             localeMatcher.reset(testString);
   1590             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1591                 UnicodeString localeName = localeMatcher.group(1, status);
   1592                 char localeName8[100];
   1593                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1594                 locale = Locale::createFromName(localeName8);
   1595                 charIdx += localeMatcher.group(0, status).length();
   1596                 TEST_ASSERT_SUCCESS(status);
   1597                 break;
   1598             }
   1599             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1600                 parseState = PARSE_DATA;
   1601                 charIdx += 5;
   1602                 tp.dataToBreak = "";
   1603                 tp.expectedBreaks->removeAllElements();
   1604                 tp.srcCol ->removeAllElements();
   1605                 tp.srcLine->removeAllElements();
   1606                 break;
   1607             }
   1608 
   1609             errln("line %d: Tag expected in test file.", lineNum);
   1610             parseState = PARSE_COMMENT;
   1611             savedState = PARSE_DATA;
   1612             goto end_test; // Stop the test.
   1613             }
   1614             break;
   1615 
   1616         case PARSE_DATA:
   1617             if (c == CH_BULLET) {
   1618                 int32_t  breakIdx = tp.dataToBreak.length();
   1619                 tp.expectedBreaks->setSize(breakIdx+1);
   1620                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1621                 tp.srcLine->setSize(breakIdx+1);
   1622                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1623                 tp.srcCol ->setSize(breakIdx+1);
   1624                 tp.srcCol ->setElementAt(column, breakIdx);
   1625                 break;
   1626             }
   1627 
   1628             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1629                 // Add final entry to mappings from break location to source file position.
   1630                 //  Need one extra because last break position returned is after the
   1631                 //    last char in the data, not at the last char.
   1632                 tp.srcLine->addElement(lineNum, status);
   1633                 tp.srcCol ->addElement(column, status);
   1634 
   1635                 parseState = PARSE_TAG;
   1636                 charIdx += 6;
   1637 
   1638                 // RUN THE TEST!
   1639                 executeTest(&tp);
   1640                 break;
   1641             }
   1642 
   1643             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1644                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1645                 // Get the code point from the name and insert it into the test data.
   1646                 //   (Damn, no API takes names in Unicode  !!!
   1647                 //    we've got to take it back to char *)
   1648                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1649                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1650                 char charNameBuf[200];
   1651                 UChar32 theChar = -1;
   1652                 if (nameEndIdx != -1) {
   1653                     UErrorCode status = U_ZERO_ERROR;
   1654                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1655                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1656                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1657                     if (U_FAILURE(status)) {
   1658                         theChar = -1;
   1659                     }
   1660                 }
   1661                 if (theChar == -1) {
   1662                     errln("Error in named character in test file at line %d, col %d",
   1663                         lineNum, column);
   1664                 } else {
   1665                     // Named code point was recognized.  Insert it
   1666                     //   into the test data.
   1667                     tp.dataToBreak.append(theChar);
   1668                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1669                         tp.srcLine->addElement(lineNum, status);
   1670                         tp.srcCol ->addElement(column, status);
   1671                     }
   1672                 }
   1673                 if (nameEndIdx > charIdx) {
   1674                     charIdx = nameEndIdx+1;
   1675 
   1676                 }
   1677                 break;
   1678             }
   1679 
   1680 
   1681 
   1682 
   1683             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1684                 charIdx++;
   1685                 int32_t  breakIdx = tp.dataToBreak.length();
   1686                 tp.expectedBreaks->setSize(breakIdx+1);
   1687                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1688                 tp.srcLine->setSize(breakIdx+1);
   1689                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1690                 tp.srcCol ->setSize(breakIdx+1);
   1691                 tp.srcCol ->setElementAt(column, breakIdx);
   1692                 break;
   1693             }
   1694 
   1695             if (c == CH_LT) {
   1696                 tagValue   = 0;
   1697                 parseState = PARSE_NUM;
   1698                 break;
   1699             }
   1700 
   1701             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1702                 parseState = PARSE_COMMENT;
   1703                 savedState = PARSE_DATA;
   1704                 break;
   1705             }
   1706 
   1707             if (c == CH_BACKSLASH) {
   1708                 // Check for \ at end of line, a line continuation.
   1709                 //     Advance over (discard) the newline
   1710                 UChar32 cp = testString.char32At(charIdx);
   1711                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1712                     // We have a CR LF
   1713                     //  Need an extra increment of the input ptr to move over both of them
   1714                     charIdx++;
   1715                 }
   1716                 if (cp == CH_LF || cp == CH_CR) {
   1717                     lineNum++;
   1718                     colStart = charIdx;
   1719                     charIdx++;
   1720                     break;
   1721                 }
   1722 
   1723                 // Let unescape handle the back slash.
   1724                 cp = testString.unescapeAt(charIdx);
   1725                 if (cp != -1) {
   1726                     // Escape sequence was recognized.  Insert the char
   1727                     //   into the test data.
   1728                     tp.dataToBreak.append(cp);
   1729                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1730                         tp.srcLine->addElement(lineNum, status);
   1731                         tp.srcCol ->addElement(column, status);
   1732                     }
   1733                     break;
   1734                 }
   1735 
   1736 
   1737                 // Not a recognized backslash escape sequence.
   1738                 // Take the next char as a literal.
   1739                 //  TODO:  Should this be an error?
   1740                 c = testString.charAt(charIdx);
   1741                 charIdx = testString.moveIndex32(charIdx, 1);
   1742             }
   1743 
   1744             // Normal, non-escaped data char.
   1745             tp.dataToBreak.append(c);
   1746 
   1747             // Save the mapping from offset in the data to line/column numbers in
   1748             //   the original input file.  Will be used for better error messages only.
   1749             //   If there's an expected break before this char, the slot in the mapping
   1750             //     vector will already be set for this char; don't overwrite it.
   1751             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1752                 tp.srcLine->addElement(lineNum, status);
   1753                 tp.srcCol ->addElement(column, status);
   1754             }
   1755             break;
   1756 
   1757 
   1758         case PARSE_NUM:
   1759             // We are parsing an expected numeric tag value, like <1234>,
   1760             //   within a chunk of data.
   1761             if (u_isUWhiteSpace(c)) {
   1762                 break;
   1763             }
   1764 
   1765             if (c == CH_GT) {
   1766                 // Finished the number.  Add the info to the expected break data,
   1767                 //   and switch parse state back to doing plain data.
   1768                 parseState = PARSE_DATA;
   1769                 if (tagValue == 0) {
   1770                     tagValue = -1;
   1771                 }
   1772                 int32_t  breakIdx = tp.dataToBreak.length();
   1773                 tp.expectedBreaks->setSize(breakIdx+1);
   1774                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1775                 tp.srcLine->setSize(breakIdx+1);
   1776                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1777                 tp.srcCol ->setSize(breakIdx+1);
   1778                 tp.srcCol ->setElementAt(column, breakIdx);
   1779                 break;
   1780             }
   1781 
   1782             if (u_isdigit(c)) {
   1783                 tagValue = tagValue*10 + u_charDigitValue(c);
   1784                 break;
   1785             }
   1786 
   1787             errln("Syntax Error in test file at line %d, col %d",
   1788                 lineNum, column);
   1789             parseState = PARSE_COMMENT;
   1790             goto end_test; // Stop the test
   1791             break;
   1792         }
   1793 
   1794 
   1795         if (U_FAILURE(status)) {
   1796             errln("ICU Error %s while parsing test file at line %d.",
   1797                 u_errorName(status), lineNum);
   1798             status = U_ZERO_ERROR;
   1799             goto end_test; // Stop the test
   1800         }
   1801 
   1802     }
   1803 
   1804 end_test:
   1805     delete tp.bi;
   1806     delete tp.expectedBreaks;
   1807     delete tp.srcLine;
   1808     delete tp.srcCol;
   1809     delete [] testFile;
   1810 #endif
   1811 }
   1812 
   1813 void RBBITest::TestThaiBreaks() {
   1814     UErrorCode status=U_ZERO_ERROR;
   1815     BreakIterator* b;
   1816     Locale locale = Locale("th");
   1817     int32_t p, index;
   1818     UChar c[]= {
   1819             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
   1820             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
   1821             0x0E16, 0x0E49, 0x0E33, 0x0000
   1822     };
   1823     int32_t expectedWordResult[] = {
   1824             2, 3, 6, 10, 11, 15, 17, 20, 22
   1825     };
   1826     int32_t expectedLineResult[] = {
   1827             3, 6, 11, 15, 17, 20, 22
   1828     };
   1829 
   1830     int32_t size = u_strlen(c);
   1831     UnicodeString text=UnicodeString(c);
   1832 
   1833     b = BreakIterator::createWordInstance(locale, status);
   1834     if (U_FAILURE(status)) {
   1835         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
   1836         return;
   1837     }
   1838     b->setText(text);
   1839     p = index = 0;
   1840     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1841         if (p != expectedWordResult[index++]) {
   1842             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
   1843         }
   1844     }
   1845     delete b;
   1846 
   1847     b = BreakIterator::createLineInstance(locale, status);
   1848     if (U_FAILURE(status)) {
   1849         printf("Unable to create thai line break iterator.\n");
   1850         return;
   1851     }
   1852     b->setText(text);
   1853     p = index = 0;
   1854     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1855         if (p != expectedLineResult[index++]) {
   1856             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
   1857         }
   1858     }
   1859 
   1860     delete b;
   1861 }
   1862 
   1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
   1864 // Words don't include colon or period (cldrbug #1969).
   1865 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
   1866 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
   1867 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
   1868 
   1869 // UBreakIteratorType UBRK_WORD, Locale "ja"
   1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   1871 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   1872                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
   1873 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   1874 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   1875 
   1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   1877 // Add break after Greek question mark (cldrbug #2069).
   1878 static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
   1879                                         "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
   1880 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
   1881 static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
   1882 
   1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
   1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
   1885 static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
   1886                                         "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
   1887                                         "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
   1888 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
   1889                                           12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
   1890                                           29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
   1891 static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
   1892                                           12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
   1893                                           29,     32, 33, 35, 37, 38,     40, 41 };
   1894 
   1895 typedef struct {
   1896     UBreakIteratorType  type;
   1897     const char *        locale;
   1898     const char *        escapedText;
   1899     const int32_t *     tailoredOffsets;
   1900     int32_t             tailoredOffsetsCount;
   1901     const int32_t *     rootOffsets;
   1902     int32_t             rootOffsetsCount;
   1903 } TailoredBreakItem;
   1904 
   1905 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
   1906 
   1907 static const TailoredBreakItem tbItems[] = {
   1908     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
   1909     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
   1910     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
   1911     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
   1912     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
   1913 };
   1914 
   1915 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   1916     while (count-- > 0) {
   1917         int writeCount;
   1918         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   1919         buffer += writeCount;
   1920         buflen -= writeCount;
   1921     }
   1922 }
   1923 
   1924 enum { kMaxOffsetCount = 128 };
   1925 
   1926 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   1927     brkitr->setText( CharsToUnicodeString(escapedText) );
   1928     int32_t foundOffsets[kMaxOffsetCount];
   1929     int32_t offset, foundOffsetsCount = 0;
   1930     // do forwards iteration test
   1931     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   1932         foundOffsets[foundOffsetsCount++] = offset;
   1933     }
   1934     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   1935         // log error for forwards test
   1936         char formatExpect[512], formatFound[512];
   1937         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1938         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   1939         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   1940                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   1941     } else {
   1942         // do backwards iteration test
   1943         --foundOffsetsCount; // back off one from the end offset
   1944         while ( foundOffsetsCount > 0 ) {
   1945             offset = brkitr->previous();
   1946             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   1947                 // log error for backwards test
   1948                 char formatExpect[512];
   1949                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1950                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   1951                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   1952                 break;
   1953             }
   1954         }
   1955     }
   1956 }
   1957 
   1958 void RBBITest::TestTailoredBreaks() {
   1959     const TailoredBreakItem * tbItemPtr;
   1960     Locale rootLocale = Locale("root");
   1961     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   1962         Locale testLocale = Locale(tbItemPtr->locale);
   1963         BreakIterator * tailoredBrkiter = NULL;
   1964         BreakIterator * rootBrkiter = NULL;
   1965         UErrorCode status = U_ZERO_ERROR;
   1966         switch (tbItemPtr->type) {
   1967             case UBRK_CHARACTER:
   1968                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   1969                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   1970                 break;
   1971             case UBRK_WORD:
   1972                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   1973                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   1974                 break;
   1975             case UBRK_LINE:
   1976                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   1977                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   1978                 break;
   1979             case UBRK_SENTENCE:
   1980                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   1981                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   1982                 break;
   1983             default:
   1984                 status = U_UNSUPPORTED_ERROR;
   1985                 break;
   1986         }
   1987         if (U_FAILURE(status)) {
   1988             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   1989             continue;
   1990         }
   1991         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   1992         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   1993 
   1994         delete rootBrkiter;
   1995         delete tailoredBrkiter;
   1996     }
   1997 }
   1998 
   1999 
   2000 //-------------------------------------------------------------------------------
   2001 //
   2002 //  TestDictRules   create a break iterator from source rules that includes a
   2003 //                  dictionary range.   Regression for bug #7130.  Source rules
   2004 //                  do not declare a break iterator type (word, line, sentence, etc.
   2005 //                  but the dictionary code, without a type, would loop.
   2006 //
   2007 //-------------------------------------------------------------------------------
   2008 void RBBITest::TestDictRules() {
   2009     const char *rules =  "$dictionary = [a-z]; \n"
   2010                          "!!forward; \n"
   2011                          "$dictionary $dictionary; \n"
   2012                          "!!reverse; \n"
   2013                          "$dictionary $dictionary; \n";
   2014     const char *text = "aa";
   2015     UErrorCode status = U_ZERO_ERROR;
   2016     UParseError parseError;
   2017 
   2018     RuleBasedBreakIterator bi(rules, parseError, status);
   2019     if (U_SUCCESS(status)) {
   2020         UnicodeString utext = text;
   2021         bi.setText(utext);
   2022         int32_t position;
   2023         int32_t loops;
   2024         for (loops = 0; loops<10; loops++) {
   2025             position = bi.next();
   2026             if (position == RuleBasedBreakIterator::DONE) {
   2027                 break;
   2028             }
   2029         }
   2030         TEST_ASSERT(loops == 1);
   2031     } else {
   2032         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   2033     }
   2034 }
   2035 
   2036 
   2037 
   2038 //-------------------------------------------------------------------------------
   2039 //
   2040 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   2041 //    return the data in one big UChar * buffer, which the caller must delete.
   2042 //
   2043 //    parameters:
   2044 //          fileName:   the path of the file to read.  It is handed to fopen() as given,
   2045 //                      so the caller supplies any directory part.
   2046 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   2047 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   2048 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   2049 //                      Pass NULL for the system default encoding.
   2050 //          status
   2051 //    returns:
   2052 //                      The file data, converted to UChar.
   2053 //                      The caller must delete this when done with
   2054 //                           delete [] theBuffer;
   2055 //
   2056 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   2057 //           Move this function to some common place.
   2058 //
   2059 //--------------------------------------------------------------------------------
   2060 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2061     UChar       *retPtr  = NULL;
   2062     char        *fileBuf = NULL;
   2063     UConverter* conv     = NULL;
   2064     FILE        *f       = NULL;
   2065 
   2066     ulen = 0;
   2067     if (U_FAILURE(status)) {
   2068         return retPtr;
   2069     }
   2070 
   2071     //
   2072     //  Open the file.
   2073     //
   2074     f = fopen(fileName, "rb");
   2075     if (f == 0) {
   2076         dataerrln("Error opening test data file %s\n", fileName);
   2077         status = U_FILE_ACCESS_ERROR;
   2078         return NULL;
   2079     }
   2080     //
   2081     //  Read it in
   2082     //
   2083     int   fileSize;
   2084     int   amt_read;
   2085 
   2086     fseek( f, 0, SEEK_END);
   2087     fileSize = ftell(f);
   2088     fileBuf = new char[fileSize];
   2089     fseek(f, 0, SEEK_SET);
   2090     amt_read = fread(fileBuf, 1, fileSize, f);
   2091     if (amt_read != fileSize || fileSize <= 0) {
   2092         errln("Error reading test data file.");
   2093         goto cleanUpAndReturn;
   2094     }
   2095 
   2096     //
   2097     // Look for a Unicode Signature (BOM) on the data just read
   2098     //
   2099     int32_t        signatureLength;
   2100     const char *   fileBufC;
   2101     const char*    bomEncoding;
   2102 
   2103     fileBufC = fileBuf;
   2104     bomEncoding = ucnv_detectUnicodeSignature(
   2105         fileBuf, fileSize, &signatureLength, &status);
   2106     if(bomEncoding!=NULL ){
   2107         fileBufC  += signatureLength;
   2108         fileSize  -= signatureLength;
   2109         encoding = bomEncoding;
   2110     }
   2111 
   2112     //
   2113     // Open a converter to take the rule file to UTF-16
   2114     //
   2115     conv = ucnv_open(encoding, &status);
   2116     if (U_FAILURE(status)) {
   2117         goto cleanUpAndReturn;
   2118     }
   2119 
   2120     //
   2121     // Convert the rules to UChar.
   2122     //  Preflight first to determine required buffer size.
   2123     //
   2124     ulen = ucnv_toUChars(conv,
   2125         NULL,           //  dest,
   2126         0,              //  destCapacity,
   2127         fileBufC,
   2128         fileSize,
   2129         &status);
   2130     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2131         // Buffer Overflow is expected from the preflight operation.
   2132         status = U_ZERO_ERROR;
   2133 
   2134         retPtr = new UChar[ulen+1];
   2135         ucnv_toUChars(conv,
   2136             retPtr,       //  dest,
   2137             ulen+1,
   2138             fileBufC,
   2139             fileSize,
   2140             &status);
   2141     }
   2142 
   2143 cleanUpAndReturn:
   2144     fclose(f);
   2145     delete []fileBuf;
   2146     ucnv_close(conv);
   2147     if (U_FAILURE(status)) {
   2148         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2149         delete retPtr;
   2150         retPtr = 0;
   2151         ulen   = 0;
   2152     };
   2153     return retPtr;
   2154 }
   2155 
   2156 
   2157 
   2158 //--------------------------------------------------------------------------------------------
   2159 //
   2160 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2161 //
   2162 //-------------------------------------------------------------------------------------------
   2163 void RBBITest::TestUnicodeFiles() {
   2164     RuleBasedBreakIterator  *bi;
   2165     UErrorCode               status = U_ZERO_ERROR;
   2166 
   2167     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2168     TEST_ASSERT_SUCCESS(status);
   2169     if (U_SUCCESS(status)) {
   2170         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2171     }
   2172     delete bi;
   2173 
   2174     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   2175     TEST_ASSERT_SUCCESS(status);
   2176     if (U_SUCCESS(status)) {
   2177         runUnicodeTestData("WordBreakTest.txt", bi);
   2178     }
   2179     delete bi;
   2180 
   2181     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   2182     TEST_ASSERT_SUCCESS(status);
   2183     if (U_SUCCESS(status)) {
   2184         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2185     }
   2186     delete bi;
   2187 
   2188     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   2189     TEST_ASSERT_SUCCESS(status);
   2190     if (U_SUCCESS(status)) {
   2191         runUnicodeTestData("LineBreakTest.txt", bi);
   2192     }
   2193     delete bi;
   2194 }
   2195 
   2196 
   2197 //--------------------------------------------------------------------------------------------
   2198 //
   2199 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2200 //
   2201 //-------------------------------------------------------------------------------------------
   2202 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2203 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2204 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
   2205   UVersionInfo icu4601 = { 4, 6, 0, 1 };
   2206 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601);
   2207 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   2208     UErrorCode  status = U_ZERO_ERROR;
   2209 
   2210     //
   2211     //  Open and read the test data file, put it into a UnicodeString.
   2212     //
   2213     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2214     char testFileName[1000];
   2215     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2216         dataerrln("Can't open test data.  Path too long.");
   2217         return;
   2218     }
   2219     strcpy(testFileName, testDataDirectory);
   2220     strcat(testFileName, fileName);
   2221 
   2222     logln("Opening data file %s\n", fileName);
   2223 
   2224     int    len;
   2225     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2226     if (status != U_FILE_ACCESS_ERROR) {
   2227         TEST_ASSERT_SUCCESS(status);
   2228         TEST_ASSERT(testFile != NULL);
   2229     }
   2230     if (U_FAILURE(status) || testFile == NULL) {
   2231         return; /* something went wrong, error already output */
   2232     }
   2233     UnicodeString testFileAsString(TRUE, testFile, len);
   2234 
   2235     //
   2236     //  Parse the test data file using a regular expression.
   2237     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2238     //     is identified by which group had a match.
   2239     //
   2240     //    Caputure Group #                  1          2            3            4           5
   2241     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2242     //
   2243     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2244     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2245     UnicodeString   testString;
   2246     UVector32       breakPositions(status);
   2247     int             lineNumber = 1;
   2248     TEST_ASSERT_SUCCESS(status);
   2249     if (U_FAILURE(status)) {
   2250         return;
   2251     }
   2252 
   2253     //
   2254     //  Scan through each test case, building up the string to be broken in testString,
   2255     //   and the positions that should be boundaries in the breakPositions vector.
   2256     //
   2257     int spin = 0;
   2258     while (tokenMatcher.find()) {
   2259       	if(tokenMatcher.hitEnd()) {
   2260           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   2261              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   2262              and caused an infinite loop here on EBCDIC systems!
   2263           */
   2264           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   2265           //	   return;
   2266       	}
   2267         if (tokenMatcher.start(1, status) >= 0) {
   2268             // Scanned a divide sign, indicating a break position in the test data.
   2269             if (testString.length()>0) {
   2270                 breakPositions.addElement(testString.length(), status);
   2271             }
   2272         }
   2273         else if (tokenMatcher.start(2, status) >= 0) {
   2274             // Scanned an 'x', meaning no break at this position in the test data
   2275             //   Nothing to be done here.
   2276             }
   2277         else if (tokenMatcher.start(3, status) >= 0) {
   2278             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2279             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2280             int length = hexNumber.length();
   2281             if (length<=8) {
   2282                 char buf[10];
   2283                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2284                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2285                 if (c<=0x10ffff) {
   2286                     testString.append(c);
   2287                 } else {
   2288                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2289                        fileName, lineNumber);
   2290                 }
   2291             } else {
   2292                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2293                        fileName, lineNumber);
   2294              }
   2295         }
   2296         else if (tokenMatcher.start(4, status) >= 0) {
   2297             // Scanned to end of a line, possibly skipping over a comment in the process.
   2298             //   If the line from the file contained test data, run the test now.
   2299             //
   2300             if (testString.length() > 0) {
   2301 // TODO(andy): Remove this time bomb code.
   2302 if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) {
   2303                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2304 }
   2305             }
   2306 
   2307             // Clear out this test case.
   2308             //    The string and breakPositions vector will be refilled as the next
   2309             //       test case is parsed.
   2310             testString.remove();
   2311             breakPositions.removeAllElements();
   2312             lineNumber++;
   2313         } else {
   2314             // Scanner catchall.  Something unrecognized appeared on the line.
   2315             char token[16];
   2316             UnicodeString uToken = tokenMatcher.group(0, status);
   2317             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2318             token[sizeof(token)-1] = 0;
   2319             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2320 
   2321             // Clean up, in preparation for continuing with the next line.
   2322             testString.remove();
   2323             breakPositions.removeAllElements();
   2324             lineNumber++;
   2325         }
   2326         TEST_ASSERT_SUCCESS(status);
   2327         if (U_FAILURE(status)) {
   2328             break;
   2329         }
   2330     }
   2331 
   2332     delete [] testFile;
   2333  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2334 }
   2335 
   2336 //--------------------------------------------------------------------------------------------
   2337 //
   2338 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2339 //                            test data files.  Do only a simple, forward-only check -
   2340 //                            this test is mostly to check that ICU and the Unicode
   2341 //                            data agree with each other.
   2342 //
   2343 //--------------------------------------------------------------------------------------------
   2344 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2345                          const UnicodeString &testString,   // Text data to be broken
   2346                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2347                          RuleBasedBreakIterator *bi) {
   2348     int32_t pos;                 // Break Position in the test string
   2349     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2350     int32_t expectedPos;         // Expected break position (index into test string)
   2351 
   2352     bi->setText(testString);
   2353     pos = bi->first();
   2354     pos = bi->next();
   2355 
   2356     while (pos != BreakIterator::DONE) {
   2357         if (expectedI >= breakPositions->size()) {
   2358             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2359                 testFileName, lineNumber, pos);
   2360             break;
   2361         }
   2362         expectedPos = breakPositions->elementAti(expectedI);
   2363         if (pos < expectedPos) {
   2364             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2365                 testFileName, lineNumber, pos);
   2366             break;
   2367         }
   2368         if (pos > expectedPos) {
   2369             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2370                 testFileName, lineNumber, expectedPos);
   2371             break;
   2372         }
   2373         pos = bi->next();
   2374         expectedI++;
   2375     }
   2376 
   2377     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2378         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2379             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2380     }
   2381 }
   2382 
   2383 
   2384 
   2385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2386 //---------------------------------------------------------------------------------------
   2387 //
   2388 //   class RBBIMonkeyKind
   2389 //
   2390 //      Monkey Test for Break Iteration
   2391 //      Abstract interface class.   Concrete derived classes independently
   2392 //      implement the break rules for different iterator types.
   2393 //
   2394 //      The Monkey Test itself doesn't know which type of break iterator it is
   2395 //      testing, but works purely in terms of the interface defined here.
   2396 //
   2397 //---------------------------------------------------------------------------------------
   2398 class RBBIMonkeyKind {
   2399 public:
   2400     // Return a UVector of UnicodeSets, representing the character classes used
   2401     //   for this type of iterator.
   2402     virtual  UVector  *charClasses() = 0;
   2403 
   2404     // Set the test text on which subsequent calls to next() will operate
   2405     virtual  void      setText(const UnicodeString &s) = 0;
   2406 
   2407     // Find the next break postion, starting from the prev break position, or from zero.
   2408     // Return -1 after reaching end of string.
   2409     virtual  int32_t   next(int32_t i) = 0;
   2410 
   2411     virtual ~RBBIMonkeyKind();
   2412     UErrorCode       deferredStatus;
   2413 
   2414 
   2415 protected:
   2416     RBBIMonkeyKind();
   2417 
   2418 private:
   2419 };
   2420 
   2421 RBBIMonkeyKind::RBBIMonkeyKind() {
   2422     deferredStatus = U_ZERO_ERROR;
   2423 }
   2424 
   2425 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2426 }
   2427 
   2428 
   2429 //----------------------------------------------------------------------------------------
   2430 //
   2431 //   Random Numbers.  Similar to standard lib rand() and srand()
   2432 //                    Not using library to
   2433 //                      1.  Get same results on all platforms.
   2434 //                      2.  Get access to current seed, to more easily reproduce failures.
   2435 //
   2436 //---------------------------------------------------------------------------------------
   // Current state of the generator.  File-level so that it can be read back
   // or reset to reproduce a particular sequence.
   static uint32_t m_seed = 1;

   // Advance the linear congruential generator by one step and return a value
   // in the range 0..32767, taken from bits 16..30 of the new state.
   static uint32_t m_rand()
   {
       const uint32_t kMultiplier = 1103515245;
       const uint32_t kIncrement  = 12345;

       m_seed = m_seed * kMultiplier + kIncrement;
       return (m_seed >> 16) & 0x7fff;
   }
   2444 
   2445 
   2446 //------------------------------------------------------------------------------------------
   2447 //
   2448 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2449 //                             of RBBIMonkeyKind.
   2450 //
   2451 //------------------------------------------------------------------------------------------
   2452 class RBBICharMonkey: public RBBIMonkeyKind {
   2453 public:
   2454     RBBICharMonkey();
   2455     virtual          ~RBBICharMonkey();
   2456     virtual  UVector *charClasses();
   2457     virtual  void     setText(const UnicodeString &s);
   2458     virtual  int32_t  next(int32_t i);
   2459 private:
   2460     UVector   *fSets;
   2461 
   2462     UnicodeSet  *fCRLFSet;
   2463     UnicodeSet  *fControlSet;
   2464     UnicodeSet  *fExtendSet;
   2465     UnicodeSet  *fPrependSet;
   2466     UnicodeSet  *fSpacingSet;
   2467     UnicodeSet  *fLSet;
   2468     UnicodeSet  *fVSet;
   2469     UnicodeSet  *fTSet;
   2470     UnicodeSet  *fLVSet;
   2471     UnicodeSet  *fLVTSet;
   2472     UnicodeSet  *fHangulSet;
   2473     UnicodeSet  *fAnySet;
   2474 
   2475     const UnicodeString *fText;
   2476 };
   2477 
   2478 
   2479 RBBICharMonkey::RBBICharMonkey() {
   2480     UErrorCode  status = U_ZERO_ERROR;
   2481 
   2482     fText = NULL;
   2483 
   2484     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2485     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2486     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2487     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2488     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2489     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2490     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2491     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2492     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2493     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2494     fHangulSet  = new UnicodeSet();
   2495     fHangulSet->addAll(*fLSet);
   2496     fHangulSet->addAll(*fVSet);
   2497     fHangulSet->addAll(*fTSet);
   2498     fHangulSet->addAll(*fLVSet);
   2499     fHangulSet->addAll(*fLVTSet);
   2500     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2501 
   2502     fSets       = new UVector(status);
   2503     fSets->addElement(fCRLFSet,    status);
   2504     fSets->addElement(fControlSet, status);
   2505     fSets->addElement(fExtendSet,  status);
   2506     fSets->addElement(fPrependSet, status);
   2507     fSets->addElement(fSpacingSet, status);
   2508     fSets->addElement(fHangulSet,  status);
   2509     fSets->addElement(fAnySet,     status);
   2510     if (U_FAILURE(status)) {
   2511         deferredStatus = status;
   2512     }
   2513 }
   2514 
   2515 
   2516 void RBBICharMonkey::setText(const UnicodeString &s) {
   2517     fText = &s;
   2518 }
   2519 
   2520 
   2521 
   2522 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2523     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2524                               //   break position being tested.  The candidate break
   2525                               //   location is before p2.
   2526 
   2527     int     breakPos = -1;
   2528 
   2529     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2530 
   2531     if (U_FAILURE(deferredStatus)) {
   2532         return -1;
   2533     }
   2534 
   2535     // Previous break at end of string.  return DONE.
   2536     if (prevPos >= fText->length()) {
   2537         return -1;
   2538     }
   2539     p0 = p1 = p2 = p3 = prevPos;
   2540     c3 =  fText->char32At(prevPos);
   2541     c0 = c1 = c2 = 0;
   2542 
   2543     // Loop runs once per "significant" character position in the input text.
   2544     for (;;) {
   2545         // Move all of the positions forward in the input string.
   2546         p0 = p1;  c0 = c1;
   2547         p1 = p2;  c1 = c2;
   2548         p2 = p3;  c2 = c3;
   2549 
   2550         // Advancd p3 by one codepoint
   2551         p3 = fText->moveIndex32(p3, 1);
   2552         c3 = fText->char32At(p3);
   2553 
   2554         if (p1 == p2) {
   2555             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2556             continue;
   2557         }
   2558         if (p2 == fText->length()) {
   2559             // Reached end of string.  Always a break position.
   2560             break;
   2561         }
   2562 
   2563         // Rule  GB3   CR x LF
   2564         //     No Extend or Format characters may appear between the CR and LF,
   2565         //     which requires the additional check for p2 immediately following p1.
   2566         //
   2567         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2568             continue;
   2569         }
   2570 
   2571         // Rule (GB4).   ( Control | CR | LF ) <break>
   2572         if (fControlSet->contains(c1) ||
   2573             c1 == 0x0D ||
   2574             c1 == 0x0A)  {
   2575             break;
   2576         }
   2577 
   2578         // Rule (GB5)    <break>  ( Control | CR | LF )
   2579         //
   2580         if (fControlSet->contains(c2) ||
   2581             c2 == 0x0D ||
   2582             c2 == 0x0A)  {
   2583             break;
   2584         }
   2585 
   2586 
   2587         // Rule (GB6)  L x ( L | V | LV | LVT )
   2588         if (fLSet->contains(c1) &&
   2589                (fLSet->contains(c2)  ||
   2590                 fVSet->contains(c2)  ||
   2591                 fLVSet->contains(c2) ||
   2592                 fLVTSet->contains(c2))) {
   2593             continue;
   2594         }
   2595 
   2596         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2597         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2598             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2599             continue;
   2600         }
   2601 
   2602         // Rule (GB8)    ( LVT | T)  x T
   2603         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2604             fTSet->contains(c2))  {
   2605             continue;
   2606         }
   2607 
   2608         // Rule (GB9)    Numeric x ALetter
   2609         if (fExtendSet->contains(c2))  {
   2610             continue;
   2611         }
   2612 
   2613         // Rule (GB9a)   x  SpacingMark
   2614         if (fSpacingSet->contains(c2)) {
   2615             continue;
   2616         }
   2617 
   2618         // Rule (GB9b)   Prepend x
   2619         if (fPrependSet->contains(c1)) {
   2620             continue;
   2621         }
   2622 
   2623         // Rule (GB10)  Any  <break>  Any
   2624         break;
   2625     }
   2626 
   2627     breakPos = p2;
   2628     return breakPos;
   2629 }
   2630 
   2631 
   2632 
   2633 UVector  *RBBICharMonkey::charClasses() {
   2634     return fSets;
   2635 }
   2636 
   2637 
   2638 RBBICharMonkey::~RBBICharMonkey() {
   2639     delete fSets;
   2640     delete fCRLFSet;
   2641     delete fControlSet;
   2642     delete fExtendSet;
   2643     delete fPrependSet;
   2644     delete fSpacingSet;
   2645     delete fLSet;
   2646     delete fVSet;
   2647     delete fTSet;
   2648     delete fLVSet;
   2649     delete fLVTSet;
   2650     delete fHangulSet;
   2651     delete fAnySet;
   2652 }
   2653 
   2654 //------------------------------------------------------------------------------------------
   2655 //
   2656 //   class RBBIWordMonkey      Word Break specific implementation
   2657 //                             of RBBIMonkeyKind.
   2658 //
   2659 //------------------------------------------------------------------------------------------
   2660 class RBBIWordMonkey: public RBBIMonkeyKind {
   2661 public:
   2662     RBBIWordMonkey();
   2663     virtual          ~RBBIWordMonkey();
   2664     virtual  UVector *charClasses();
   2665     virtual  void     setText(const UnicodeString &s);
   2666     virtual int32_t   next(int32_t i);
   2667 private:
   2668     UVector      *fSets;
   2669 
   2670     UnicodeSet  *fCRSet;
   2671     UnicodeSet  *fLFSet;
   2672     UnicodeSet  *fNewlineSet;
   2673     UnicodeSet  *fKatakanaSet;
   2674     UnicodeSet  *fALetterSet;
   2675     UnicodeSet  *fMidNumLetSet;
   2676     UnicodeSet  *fMidLetterSet;
   2677     UnicodeSet  *fMidNumSet;
   2678     UnicodeSet  *fNumericSet;
   2679     UnicodeSet  *fFormatSet;
   2680     UnicodeSet  *fOtherSet;
   2681     UnicodeSet  *fExtendSet;
   2682     UnicodeSet  *fExtendNumLetSet;
   2683 
   2684     RegexMatcher  *fMatcher;
   2685 
   2686     const UnicodeString  *fText;
   2687 };
   2688 
   2689 
   2690 RBBIWordMonkey::RBBIWordMonkey()
   2691 {
   2692     UErrorCode  status = U_ZERO_ERROR;
   2693 
   2694     fSets            = new UVector(status);
   2695 
   2696     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2697     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2698     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2699     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
   2700     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2701     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2702     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2703     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2704     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2705     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2706     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2707     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2708 
   2709     fOtherSet        = new UnicodeSet();
   2710     if(U_FAILURE(status)) {
   2711       deferredStatus = status;
   2712       return;
   2713     }
   2714 
   2715     fOtherSet->complement();
   2716     fOtherSet->removeAll(*fCRSet);
   2717     fOtherSet->removeAll(*fLFSet);
   2718     fOtherSet->removeAll(*fNewlineSet);
   2719     fOtherSet->removeAll(*fKatakanaSet);
   2720     fOtherSet->removeAll(*fALetterSet);
   2721     fOtherSet->removeAll(*fMidLetterSet);
   2722     fOtherSet->removeAll(*fMidNumSet);
   2723     fOtherSet->removeAll(*fNumericSet);
   2724     fOtherSet->removeAll(*fExtendNumLetSet);
   2725     fOtherSet->removeAll(*fFormatSet);
   2726     fOtherSet->removeAll(*fExtendSet);
   2727     // Inhibit dictionary characters from being tested at all.
   2728     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2729 
   2730     fSets->addElement(fCRSet,        status);
   2731     fSets->addElement(fLFSet,        status);
   2732     fSets->addElement(fNewlineSet,   status);
   2733     fSets->addElement(fALetterSet,   status);
   2734     fSets->addElement(fKatakanaSet,  status);
   2735     fSets->addElement(fMidLetterSet, status);
   2736     fSets->addElement(fMidNumLetSet, status);
   2737     fSets->addElement(fMidNumSet,    status);
   2738     fSets->addElement(fNumericSet,   status);
   2739     fSets->addElement(fFormatSet,    status);
   2740     fSets->addElement(fExtendSet,    status);
   2741     fSets->addElement(fOtherSet,     status);
   2742     fSets->addElement(fExtendNumLetSet, status);
   2743 
   2744     if (U_FAILURE(status)) {
   2745         deferredStatus = status;
   2746     }
   2747 }
   2748 
   2749 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2750     fText       = &s;
   2751 }
   2752 
   2753 
   2754 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2755     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2756                               //   break position being tested.  The candidate break
   2757                               //   location is before p2.
   2758 
   2759     int     breakPos = -1;
   2760 
   2761     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2762 
   2763     if (U_FAILURE(deferredStatus)) {
   2764         return -1;
   2765     }
   2766 
   2767     // Prev break at end of string.  return DONE.
   2768     if (prevPos >= fText->length()) {
   2769         return -1;
   2770     }
   2771     p0 = p1 = p2 = p3 = prevPos;
   2772     c3 =  fText->char32At(prevPos);
   2773     c0 = c1 = c2 = 0;
   2774 
   2775     // Loop runs once per "significant" character position in the input text.
   2776     for (;;) {
   2777         // Move all of the positions forward in the input string.
   2778         p0 = p1;  c0 = c1;
   2779         p1 = p2;  c1 = c2;
   2780         p2 = p3;  c2 = c3;
   2781 
   2782         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2783         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2784         do {
   2785             p3 = fText->moveIndex32(p3, 1);
   2786             c3 = fText->char32At(p3);
   2787             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2788                break;
   2789             };
   2790         }
   2791         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2792 
   2793 
   2794         if (p1 == p2) {
   2795             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2796             continue;
   2797         }
   2798         if (p2 == fText->length()) {
   2799             // Reached end of string.  Always a break position.
   2800             break;
   2801         }
   2802 
   2803         // Rule  (3)   CR x LF
   2804         //     No Extend or Format characters may appear between the CR and LF,
   2805         //     which requires the additional check for p2 immediately following p1.
   2806         //
   2807         if (c1==0x0D && c2==0x0A) {
   2808             continue;
   2809         }
   2810 
   2811         // Rule (3a)  Break before and after newlines (including CR and LF)
   2812         //
   2813         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2814             break;
   2815         };
   2816         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2817             break;
   2818         };
   2819 
   2820         // Rule (5).   ALetter x ALetter
   2821         if (fALetterSet->contains(c1) &&
   2822             fALetterSet->contains(c2))  {
   2823             continue;
   2824         }
   2825 
   2826         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2827         //
   2828         if ( fALetterSet->contains(c1)   &&
   2829              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2830              fALetterSet->contains(c3)) {
   2831             continue;
   2832         }
   2833 
   2834 
   2835         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2836         if (fALetterSet->contains(c0) &&
   2837             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2838             fALetterSet->contains(c2)) {
   2839             continue;
   2840         }
   2841 
   2842         // Rule (8)    Numeric x Numeric
   2843         if (fNumericSet->contains(c1) &&
   2844             fNumericSet->contains(c2))  {
   2845             continue;
   2846         }
   2847 
   2848         // Rule (9)    ALetter x Numeric
   2849         if (fALetterSet->contains(c1) &&
   2850             fNumericSet->contains(c2))  {
   2851             continue;
   2852         }
   2853 
   2854         // Rule (10)    Numeric x ALetter
   2855         if (fNumericSet->contains(c1) &&
   2856             fALetterSet->contains(c2))  {
   2857             continue;
   2858         }
   2859 
   2860         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2861         if (fNumericSet->contains(c0) &&
   2862             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2863             fNumericSet->contains(c2)) {
   2864             continue;
   2865         }
   2866 
   2867         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2868         if (fNumericSet->contains(c1) &&
   2869             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2870             fNumericSet->contains(c3)) {
   2871             continue;
   2872         }
   2873 
   2874         // Rule (13)  Katakana x Katakana
   2875         if (fKatakanaSet->contains(c1) &&
   2876             fKatakanaSet->contains(c2))  {
   2877             continue;
   2878         }
   2879 
   2880         // Rule 13a
   2881         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2882              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2883              fExtendNumLetSet->contains(c2)) {
   2884                 continue;
   2885              }
   2886 
   2887         // Rule 13b
   2888         if (fExtendNumLetSet->contains(c1) &&
   2889                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2890                 fKatakanaSet->contains(c2)))  {
   2891                 continue;
   2892              }
   2893 
   2894         // Rule 14.  Break found here.
   2895         break;
   2896     }
   2897 
   2898     breakPos = p2;
   2899     return breakPos;
   2900 }
   2901 
   2902 
   2903 UVector  *RBBIWordMonkey::charClasses() {
   2904     return fSets;
   2905 }
   2906 
   2907 
   2908 RBBIWordMonkey::~RBBIWordMonkey() {
   2909     delete fSets;
   2910     delete fCRSet;
   2911     delete fLFSet;
   2912     delete fNewlineSet;
   2913     delete fKatakanaSet;
   2914     delete fALetterSet;
   2915     delete fMidNumLetSet;
   2916     delete fMidLetterSet;
   2917     delete fMidNumSet;
   2918     delete fNumericSet;
   2919     delete fFormatSet;
   2920     delete fExtendSet;
   2921     delete fExtendNumLetSet;
   2922     delete fOtherSet;
   2923 }
   2924 
   2925 
   2926 
   2927 
   2928 //------------------------------------------------------------------------------------------
   2929 //
   2930 //   class RBBISentMonkey      Sentence Break specific implementation
   2931 //                             of RBBIMonkeyKind.
   2932 //
   2933 //------------------------------------------------------------------------------------------
   2934 class RBBISentMonkey: public RBBIMonkeyKind {
   2935 public:
   2936     RBBISentMonkey();
   2937     virtual          ~RBBISentMonkey();
   2938     virtual  UVector *charClasses();
   2939     virtual  void     setText(const UnicodeString &s);
   2940     virtual int32_t   next(int32_t i);
   2941 private:
   2942     int               moveBack(int posFrom);
   2943     int               moveForward(int posFrom);
   2944     UChar32           cAt(int pos);
   2945 
   2946     UVector      *fSets;
   2947 
   2948     UnicodeSet  *fSepSet;
   2949     UnicodeSet  *fFormatSet;
   2950     UnicodeSet  *fSpSet;
   2951     UnicodeSet  *fLowerSet;
   2952     UnicodeSet  *fUpperSet;
   2953     UnicodeSet  *fOLetterSet;
   2954     UnicodeSet  *fNumericSet;
   2955     UnicodeSet  *fATermSet;
   2956     UnicodeSet  *fSContinueSet;
   2957     UnicodeSet  *fSTermSet;
   2958     UnicodeSet  *fCloseSet;
   2959     UnicodeSet  *fOtherSet;
   2960     UnicodeSet  *fExtendSet;
   2961 
   2962     const UnicodeString  *fText;
   2963 
   2964 };
   2965 
   2966 RBBISentMonkey::RBBISentMonkey()
   2967 {
   2968     UErrorCode  status = U_ZERO_ERROR;
   2969 
   2970     fSets            = new UVector(status);
   2971 
   2972     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2973     //                       set and made into character classes of their own.  For the monkey impl,
   2974     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2975     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2976     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2977     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2978     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2979     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2980     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2981     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2982     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2983     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2984     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2985     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2986     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2987     fOtherSet        = new UnicodeSet();
   2988 
   2989     if(U_FAILURE(status)) {
   2990       deferredStatus = status;
   2991       return;
   2992     }
   2993 
   2994     fOtherSet->complement();
   2995     fOtherSet->removeAll(*fSepSet);
   2996     fOtherSet->removeAll(*fFormatSet);
   2997     fOtherSet->removeAll(*fSpSet);
   2998     fOtherSet->removeAll(*fLowerSet);
   2999     fOtherSet->removeAll(*fUpperSet);
   3000     fOtherSet->removeAll(*fOLetterSet);
   3001     fOtherSet->removeAll(*fNumericSet);
   3002     fOtherSet->removeAll(*fATermSet);
   3003     fOtherSet->removeAll(*fSContinueSet);
   3004     fOtherSet->removeAll(*fSTermSet);
   3005     fOtherSet->removeAll(*fCloseSet);
   3006     fOtherSet->removeAll(*fExtendSet);
   3007 
   3008     fSets->addElement(fSepSet,       status);
   3009     fSets->addElement(fFormatSet,    status);
   3010     fSets->addElement(fSpSet,        status);
   3011     fSets->addElement(fLowerSet,     status);
   3012     fSets->addElement(fUpperSet,     status);
   3013     fSets->addElement(fOLetterSet,   status);
   3014     fSets->addElement(fNumericSet,   status);
   3015     fSets->addElement(fATermSet,     status);
   3016     fSets->addElement(fSContinueSet, status);
   3017     fSets->addElement(fSTermSet,     status);
   3018     fSets->addElement(fCloseSet,     status);
   3019     fSets->addElement(fOtherSet,     status);
   3020     fSets->addElement(fExtendSet,    status);
   3021 
   3022     if (U_FAILURE(status)) {
   3023         deferredStatus = status;
   3024     }
   3025 }
   3026 
   3027 
   3028 
   3029 void RBBISentMonkey::setText(const UnicodeString &s) {
   3030     fText       = &s;
   3031 }
   3032 
   3033 UVector  *RBBISentMonkey::charClasses() {
   3034     return fSets;
   3035 }
   3036 
   3037 
   3038 //  moveBack()   Find the "significant" code point preceding the index i.
   3039 //               Skips over ($Extend | $Format)* .
   3040 //
   3041 int RBBISentMonkey::moveBack(int i) {
   3042     if (i <= 0) {
   3043         return -1;
   3044     }
   3045     UChar32   c;
   3046     int32_t   j = i;
   3047     do {
   3048         j = fText->moveIndex32(j, -1);
   3049         c = fText->char32At(j);
   3050     }
   3051     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   3052     return j;
   3053 
   3054  }
   3055 
   3056 
   3057 int RBBISentMonkey::moveForward(int i) {
   3058     if (i>=fText->length()) {
   3059         return fText->length();
   3060     }
   3061     UChar32   c;
   3062     int32_t   j = i;
   3063     do {
   3064         j = fText->moveIndex32(j, 1);
   3065         c = cAt(j);
   3066     }
   3067     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   3068     return j;
   3069 }
   3070 
   3071 UChar32 RBBISentMonkey::cAt(int pos) {
   3072     if (pos<0 || pos>=fText->length()) {
   3073         return -1;
   3074     } else {
   3075         return fText->char32At(pos);
   3076     }
   3077 }
   3078 
   3079 int32_t RBBISentMonkey::next(int32_t prevPos) {
   3080     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3081                               //   break position being tested.  The candidate break
   3082                               //   location is before p2.
   3083 
   3084     int     breakPos = -1;
   3085 
   3086     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3087     UChar32 c;
   3088 
   3089     if (U_FAILURE(deferredStatus)) {
   3090         return -1;
   3091     }
   3092 
   3093     // Prev break at end of string.  return DONE.
   3094     if (prevPos >= fText->length()) {
   3095         return -1;
   3096     }
   3097     p0 = p1 = p2 = p3 = prevPos;
   3098     c3 =  fText->char32At(prevPos);
   3099     c0 = c1 = c2 = 0;
   3100 
   3101     // Loop runs once per "significant" character position in the input text.
   3102     for (;;) {
   3103         // Move all of the positions forward in the input string.
   3104         p0 = p1;  c0 = c1;
   3105         p1 = p2;  c1 = c2;
   3106         p2 = p3;  c2 = c3;
   3107 
   3108         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3109         p3 = moveForward(p3);
   3110         c3 = cAt(p3);
   3111 
   3112         // Rule (3)  CR x LF
   3113         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   3114             continue;
   3115         }
   3116 
   3117         // Rule (4).   Sep  <break>
   3118         if (fSepSet->contains(c1)) {
   3119             p2 = p1+1;   // Separators don't combine with Extend or Format.
   3120             break;
   3121         }
   3122 
   3123         if (p2 >= fText->length()) {
   3124             // Reached end of string.  Always a break position.
   3125             break;
   3126         }
   3127 
   3128         if (p2 == prevPos) {
   3129             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3130             continue;
   3131         }
   3132 
   3133         // Rule (6).   ATerm x Numeric
   3134         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   3135             continue;
   3136         }
   3137 
   3138         // Rule (7).  Upper ATerm  x  Uppper
   3139         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   3140             continue;
   3141         }
   3142 
   3143         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   3144         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   3145         //                  note to the Unicode 5.0 documents.
   3146         int p8 = p1;
   3147         while (fSpSet->contains(cAt(p8))) {
   3148             p8 = moveBack(p8);
   3149         }
   3150         while (fCloseSet->contains(cAt(p8))) {
   3151             p8 = moveBack(p8);
   3152         }
   3153         if (fATermSet->contains(cAt(p8))) {
   3154             p8=p2;
   3155             for (;;) {
   3156                 c = cAt(p8);
   3157                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   3158                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   3159                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   3160                     break;
   3161                 }
   3162                 p8 = moveForward(p8);
   3163             }
   3164             if (fLowerSet->contains(cAt(p8))) {
   3165                 continue;
   3166             }
   3167         }
   3168 
   3169         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   3170         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   3171             p8 = p1;
   3172             while (fSpSet->contains(cAt(p8))) {
   3173                 p8 = moveBack(p8);
   3174             }
   3175             while (fCloseSet->contains(cAt(p8))) {
   3176                 p8 = moveBack(p8);
   3177             }
   3178             c = cAt(p8);
   3179             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   3180                 continue;
   3181             }
   3182         }
   3183 
   3184         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   3185         int p9 = p1;
   3186         while (fCloseSet->contains(cAt(p9))) {
   3187             p9 = moveBack(p9);
   3188         }
   3189         c = cAt(p9);
   3190         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   3191             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3192                 continue;
   3193             }
   3194         }
   3195 
   3196         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   3197         int p10 = p1;
   3198         while (fSpSet->contains(cAt(p10))) {
   3199             p10 = moveBack(p10);
   3200         }
   3201         while (fCloseSet->contains(cAt(p10))) {
   3202             p10 = moveBack(p10);
   3203         }
   3204         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   3205             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3206                 continue;
   3207             }
   3208         }
   3209 
   3210         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   3211         int p11 = p1;
   3212         if (fSepSet->contains(cAt(p11))) {
   3213             p11 = moveBack(p11);
   3214         }
   3215         while (fSpSet->contains(cAt(p11))) {
   3216             p11 = moveBack(p11);
   3217         }
   3218         while (fCloseSet->contains(cAt(p11))) {
   3219             p11 = moveBack(p11);
   3220         }
   3221         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   3222             break;
   3223         }
   3224 
   3225         //  Rule (12)  Any x Any
   3226         continue;
   3227     }
   3228     breakPos = p2;
   3229     return breakPos;
   3230 }
   3231 
   3232 RBBISentMonkey::~RBBISentMonkey() {
   3233     delete fSets;
   3234     delete fSepSet;
   3235     delete fFormatSet;
   3236     delete fSpSet;
   3237     delete fLowerSet;
   3238     delete fUpperSet;
   3239     delete fOLetterSet;
   3240     delete fNumericSet;
   3241     delete fATermSet;
   3242     delete fSContinueSet;
   3243     delete fSTermSet;
   3244     delete fCloseSet;
   3245     delete fOtherSet;
   3246     delete fExtendSet;
   3247 }
   3248 
   3249 
   3250 
   3251 //-------------------------------------------------------------------------------------------
   3252 //
   3253 //  RBBILineMonkey
   3254 //
   3255 //-------------------------------------------------------------------------------------------
   3256 
   3257 class RBBILineMonkey: public RBBIMonkeyKind {
   3258 public:
   3259     RBBILineMonkey();
   3260     virtual          ~RBBILineMonkey();
   3261     virtual  UVector *charClasses();
   3262     virtual  void     setText(const UnicodeString &s);
   3263     virtual  int32_t  next(int32_t i);
   3264     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   3265 private:
   3266     UVector      *fSets;
   3267 
   3268     UnicodeSet  *fBK;
   3269     UnicodeSet  *fCR;
   3270     UnicodeSet  *fLF;
   3271     UnicodeSet  *fCM;
   3272     UnicodeSet  *fNL;
   3273     UnicodeSet  *fSG;
   3274     UnicodeSet  *fWJ;
   3275     UnicodeSet  *fZW;
   3276     UnicodeSet  *fGL;
   3277     UnicodeSet  *fCB;
   3278     UnicodeSet  *fSP;
   3279     UnicodeSet  *fB2;
   3280     UnicodeSet  *fBA;
   3281     UnicodeSet  *fBB;
   3282     UnicodeSet  *fHY;
   3283     UnicodeSet  *fH2;
   3284     UnicodeSet  *fH3;
   3285     UnicodeSet  *fCL;
   3286     UnicodeSet  *fCP;
   3287     UnicodeSet  *fEX;
   3288     UnicodeSet  *fIN;
   3289     UnicodeSet  *fJL;
   3290     UnicodeSet  *fJV;
   3291     UnicodeSet  *fJT;
   3292     UnicodeSet  *fNS;
   3293     UnicodeSet  *fOP;
   3294     UnicodeSet  *fQU;
   3295     UnicodeSet  *fIS;
   3296     UnicodeSet  *fNU;
   3297     UnicodeSet  *fPO;
   3298     UnicodeSet  *fPR;
   3299     UnicodeSet  *fSY;
   3300     UnicodeSet  *fAI;
   3301     UnicodeSet  *fAL;
   3302     UnicodeSet  *fID;
   3303     UnicodeSet  *fSA;
   3304     UnicodeSet  *fXX;
   3305 
   3306     BreakIterator  *fCharBI;
   3307 
   3308     const UnicodeString  *fText;
   3309     int32_t              *fOrigPositions;
   3310 
   3311     RegexMatcher         *fNumberMatcher;
   3312     RegexMatcher         *fLB11Matcher;
   3313 };
   3314 
   3315 
   3316 RBBILineMonkey::RBBILineMonkey()
   3317 {
   3318     UErrorCode  status = U_ZERO_ERROR;
   3319 
   3320     fSets  = new UVector(status);
   3321 
   3322     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3323     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3324     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3325     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3326     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3327     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3328     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3329     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3330     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3331     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3332     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3333     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3334     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3335     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3336     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3337     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3338     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3339     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3340     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3341     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3342     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3343     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3344     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3345     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3346     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3347     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3348     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3349     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3350     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3351     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3352     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3353     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3354     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3355     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3356     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3357     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3358     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3359 
   3360     if (U_FAILURE(status)) {
   3361         deferredStatus = status;
   3362         fCharBI = NULL;
   3363         fNumberMatcher = NULL;
   3364         return;
   3365     }
   3366 
   3367     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3368     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3369     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3370     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3371 
   3372     fSets->addElement(fBK, status);
   3373     fSets->addElement(fCR, status);
   3374     fSets->addElement(fLF, status);
   3375     fSets->addElement(fCM, status);
   3376     fSets->addElement(fNL, status);
   3377     fSets->addElement(fWJ, status);
   3378     fSets->addElement(fZW, status);
   3379     fSets->addElement(fGL, status);
   3380     fSets->addElement(fCB, status);
   3381     fSets->addElement(fSP, status);
   3382     fSets->addElement(fB2, status);
   3383     fSets->addElement(fBA, status);
   3384     fSets->addElement(fBB, status);
   3385     fSets->addElement(fHY, status);
   3386     fSets->addElement(fH2, status);
   3387     fSets->addElement(fH3, status);
   3388     fSets->addElement(fCL, status);
   3389     fSets->addElement(fCP, status);
   3390     fSets->addElement(fEX, status);
   3391     fSets->addElement(fIN, status);
   3392     fSets->addElement(fJL, status);
   3393     fSets->addElement(fJT, status);
   3394     fSets->addElement(fJV, status);
   3395     fSets->addElement(fNS, status);
   3396     fSets->addElement(fOP, status);
   3397     fSets->addElement(fQU, status);
   3398     fSets->addElement(fIS, status);
   3399     fSets->addElement(fNU, status);
   3400     fSets->addElement(fPO, status);
   3401     fSets->addElement(fPR, status);
   3402     fSets->addElement(fSY, status);
   3403     fSets->addElement(fAI, status);
   3404     fSets->addElement(fAL, status);
   3405     fSets->addElement(fID, status);
   3406     fSets->addElement(fWJ, status);
   3407     fSets->addElement(fSA, status);
   3408     fSets->addElement(fSG, status);
   3409 
   3410     const char *rules =
   3411             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3412             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3413             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3414             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3415             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3416             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3417 
   3418     fNumberMatcher = new RegexMatcher(
   3419         UnicodeString(rules, -1, US_INV), 0, status);
   3420 
   3421     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3422 
   3423     if (U_FAILURE(status)) {
   3424         deferredStatus = status;
   3425     }
   3426 }
   3427 
   3428 
   3429 void RBBILineMonkey::setText(const UnicodeString &s) {
   3430     fText       = &s;
   3431     fCharBI->setText(s);
   3432     fNumberMatcher->reset(s);
   3433 }
   3434 
   3435 //
   3436 //  rule9Adjust
   3437 //     Line Break TR rules 9 and 10 implementation.
   3438 //     This deals with combining marks and other sequences that
   3439 //     that must be treated as if they were something other than what they actually are.
   3440 //
   3441 //     This is factored out into a separate function because it must be applied twice for
   3442 //     each potential break, once to the chars before the position being checked, then
   3443 //     again to the text following the possible break.
   3444 //
   3445 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3446     if (pos == -1) {
   3447         // Invalid initial position.  Happens during the warmup iteration of the
   3448         //   main loop in next().
   3449         return;
   3450     }
   3451 
   3452     int32_t  nPos = *nextPos;
   3453 
   3454     // LB 9  Keep combining sequences together.
   3455     //  advance over any CM class chars.  Note that Line Break CM is different
   3456     //  from the normal Grapheme Extend property.
   3457     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3458           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3459         for (;;) {
   3460             *nextChar = fText->char32At(nPos);
   3461             if (!fCM->contains(*nextChar)) {
   3462                 break;
   3463             }
   3464             nPos = fText->moveIndex32(nPos, 1);
   3465         }
   3466     }
   3467 
   3468 
   3469     // LB 9 Treat X CM* as if it were x.
   3470     //       No explicit action required.
   3471 
   3472     // LB 10  Treat any remaining combining mark as AL
   3473     if (fCM->contains(*posChar)) {
   3474         *posChar = 0x41;   // thisChar = 'A';
   3475     }
   3476 
   3477     // Push the updated nextPos and nextChar back to our caller.
   3478     // This only makes a difference if posChar got bigger by consuming a
   3479     // combining sequence.
   3480     *nextPos  = nPos;
   3481     *nextChar = fText->char32At(nPos);
   3482 }
   3483 
   3484 
   3485 
   3486 int32_t RBBILineMonkey::next(int32_t startPos) {
   3487     UErrorCode status = U_ZERO_ERROR;
   3488     int32_t    pos;       //  Index of the char following a potential break position
   3489     UChar32    thisChar;  //  Character at above position "pos"
   3490 
   3491     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3492     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3493                           //   and thisChar may not be adjacent because combining
   3494                           //   characters between them will be ignored.
   3495 
   3496     int32_t    nextPos;   //  Index of the next character following pos.
   3497                           //     Usually skips over combining marks.
   3498     int32_t    nextCPPos; //  Index of the code point following "pos."
   3499                           //     May point to a combining mark.
   3500     int32_t    tPos;      //  temp value.
   3501     UChar32    c;
   3502 
   3503     if (U_FAILURE(deferredStatus)) {
   3504         return -1;
   3505     }
   3506 
   3507     if (startPos >= fText->length()) {
   3508         return -1;
   3509     }
   3510 
   3511 
   3512     // Initial values for loop.  Loop will run the first time without finding breaks,
   3513     //                           while the invalid values shift out and the "this" and
   3514     //                           "prev" positions are filled in with good values.
   3515     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3516     thisChar = prevChar  = 0;
   3517     nextPos  = nextCPPos = startPos;
   3518 
   3519 
   3520     // Loop runs once per position in the test text, until a break position
   3521     //  is found.
   3522     for (;;) {
   3523         prevPos   = pos;
   3524         prevChar  = thisChar;
   3525 
   3526         pos       = nextPos;
   3527         thisChar  = fText->char32At(pos);
   3528 
   3529         nextCPPos = fText->moveIndex32(pos, 1);
   3530         nextPos   = nextCPPos;
   3531 
   3532         // Rule LB2 - Break at end of text.
   3533         if (pos >= fText->length()) {
   3534             break;
   3535         }
   3536 
   3537         // Rule LB 9 - adjust for combining sequences.
   3538         //             We do this one out-of-order because the adjustment does not change anything
   3539         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3540         //             be applied.
   3541         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3542         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3543         c = fText->char32At(nextPos);
   3544         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3545 
   3546         // If the loop is still warming up - if we haven't shifted the initial
   3547         //   -1 positions out of prevPos yet - loop back to advance the
   3548         //    position in the input without any further looking for breaks.
   3549         if (prevPos == -1) {
   3550             continue;
   3551         }
   3552 
   3553         // LB 4  Always break after hard line breaks,
   3554         if (fBK->contains(prevChar)) {
   3555             break;
   3556         }
   3557 
   3558         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3559         if (prevChar == 0x0d && thisChar == 0x0a) {
   3560             continue;
   3561         }
   3562         if (prevChar == 0x0d ||
   3563             prevChar == 0x0a ||
   3564             prevChar == 0x85)  {
   3565             break;
   3566         }
   3567 
   3568         // LB 6  Don't break before hard line breaks
   3569         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3570             fBK->contains(thisChar)) {
   3571                 continue;
   3572         }
   3573 
   3574 
   3575         // LB 7  Don't break before spaces or zero-width space.
   3576         if (fSP->contains(thisChar)) {
   3577             continue;
   3578         }
   3579 
   3580         if (fZW->contains(thisChar)) {
   3581             continue;
   3582         }
   3583 
   3584         // LB 8  Break after zero width space
   3585         if (fZW->contains(prevChar)) {
   3586             break;
   3587         }
   3588 
   3589         // LB 9, 10  Already done, at top of loop.
   3590         //
   3591 
   3592 
   3593         // LB 11  Do not break before or after WORD JOINER and related characters.
   3594         //    x  WJ
   3595         //    WJ  x
   3596         //
   3597         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3598             continue;
   3599         }
   3600 
   3601         // LB 12
   3602         //    GL  x
   3603         if (fGL->contains(prevChar)) {
   3604             continue;
   3605         }
   3606 
   3607         // LB 12a
   3608         //    [^SP BA HY] x GL
   3609         if (!(fSP->contains(prevChar) ||
   3610               fBA->contains(prevChar) ||
   3611               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3612             continue;
   3613         }
   3614 
   3615 
   3616 
   3617         // LB 13  Don't break before closings.
   3618         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3619         //        fall into LB 17 and the more general number regular expression.
   3620         //
   3621         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3622             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3623                                          fEX->contains(thisChar)  ||
   3624             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3625             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3626             continue;
   3627         }
   3628 
   3629         // LB 14 Don't break after OP SP*
   3630         //       Scan backwards, checking for this sequence.
   3631         //       The OP char could include combining marks, so we actually check for
   3632         //           OP CM* SP*
   3633         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3634         //       sequence into a ID char, so before scanning back through spaces,
   3635         //       verify that prevChar is indeed a space.  The prevChar variable
   3636         //       may differ from fText[prevPos]
   3637         tPos = prevPos;
   3638         if (fSP->contains(prevChar)) {
   3639             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3640                 tPos=fText->moveIndex32(tPos, -1);
   3641             }
   3642         }
   3643         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3644             tPos=fText->moveIndex32(tPos, -1);
   3645         }
   3646         if (fOP->contains(fText->char32At(tPos))) {
   3647             continue;
   3648         }
   3649 
   3650 
   3651         // LB 15    QU SP* x OP
   3652         if (fOP->contains(thisChar)) {
   3653             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3654             int tPos = prevPos;
   3655             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3656                 tPos = fText->moveIndex32(tPos, -1);
   3657             }
   3658             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3659                 tPos = fText->moveIndex32(tPos, -1);
   3660             }
   3661             if (fQU->contains(fText->char32At(tPos))) {
   3662                 continue;
   3663             }
   3664         }
   3665 
   3666 
   3667 
   3668         // LB 16   (CL | CP) SP* x NS
   3669         //    Scan backwards for SP* CM* (CL | CP)
   3670         if (fNS->contains(thisChar)) {
   3671             int tPos = prevPos;
   3672             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3673                 tPos = fText->moveIndex32(tPos, -1);
   3674             }
   3675             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3676                 tPos = fText->moveIndex32(tPos, -1);
   3677             }
   3678             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3679                 continue;
   3680             }
   3681         }
   3682 
   3683 
   3684         // LB 17        B2 SP* x B2
   3685         if (fB2->contains(thisChar)) {
   3686             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3687             tPos = prevPos;
   3688             if (fSP->contains(prevChar)) {
   3689                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3690                     tPos=fText->moveIndex32(tPos, -1);
   3691                 }
   3692             }
   3693             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3694                 tPos=fText->moveIndex32(tPos, -1);
   3695             }
   3696             if (fB2->contains(fText->char32At(tPos))) {
   3697                 continue;
   3698             }
   3699         }
   3700 
   3701 
   3702         // LB 18    break after space
   3703         if (fSP->contains(prevChar)) {
   3704             break;
   3705         }
   3706 
   3707         // LB 19
   3708         //    x   QU
   3709         //    QU  x
   3710         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3711             continue;
   3712         }
   3713 
   3714         // LB 20  Break around a CB
   3715         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3716             break;
   3717         }
   3718 
   3719         // LB 21
   3720         if (fBA->contains(thisChar) ||
   3721             fHY->contains(thisChar) ||
   3722             fNS->contains(thisChar) ||
   3723             fBB->contains(prevChar) )   {
   3724             continue;
   3725         }
   3726 
   3727         // LB 22
   3728         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3729             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3730             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3731             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3732             continue;
   3733         }
   3734 
   3735 
   3736         // LB 23    ID x PO
   3737         //          AL x NU
   3738         //          NU x AL
   3739         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3740             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3741             (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
   3742             continue;
   3743         }
   3744 
   3745         // LB 24  Do not break between prefix and letters or ideographs.
   3746         //        PR x ID
   3747         //        PR x AL
   3748         //        PO x AL
   3749         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3750             (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
   3751             (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
   3752             continue;
   3753         }
   3754 
   3755 
   3756 
   3757         // LB 25    Numbers
   3758         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3759             if (U_FAILURE(status)) {
   3760                 break;
   3761             }
   3762             // Matched a number.  But could have been just a single digit, which would
   3763             //    not represent a "no break here" between prevChar and thisChar
   3764             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3765             if (numEndIdx > pos) {
   3766                 // Number match includes at least our two chars being checked
   3767                 if (numEndIdx > nextPos) {
   3768                     // Number match includes additional chars.  Update pos and nextPos
   3769                     //   so that next loop iteration will continue at the end of the number,
   3770                     //   checking for breaks between last char in number & whatever follows.
   3771                     pos = nextPos = numEndIdx;
   3772                     do {
   3773                         pos = fText->moveIndex32(pos, -1);
   3774                         thisChar = fText->char32At(pos);
   3775                     } while (fCM->contains(thisChar));
   3776                 }
   3777                 continue;
   3778             }
   3779         }
   3780 
   3781 
   3782         // LB 26 Do not break a Korean syllable.
   3783         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3784                                         fJV->contains(thisChar) ||
   3785                                         fH2->contains(thisChar) ||
   3786                                         fH3->contains(thisChar))) {
   3787                                             continue;
   3788                                         }
   3789 
   3790         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3791             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3792                 continue;
   3793         }
   3794 
   3795         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3796             fJT->contains(thisChar)) {
   3797                 continue;
   3798         }
   3799 
   3800         // LB 27 Treat a Korean Syllable Block the same as ID.
   3801         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3802             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3803             fIN->contains(thisChar)) {
   3804                 continue;
   3805             }
   3806         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3807             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3808             fPO->contains(thisChar)) {
   3809                 continue;
   3810             }
   3811         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3812             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3813                 continue;
   3814             }
   3815 
   3816 
   3817 
   3818         // LB 28  Do not break between alphabetics ("at").
   3819         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   3820             continue;
   3821         }
   3822 
   3823         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3824         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   3825             continue;
   3826         }
   3827 
   3828         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3829         //          (AL | NU) x OP
   3830         //          CP x (AL | NU)
   3831         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3832             continue;
   3833         }
   3834         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
   3835             continue;
   3836         }
   3837 
   3838         // LB 31    Break everywhere else
   3839         break;
   3840 
   3841     }
   3842 
   3843     return pos;
   3844 }
   3845 
   3846 
   3847 UVector  *RBBILineMonkey::charClasses() {
   3848     return fSets;
   3849 }
   3850 
   3851 
   3852 RBBILineMonkey::~RBBILineMonkey() {
   3853     delete fSets;
   3854 
   3855     delete fBK;
   3856     delete fCR;
   3857     delete fLF;
   3858     delete fCM;
   3859     delete fNL;
   3860     delete fWJ;
   3861     delete fZW;
   3862     delete fGL;
   3863     delete fCB;
   3864     delete fSP;
   3865     delete fB2;
   3866     delete fBA;
   3867     delete fBB;
   3868     delete fHY;
   3869     delete fH2;
   3870     delete fH3;
   3871     delete fCL;
   3872     delete fCP;
   3873     delete fEX;
   3874     delete fIN;
   3875     delete fJL;
   3876     delete fJV;
   3877     delete fJT;
   3878     delete fNS;
   3879     delete fOP;
   3880     delete fQU;
   3881     delete fIS;
   3882     delete fNU;
   3883     delete fPO;
   3884     delete fPR;
   3885     delete fSY;
   3886     delete fAI;
   3887     delete fAL;
   3888     delete fID;
   3889     delete fSA;
   3890     delete fSG;
   3891     delete fXX;
   3892 
   3893     delete fCharBI;
   3894     delete fNumberMatcher;
   3895 }
   3896 
   3897 
   3898 //-------------------------------------------------------------------------------------------
   3899 //
   3900 //   TestMonkey
   3901 //
   3902 //     params
   3903 //       seed=nnnnn        Random number starting seed.
   3904 //                         Setting the seed allows errors to be reproduced.
   3905 //       loop=nnn          Looping count.  Controls running time.
   3906 //                         -1:  run forever.
   3907 //                          0 or greater:  run length.
   3908 //
   3909 //       type = char | word | line | sent | title
   3910 //
   3911 //-------------------------------------------------------------------------------------------
   3912 
   3913 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3914     int32_t val = defaultVal;
   3915     name.append(" *= *(-?\\d+)");
   3916     UErrorCode status = U_ZERO_ERROR;
   3917     RegexMatcher m(name, params, 0, status);
   3918     if (m.find()) {
   3919         // The param exists.  Convert the string to an int.
   3920         char valString[100];
   3921         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3922         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3923             paramLength = (int32_t)(sizeof(valString)-2);
   3924         }
   3925         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3926         val = strtol(valString,  NULL, 10);
   3927 
   3928         // Delete this parameter from the params string.
   3929         m.reset();
   3930         params = m.replaceFirst("", status);
   3931     }
   3932     U_ASSERT(U_SUCCESS(status));
   3933     return val;
   3934 }
   3935 #endif
   3936 
   3937 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3938                                     BreakIterator *bi,
   3939                                     int expected[],
   3940                                     int expectedcount)
   3941 {
   3942     int count = 0;
   3943     int i = 0;
   3944     int forward[50];
   3945     bi->setText(ustr);
   3946     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3947         forward[count] = i;
   3948         if (count < expectedcount && expected[count] != i) {
   3949             test->errln("break forward test failed: expected %d but got %d",
   3950                         expected[count], i);
   3951             break;
   3952         }
   3953         count ++;
   3954     }
   3955     if (count != expectedcount) {
   3956         printStringBreaks(ustr, expected, expectedcount);
   3957         test->errln("break forward test failed: missed %d match",
   3958                     expectedcount - count);
   3959         return;
   3960     }
   3961     // testing boundaries
   3962     for (i = 1; i < expectedcount; i ++) {
   3963         int j = expected[i - 1];
   3964         if (!bi->isBoundary(j)) {
   3965             printStringBreaks(ustr, expected, expectedcount);
   3966             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3967             return;
   3968         }
   3969         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3970             if (bi->isBoundary(j)) {
   3971                 printStringBreaks(ustr, expected, expectedcount);
   3972                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3973                 return;
   3974             }
   3975         }
   3976     }
   3977 
   3978     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3979         count --;
   3980         if (forward[count] != i) {
   3981             test->errln("happy break test previous() failed: expected %d but got %d",
   3982                         forward[count], i);
   3983             break;
   3984         }
   3985     }
   3986     if (count != 0) {
   3987         printStringBreaks(ustr, expected, expectedcount);
   3988         test->errln("break test previous() failed: missed a match");
   3989         return;
   3990     }
   3991 
   3992     // testing preceding
   3993     for (i = 0; i < expectedcount - 1; i ++) {
   3994         // int j = expected[i] + 1;
   3995         int j = ustr.moveIndex32(expected[i], 1);
   3996         for (; j <= expected[i + 1]; j ++) {
   3997             if (bi->preceding(j) != expected[i]) {
   3998                 printStringBreaks(ustr, expected, expectedcount);
   3999                 test->errln("preceding(): Not expecting boundary at position %d", j);
   4000                 return;
   4001             }
   4002         }
   4003     }
   4004 }
   4005 
   4006 void RBBITest::TestWordBreaks(void)
   4007 {
   4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4009 
   4010     Locale        locale("en");
   4011     UErrorCode    status = U_ZERO_ERROR;
   4012     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4013     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4014     static const char *strlist[] =
   4015     {
   4016     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   4017     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
   4018     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   4019     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   4020     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
   4021     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4022     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   4023     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
   4024     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4025     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4026     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4027     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4028     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4029     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4030     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4031     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4032     "\\u0027\\u11af\\U000e0057\\u0602",
   4033     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4034     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4035     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4036     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4037     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4038     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4039     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4040     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4041     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4042     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4043     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4044     "\\ua183\\u102d\\u0bec\\u003a",
   4045     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4046     "\\u003a\\u0e57\\u0fad\\u002e",
   4047     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4048     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4049     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   4050     "\\u003a\\u0664\\u00b7\\u1fba",
   4051     "\\u003b\\u0027\\u00b7\\u47a3",
   4052     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
   4053     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   4054     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   4055     };
   4056     int loop;
   4057     if (U_FAILURE(status)) {
   4058         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4059         return;
   4060     }
   4061     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4062         // printf("looping %d\n", loop);
   4063         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   4064         // RBBICharMonkey monkey;
   4065         RBBIWordMonkey monkey;
   4066 
   4067         int expected[50];
   4068         int expectedcount = 0;
   4069 
   4070         monkey.setText(ustr);
   4071         int i;
   4072         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4073             expected[expectedcount ++] = i;
   4074         }
   4075 
   4076         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4077     }
   4078     delete bi;
   4079 #endif
   4080 }
   4081 
   4082 void RBBITest::TestWordBoundary(void)
   4083 {
   4084     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   4085     Locale        locale("en");
   4086     UErrorCode    status = U_ZERO_ERROR;
   4087     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4088     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4089     UChar         str[50];
   4090     static const char *strlist[] =
   4091     {
   4092     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4093     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4094     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4095     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4096     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4097     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4098     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4099     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4100     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4101     "\\u0027\\u11af\\U000e0057\\u0602",
   4102     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4103     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4104     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4105     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4106     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4107     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4108     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4109     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4110     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4111     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4112     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4113     "\\ua183\\u102d\\u0bec\\u003a",
   4114     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4115     "\\u003a\\u0e57\\u0fad\\u002e",
   4116     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4117     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4118     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4119     "\\u003a\\u0664\\u00b7\\u1fba",
   4120     "\\u003b\\u0027\\u00b7\\u47a3",
   4121     };
   4122     int loop;
   4123     if (U_FAILURE(status)) {
   4124         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4125         return;
   4126     }
   4127     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4128         // printf("looping %d\n", loop);
   4129         u_unescape(strlist[loop], str, 20);
   4130         UnicodeString ustr(str);
   4131         int forward[50];
   4132         int count = 0;
   4133 
   4134         bi->setText(ustr);
   4135         int prev = 0;
   4136         int i;
   4137         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4138             forward[count ++] = i;
   4139             if (i > prev) {
   4140                 int j;
   4141                 for (j = prev + 1; j < i; j ++) {
   4142                     if (bi->isBoundary(j)) {
   4143                         printStringBreaks(ustr, forward, count);
   4144                         errln("happy boundary test failed: expected %d not a boundary",
   4145                                j);
   4146                         return;
   4147                     }
   4148                 }
   4149             }
   4150             if (!bi->isBoundary(i)) {
   4151                 printStringBreaks(ustr, forward, count);
   4152                 errln("happy boundary test failed: expected %d a boundary",
   4153                        i);
   4154                 return;
   4155             }
   4156             prev = i;
   4157         }
   4158     }
   4159     delete bi;
   4160 }
   4161 
   4162 void RBBITest::TestLineBreaks(void)
   4163 {
   4164 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4165     Locale        locale("en");
   4166     UErrorCode    status = U_ZERO_ERROR;
   4167     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4168     const int32_t  STRSIZE = 50;
   4169     UChar         str[STRSIZE];
   4170     static const char *strlist[] =
   4171     {
   4172      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4173      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4174              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4175      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4176              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4177      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4178      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4179      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4180      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4181      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4182      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4183      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4184      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4185      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4186      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4187      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4188      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4189      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4190      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4191      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4192      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4193      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4194      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4195      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4196      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4197      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4198      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4199      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4200      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4201      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4202      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4203      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4204      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4205      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4206      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4207      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4208      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4209      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4210      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4211      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4212      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4213      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4214          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4215          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4216          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4217      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4218          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4219     };
   4220     int loop;
   4221     TEST_ASSERT_SUCCESS(status);
   4222     if (U_FAILURE(status)) {
   4223         return;
   4224     }
   4225     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4226         // printf("looping %d\n", loop);
   4227         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4228         if (t >= STRSIZE) {
   4229             TEST_ASSERT(FALSE);
   4230             continue;
   4231         }
   4232 
   4233 
   4234         UnicodeString ustr(str);
   4235         RBBILineMonkey monkey;
   4236         if (U_FAILURE(monkey.deferredStatus)) {
   4237             continue;
   4238         }
   4239 
   4240         const int EXPECTEDSIZE = 50;
   4241         int expected[EXPECTEDSIZE];
   4242         int expectedcount = 0;
   4243 
   4244         monkey.setText(ustr);
   4245         int i;
   4246         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4247             if (expectedcount >= EXPECTEDSIZE) {
   4248                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4249                 return;
   4250             }
   4251             expected[expectedcount ++] = i;
   4252         }
   4253 
   4254         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4255     }
   4256     delete bi;
   4257 #endif
   4258 }
   4259 
   4260 void RBBITest::TestSentBreaks(void)
   4261 {
   4262 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4263     Locale        locale("en");
   4264     UErrorCode    status = U_ZERO_ERROR;
   4265     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4266     UChar         str[200];
   4267     static const char *strlist[] =
   4268     {
   4269      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4270      "This\n",
   4271      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4272      "\"Sentence ending with a quote.\" Bye.",
   4273      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4274      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4275      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4276      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4277      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4278      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4279      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4280              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4281              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4282              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4283      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4284              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4285              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4286              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4287              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4288              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4289     };
   4290     int loop;
   4291     if (U_FAILURE(status)) {
   4292         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4293         return;
   4294     }
   4295     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4296         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4297         UnicodeString ustr(str);
   4298 
   4299         RBBISentMonkey monkey;
   4300         if (U_FAILURE(monkey.deferredStatus)) {
   4301             continue;
   4302         }
   4303 
   4304         const int EXPECTEDSIZE = 50;
   4305         int expected[EXPECTEDSIZE];
   4306         int expectedcount = 0;
   4307 
   4308         monkey.setText(ustr);
   4309         int i;
   4310         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4311             if (expectedcount >= EXPECTEDSIZE) {
   4312                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4313                 return;
   4314             }
   4315             expected[expectedcount ++] = i;
   4316         }
   4317 
   4318         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4319     }
   4320     delete bi;
   4321 #endif
   4322 }
   4323 
   4324 void RBBITest::TestMonkey(char *params) {
   4325 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4326 
   4327     UErrorCode     status    = U_ZERO_ERROR;
   4328     int32_t        loopCount = 500;
   4329     int32_t        seed      = 1;
   4330     UnicodeString  breakType = "all";
   4331     Locale         locale("en");
   4332     UBool          useUText  = FALSE;
   4333 
   4334     if (quick == FALSE) {
   4335         loopCount = 10000;
   4336     }
   4337 
   4338     if (params) {
   4339         UnicodeString p(params);
   4340         loopCount = getIntParam("loop", p, loopCount);
   4341         seed      = getIntParam("seed", p, seed);
   4342 
   4343         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4344         if (m.find()) {
   4345             breakType = m.group(1, status);
   4346             m.reset();
   4347             p = m.replaceFirst("", status);
   4348         }
   4349 
   4350         RegexMatcher u(" *utext", p, 0, status);
   4351         if (u.find()) {
   4352             useUText = TRUE;
   4353             u.reset();
   4354             p = u.replaceFirst("", status);
   4355         }
   4356 
   4357 
   4358         // m.reset(p);
   4359         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4360             // Each option is stripped out of the option string as it is processed.
   4361             // All options have been checked.  The option string should have been completely emptied..
   4362             char buf[100];
   4363             p.extract(buf, sizeof(buf), NULL, status);
   4364             buf[sizeof(buf)-1] = 0;
   4365             errln("Unrecognized or extra parameter:  %s\n", buf);
   4366             return;
   4367         }
   4368 
   4369     }
   4370 
   4371     if (breakType == "char" || breakType == "all") {
   4372         RBBICharMonkey  m;
   4373         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4374         if (U_SUCCESS(status)) {
   4375             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4376             if (breakType == "all" && useUText==FALSE) {
   4377                 // Also run a quick test with UText when "all" is specified
   4378                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4379             }
   4380         }
   4381         else {
   4382             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4383         }
   4384         delete bi;
   4385     }
   4386 
   4387     if (breakType == "word" || breakType == "all") {
   4388         logln("Word Break Monkey Test");
   4389         RBBIWordMonkey  m;
   4390         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4391         if (U_SUCCESS(status)) {
   4392             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4393         }
   4394         else {
   4395             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4396         }
   4397         delete bi;
   4398     }
   4399 
   4400     if (breakType == "line" || breakType == "all") {
   4401         logln("Line Break Monkey Test");
   4402         RBBILineMonkey  m;
   4403         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4404         if (loopCount >= 10) {
   4405             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4406         }
   4407         if (U_SUCCESS(status)) {
   4408             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4409         }
   4410         else {
   4411             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4412         }
   4413         delete bi;
   4414     }
   4415 
   4416     if (breakType == "sent" || breakType == "all"  ) {
   4417         logln("Sentence Break Monkey Test");
   4418         RBBISentMonkey  m;
   4419         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4420         if (loopCount >= 10) {
   4421             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4422         }
   4423         if (U_SUCCESS(status)) {
   4424             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4425         }
   4426         else {
   4427             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4428         }
   4429         delete bi;
   4430     }
   4431 
   4432 #endif
   4433 }
   4434 
   4435 //
   4436 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4437 //    Parameters:
   4438 //       bi      - the break iterator to use
   4439 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4440 //       name    - Name of test (char, word, etc.) for use in error messages
   4441 //       seed    - Seed for starting random number generator (parameter from user)
   4442 //       numIterations
   4443 //
   4444 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4445                          int32_t numIterations, UBool useUText) {
   4446 
   4447 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4448 
   4449     const int32_t    TESTSTRINGLEN = 500;
   4450     UnicodeString    testText;
   4451     int32_t          numCharClasses;
   4452     UVector          *chClasses;
   4453     int              expected[TESTSTRINGLEN*2 + 1];
   4454     int              expectedCount = 0;
   4455     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4456     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4457     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4458     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4459     char             followingBreaks[TESTSTRINGLEN*2+1];
   4460     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4461     int              i;
   4462     int              loopCount = 0;
   4463 
   4464     m_seed = seed;
   4465 
   4466     numCharClasses = mk.charClasses()->size();
   4467     chClasses      = mk.charClasses();
   4468 
   4469     // Check for errors that occured during the construction of the MonkeyKind object.
   4470     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4471     //  and is not visible outside of RBBITest :-(
   4472     if (U_FAILURE(mk.deferredStatus)) {
   4473         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4474         return;
   4475     }
   4476 
   4477     // Verify that the character classes all have at least one member.
   4478     for (i=0; i<numCharClasses; i++) {
   4479         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4480         if (s == NULL || s->size() == 0) {
   4481             errln("Character Class #%d is null or of zero size.", i);
   4482             return;
   4483         }
   4484     }
   4485 
   4486     while (loopCount < numIterations || numIterations == -1) {
   4487         if (numIterations == -1 && loopCount % 10 == 0) {
   4488             // If test is running in an infinite loop, display a periodic tic so
   4489             //   we can tell that it is making progress.
   4490             fprintf(stderr, ".");
   4491         }
   4492         // Save current random number seed, so that we can recreate the random numbers
   4493         //   for this loop iteration in event of an error.
   4494         seed = m_seed;
   4495 
   4496         // Populate a test string with data.
   4497         testText.truncate(0);
   4498         for (i=0; i<TESTSTRINGLEN; i++) {
   4499             int32_t  aClassNum = m_rand() % numCharClasses;
   4500             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4501             int32_t   charIdx = m_rand() % classSet->size();
   4502             UChar32   c = classSet->charAt(charIdx);
   4503             if (c < 0) {   // TODO:  deal with sets containing strings.
   4504                 errln("c < 0");
   4505                 break;
   4506             }
   4507             testText.append(c);
   4508         }
   4509 
   4510         // Calculate the expected results for this test string.
   4511         mk.setText(testText);
   4512         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4513         expectedBreaks[0] = 1;
   4514         int32_t breakPos = 0;
   4515         expectedCount = 0;
   4516         for (;;) {
   4517             breakPos = mk.next(breakPos);
   4518             if (breakPos == -1) {
   4519                 break;
   4520             }
   4521             if (breakPos > testText.length()) {
   4522                 errln("breakPos > testText.length()");
   4523             }
   4524             expectedBreaks[breakPos] = 1;
   4525             U_ASSERT(expectedCount<testText.length());
   4526             expected[expectedCount ++] = breakPos;
   4527         }
   4528 
   4529         // Find the break positions using forward iteration
   4530         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4531         if (useUText) {
   4532             UErrorCode status = U_ZERO_ERROR;
   4533             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4534             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4535             bi->setText(testUText, status);
   4536             TEST_ASSERT_SUCCESS(status);
   4537             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4538                                       //  This UText can be closed immediately, so long as the
   4539                                       //  testText string continues to exist.
   4540         } else {
   4541             bi->setText(testText);
   4542         }
   4543 
   4544         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4545             if (i < 0 || i > testText.length()) {
   4546                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4547                 break;
   4548             }
   4549             forwardBreaks[i] = 1;
   4550         }
   4551 
   4552         // Find the break positions using reverse iteration
   4553         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4554         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4555             if (i < 0 || i > testText.length()) {
   4556                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4557                 break;
   4558             }
   4559             reverseBreaks[i] = 1;
   4560         }
   4561 
   4562         // Find the break positions using isBoundary() tests.
   4563         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4564         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4565         for (i=0; i<=testText.length(); i++) {
   4566             isBoundaryBreaks[i] = bi->isBoundary(i);
   4567         }
   4568 
   4569 
   4570         // Find the break positions using the following() function.
   4571         // printf(".");
   4572         memset(followingBreaks, 0, sizeof(followingBreaks));
   4573         int32_t   lastBreakPos = 0;
   4574         followingBreaks[0] = 1;
   4575         for (i=0; i<testText.length(); i++) {
   4576             breakPos = bi->following(i);
   4577             if (breakPos <= i ||
   4578                 breakPos < lastBreakPos ||
   4579                 breakPos > testText.length() ||
   4580                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4581                 errln("%s break monkey test: "
   4582                     "Out of range value returned by BreakIterator::following().\n"
   4583                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4584                          name, seed, i, breakPos, lastBreakPos);
   4585                 break;
   4586             }
   4587             followingBreaks[breakPos] = 1;
   4588             lastBreakPos = breakPos;
   4589         }
   4590 
   4591         // Find the break positions using the preceding() function.
   4592         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4593         lastBreakPos = testText.length();
   4594         precedingBreaks[testText.length()] = 1;
   4595         for (i=testText.length(); i>0; i--) {
   4596             breakPos = bi->preceding(i);
   4597             if (breakPos >= i ||
   4598                 breakPos > lastBreakPos ||
   4599                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4600                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4601                 errln("%s break monkey test: "
   4602                     "Out of range value returned by BreakIterator::preceding().\n"
   4603                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4604                     name,  i, breakPos, lastBreakPos);
   4605                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4606                     precedingBreaks[i] = 2;   // Forces an error.
   4607                 }
   4608             } else {
   4609                 if (breakPos >= 0) {
   4610                     precedingBreaks[breakPos] = 1;
   4611                 }
   4612                 lastBreakPos = breakPos;
   4613             }
   4614         }
   4615 
   4616         // Compare the expected and actual results.
   4617         for (i=0; i<=testText.length(); i++) {
   4618             const char *errorType = NULL;
   4619             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4620                 errorType = "next()";
   4621             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4622                 errorType = "previous()";
   4623             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4624                 errorType = "isBoundary()";
   4625             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4626                 errorType = "following()";
   4627             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4628                 errorType = "preceding()";
   4629             }
   4630 
   4631 
   4632             if (errorType != NULL) {
   4633                 // Format a range of the test text that includes the failure as
   4634                 //  a data item that can be included in the rbbi test data file.
   4635 
   4636                 // Start of the range is the last point where expected and actual results
   4637                 //   both agreed that there was a break position.
   4638                 int startContext = i;
   4639                 int32_t count = 0;
   4640                 for (;;) {
   4641                     if (startContext==0) { break; }
   4642                     startContext --;
   4643                     if (expectedBreaks[startContext] != 0) {
   4644                         if (count == 2) break;
   4645                         count ++;
   4646                     }
   4647                 }
   4648 
   4649                 // End of range is two expected breaks past the start position.
   4650                 int endContext = i + 1;
   4651                 int ci;
   4652                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4653                     for (;;) {
   4654                         if (endContext >= testText.length()) {break;}
   4655                         if (expectedBreaks[endContext-1] != 0) {
   4656                             if (count == 0) break;
   4657                             count --;
   4658                         }
   4659                         endContext ++;
   4660                     }
   4661                 }
   4662 
   4663                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4664                 UnicodeString errorText = "<data>";
   4665                 /***if (strcmp(errorType, "next()") == 0) {
   4666                     startContext = 0;
   4667                     endContext = testText.length();
   4668 
   4669                     printStringBreaks(testText, expected, expectedCount);
   4670                 }***/
   4671 
   4672                 for (ci=startContext; ci<endContext;) {
   4673                     UnicodeString hexChars("0123456789abcdef");
   4674                     UChar32  c;
   4675                     int      bn;
   4676                     c = testText.char32At(ci);
   4677                     if (ci == i) {
   4678                         // This is the location of the error.
   4679                         errorText.append("<?>");
   4680                     } else if (expectedBreaks[ci] != 0) {
   4681                         // This a non-error expected break position.
   4682                         errorText.append("\\");
   4683                     }
   4684                     if (c < 0x10000) {
   4685                         errorText.append("\\u");
   4686                         for (bn=12; bn>=0; bn-=4) {
   4687                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4688                         }
   4689                     } else {
   4690                         errorText.append("\\U");
   4691                         for (bn=28; bn>=0; bn-=4) {
   4692                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4693                         }
   4694                     }
   4695                     ci = testText.moveIndex32(ci, 1);
   4696                 }
   4697                 errorText.append("\\");
   4698                 errorText.append("</data>\n");
   4699 
   4700                 // Output the error
   4701                 char  charErrorTxt[500];
   4702                 UErrorCode status = U_ZERO_ERROR;
   4703                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4704                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4705                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4706                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4707                     errorType, seed, i, charErrorTxt);
   4708                 break;
   4709             }
   4710         }
   4711 
   4712         loopCount++;
   4713     }
   4714 #endif
   4715 }
   4716 
   4717 
   4718 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4719 //             This test checks the initial patch,
   4720 //             which is to just keep it from crashing.  Correct word boundaries
   4721 //             await a proper fix to the dictionary code.
   4722 //
   4723 void RBBITest::TestBug5532(void)  {
   4724    // Text includes a mixture of Thai and Latin.
   4725    const unsigned char utf8Data[] = {
   4726            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4727            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4728            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4729            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4730            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4731            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4732            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4733            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4734            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4735            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4736            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4737 
   4738     UErrorCode status = U_ZERO_ERROR;
   4739     UText utext=UTEXT_INITIALIZER;
   4740     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4741     TEST_ASSERT_SUCCESS(status);
   4742 
   4743     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4744     TEST_ASSERT_SUCCESS(status);
   4745     if (U_SUCCESS(status)) {
   4746         bi->setText(&utext, status);
   4747         TEST_ASSERT_SUCCESS(status);
   4748 
   4749         int32_t breakCount = 0;
   4750         int32_t previousBreak = -1;
   4751         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4752             // For now, just make sure that the break iterator doesn't hang.
   4753             TEST_ASSERT(previousBreak < bi->current());
   4754             previousBreak = bi->current();
   4755         }
   4756         TEST_ASSERT(breakCount > 0);
   4757     }
   4758     delete bi;
   4759     utext_close(&utext);
   4760 }
   4761 
   4762 
   4763 //
   4764 //  TestDebug    -  A place-holder test for debugging purposes.
   4765 //                  For putting in fragments of other tests that can be invoked
   4766 //                  for tracing  without a lot of unwanted extra stuff happening.
   4767 //
   4768 void RBBITest::TestDebug(void) {
   4769 #if 0
   4770     UErrorCode   status = U_ZERO_ERROR;
   4771     int pos = 0;
   4772     int ruleStatus = 0;
   4773 
   4774     RuleBasedBreakIterator* bi =
   4775        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4776        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4777        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4778     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4779     // UnicodeString s("Aaa.  Bcd");
   4780     s = s.unescape();
   4781     bi->setText(s);
   4782     UBool r = bi->isBoundary(8);
   4783     printf("%s", r?"true":"false");
   4784     return;
   4785     pos = bi->last();
   4786     do {
   4787         // ruleStatus = bi->getRuleStatus();
   4788         printf("%d\t%d\n", pos, ruleStatus);
   4789         pos = bi->previous();
   4790     } while (pos != BreakIterator::DONE);
   4791 #endif
   4792 }
   4793 
   4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4795