Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2011, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include <typeinfo>  // for 'typeid' to work
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/brkiter.h"
     20 #include "unicode/rbbi.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/utf16.h"
     23 #include "unicode/ucnv.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uniset.h"
     26 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     27 #include "unicode/ustring.h"
     28 #include "unicode/utext.h"
     29 #include "intltest.h"
     30 #include "rbbitst.h"
     31 #include <string.h>
     32 #include "uvector.h"
     33 #include "uvectr32.h"
     34 #include "triedict.h"
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include <stdlib.h>
     38 
// TEST_ASSERT(x)
//    Reports a test failure, tagged with the current source file and line, when
//    the expression x evaluates to false.  The expansion calls errln() unqualified,
//    so the macro can only be used where an errln() is in scope.
//    The expansion is a bare brace-enclosed block, not a do { } while (0) wrapper.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode)
//    Reports a failure through errcheckln(), including the file, line and
//    u_errorName() of the code, when U_FAILURE(errcode) is true.
//    Note that errcode is evaluated more than once by the expansion.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     44 
     45 
     46 //---------------------------------------------
     47 // runIndexedTest
     48 //---------------------------------------------
     49 
     50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     51 {
     52     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     53 
     54     switch (index) {
     55 #if !UCONFIG_NO_FILE_IO
     56         case 0: name = "TestBug4153072";
     57             if(exec) TestBug4153072();                         break;
     58 #else
     59         case 0: name = "skip";
     60             break;
     61 #endif
     62 
     63         case 1: name = "TestJapaneseLineBreak";
     64             if(exec) TestJapaneseLineBreak();                  break;
     65         case 2: name = "TestStatusReturn";
     66             if(exec) TestStatusReturn();                       break;
     67 
     68 #if !UCONFIG_NO_FILE_IO
     69         case 3: name = "TestUnicodeFiles";
     70             if(exec) TestUnicodeFiles();                       break;
     71         case 4: name = "TestEmptyString";
     72             if(exec) TestEmptyString();                        break;
     73 #else
     74         case 3: case 4: name = "skip";
     75             break;
     76 #endif
     77 
     78         case 5: name = "TestGetAvailableLocales";
     79             if(exec) TestGetAvailableLocales();                break;
     80 
     81         case 6: name = "TestGetDisplayName";
     82             if(exec) TestGetDisplayName();                     break;
     83 
     84 #if !UCONFIG_NO_FILE_IO
     85         case 7: name = "TestEndBehaviour";
     86             if(exec) TestEndBehaviour();                       break;
     87         case 8: name = "TestMixedThaiLineBreak";
     88              if(exec) TestMixedThaiLineBreak();                break;
     89         case 9: name = "TestThaiLineBreak";
     90              if(exec) TestThaiLineBreak();                     break;
     91         case 10: name = "TestMaiyamok";
     92              if(exec) TestMaiyamok();                          break;
     93         case 11: name = "TestWordBreaks";
     94              if(exec) TestWordBreaks();                        break;
     95         case 12: name = "TestWordBoundary";
     96              if(exec) TestWordBoundary();                      break;
     97         case 13: name = "TestLineBreaks";
     98              if(exec) TestLineBreaks();                        break;
     99         case 14: name = "TestSentBreaks";
    100              if(exec) TestSentBreaks();                        break;
    101         case 15: name = "TestExtended";
    102              if(exec) TestExtended();                          break;
    103 #else
    104         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    105              break;
    106 #endif
    107 
    108         case 16:
    109              if(exec) {
    110  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    111                name = "TestMonkey";
    112                TestMonkey(params);
    113  #else
    114                name = "skip";
    115  #endif
    116              }
    117                                                                break;
    118 
    119 #if !UCONFIG_NO_FILE_IO
    120         case 17: name = "TestBug3818";
    121             if(exec) TestBug3818();                            break;
    122         case 18: name = "TestJapaneseWordBreak";
    123             if(exec) TestJapaneseWordBreak();                  break;
    124 #else
    125         case 17: case 18: name = "skip";
    126             break;
    127 #endif
    128 
    129         case 19: name = "TestDebug";
    130             if(exec) TestDebug();                              break;
    131         case 20: name = "TestTrieDict";
    132             if(exec) TestTrieDict();                           break;
    133 
    134 #if !UCONFIG_NO_FILE_IO
    135         case 21: name = "TestBug5775";
    136             if (exec) TestBug5775();                           break;
    137         case 22: name = "TestTailoredBreaks";
    138             if (exec) TestTailoredBreaks();                    break;
    139 #else
    140         case 21: case 22: name = "skip";
    141             break;
    142 #endif
    143         case 23: name = "TestDictRules";
    144             if (exec) TestDictRules();                         break;
    145         case 24: name = "TestBug5532";
    146             if (exec) TestBug5532();                           break;
    147         default: name = ""; break; //needed to end loop
    148     }
    149 }
    150 
    151 
//---------------------------------------------------------------------------
//
//   class BITestData   Holds a set of Break iterator test data and results
//                      Includes
//                         - the string data to be broken
//                         - a vector of the expected break positions.
//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
//                         - The expected break tag values.
//                         - Vectors of actual break positions and tag values.
//                         - Functions for comparing actual with expected and
//                            reporting errors.
//
//----------------------------------------------------------------------------
class BITestData {
public:
    // The text under test: every chunk passed to addDataChunk(), concatenated.
    UnicodeString    fDataToBreak;
    // Length of fDataToBreak after each chunk was appended, i.e. the offset at
    //   which a break is expected.
    UVector          fExpectedBreakPositions;
    // The tag value supplied with each chunk; parallel to fExpectedBreakPositions.
    UVector          fExpectedTags;
    // The source line number supplied with each chunk; parallel to
    //   fExpectedBreakPositions.
    UVector          fLineNum;
    // Results.  Nothing in this class fills these two vectors; checkResults() and
    //   err() read them and clearResults() empties them.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;

    // Constructs all five vectors with the supplied status.
    BITestData(UErrorCode &status);
    // Appends one chunk of data and records its expected break position, tag and
    //   source line.  status is taken by value, so an error raised while adding to
    //   the vectors is not visible to the caller.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    // Compares actual against expected positions and tags, reporting differences
    //   through test.
    void             checkResults(const char *heading, RBBITest *test);
    // Reports a single break-position mismatch through test.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    // Empties the actual-results vectors; the expected data is kept.
    void             clearResults();
};
    181 
    182 //
    183 // Constructor.
    184 //
    185 BITestData::BITestData(UErrorCode &status)
    186 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    187   fActualTags(status)
    188 {
    189 }
    190 
    191 //
    192 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
    193 //                 The macro form collects the line number, which is helpful
    194 //                 when tracking down failures.
    195 //
    196 //                 A null data item is inserted at the start of each test's data
    197 //                  to put the starting zero into the data list.  The position saved for
    198 //                  each non-null item is its ending position.
    199 //
    200 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    201 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    202     if (U_FAILURE(status)) {return;}
    203     if (data != NULL) {
    204         fDataToBreak.append(CharsToUnicodeString(data));
    205     }
    206     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    207     fExpectedTags.addElement(tag, status);
    208     fLineNum.addElement(lineNum, status);
    209 }
    210 
    211 
    212 //
    213 //  checkResults.   Compare the actual and expected break positions, report any differences.
    214 //
    215 void BITestData::checkResults(const char *heading, RBBITest *test) {
    216     int32_t   expectedIndex = 0;
    217     int32_t   actualIndex = 0;
    218 
    219     for (;;) {
    220         // If we've run through both the expected and actual results vectors, we're done.
    221         //   break out of the loop.
    222         if (expectedIndex >= fExpectedBreakPositions.size() &&
    223             actualIndex   >= fActualBreakPositions.size()) {
    224             break;
    225         }
    226 
    227 
    228         if (expectedIndex >= fExpectedBreakPositions.size()) {
    229             err(heading, test, expectedIndex-1, actualIndex);
    230             actualIndex++;
    231             continue;
    232         }
    233 
    234         if (actualIndex >= fActualBreakPositions.size()) {
    235             err(heading, test, expectedIndex, actualIndex-1);
    236             expectedIndex++;
    237             continue;
    238         }
    239 
    240         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    241             err(heading, test, expectedIndex, actualIndex);
    242             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    243             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    244                 actualIndex++;
    245             } else {
    246                 expectedIndex++;
    247             }
    248             continue;
    249         }
    250 
    251         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    252             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    253                 heading, fLineNum.elementAt(expectedIndex),
    254                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    255         }
    256 
    257         actualIndex++;
    258         expectedIndex++;
    259     }
    260 }
    261 
    262 //
    263 //  err   -  An error was found.  Report it, along with information about where the
    264 //                                incorrectly broken test data appeared in the source file.
    265 //
    266 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    267 {
    268     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    269     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    270     int32_t   o        = 0;
    271     int32_t   line     = fLineNum.elementAti(expectedIdx);
    272     if (expectedIdx > 0) {
    273         // The line numbers are off by one because a premature break occurs somewhere
    274         //    within the previous item, rather than at the start of the current (expected) item.
    275         //    We want to report the offset of the unexpected break from the start of
    276         //      this previous item.
    277         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    278     }
    279     if (actual < expected) {
    280         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    281     } else {
    282         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    283     }
    284 }
    285 
    286 
    287 void BITestData::clearResults() {
    288     fActualBreakPositions.removeAllElements();
    289     fActualTags.removeAllElements();
    290 }
    291 
    292 
//-----------------------------------------------------------------------------------
//
//    Canned Test Characters
//
//-----------------------------------------------------------------------------------

// A fixed, zero-terminated list of UTF-16 code units.  The RBBITest constructor
// copies it into *cannedTestChars.
static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// File-level string shared by every RBBITest object.  It is allocated by the
// RBBITest constructor (a U+0000 followed by the contents of cannedTestArray)
// and deleted by the RBBITest destructor.
static UnicodeString* cannedTestChars = 0;
    311 
// Devanagari text fragments, written as backslash-u escape text (the backslash is
// itself escaped, so each literal holds the characters '\', 'u', ... rather than
// the code point).  NOTE(review): presumably unescaped later, e.g. through
// CharsToUnicodeString() - confirm against the users of these macros.
//   half*  : a consonant followed by \u094d and \u200d.
//   deadTA : the consonant \u0924 followed by \u094d only.
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
    317 
    318 //--------------------------------------------------------------------------------------
    319 //
    320 //    RBBITest    constructor and destructor
    321 //
    322 //--------------------------------------------------------------------------------------
    323 
    324 RBBITest::RBBITest() {
    325     UnicodeString temp(cannedTestArray);
    326     cannedTestChars = new UnicodeString();
    327     *cannedTestChars += (UChar)0x0000;
    328     *cannedTestChars += temp;
    329 }
    330 
    331 
    332 RBBITest::~RBBITest() {
    333     delete cannedTestChars;
    334 }
    335 
    336 
// Named break-tag values.
// NOTE(review): 300 and 400 are the same values that TestJapaneseWordBreak passes
// as literal tags; presumably these mirror the word-break rule status categories
// (number / letter / hiragana-or-katakana / ideograph) - confirm.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;
    341 
    342 
    343 
    344 
    345 
    346 
//--------------------------------------------------------------------
//Testing the BreakIterator for devanagari script
//--------------------------------------------------------------------

// Devanagari fragments written as backslash-u escape text (the backslash is
// escaped, so the literal contains the escape sequence, not the code point).
// Each dead* form is a consonant followed by \u094d, the virama.
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"         /*\u0920 + virama*/
#define deadPA   "\\u092a\\u094d"         /*\u092a + virama*/
#define deadSA   "\\u0938\\u094d"         /*\u0938 + virama*/
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    357 
    358 
    359 
    360 
    361 
    362 
    363 //-----------------------------------------------------------------------------------
    364 //
    365 //   Test for status {tag} return value from break rules.
    366 //        TODO:  a more thorough test.
    367 //
    368 //-----------------------------------------------------------------------------------
    369 void RBBITest::TestStatusReturn() {
    370      UnicodeString rulesString1("$Letters = [:L:];\n"
    371                                   "$Numbers = [:N:];\n"
    372                                   "$Letters+{1};\n"
    373                                   "$Numbers+{2};\n"
    374                                   "Help\\ {4}/me\\!;\n"
    375                                   "[^$Letters $Numbers];\n"
    376                                   "!.*;\n", -1, US_INV);
    377      UnicodeString testString1  = "abc123..abc Help me Help me!";
    378                                 // 01234567890123456789012345678
    379      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    380      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    381 
    382      UErrorCode status=U_ZERO_ERROR;
    383      UParseError    parseError;
    384 
    385      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    386      if(U_FAILURE(status)) {
    387          dataerrln("FAIL : in construction - %s", u_errorName(status));
    388      } else {
    389          int32_t  pos;
    390          int32_t  i = 0;
    391          bi->setText(testString1);
    392          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    393              if (pos != bounds1[i]) {
    394                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    395                  break;
    396              }
    397 
    398              int tag = bi->getRuleStatus();
    399              if (tag != brkStatus[i]) {
    400                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    401                  break;
    402              }
    403              i++;
    404          }
    405      }
    406      delete bi;
    407 }
    408 
    409 
    410 static void printStringBreaks(UnicodeString ustr, int expected[],
    411                               int expectedcount)
    412 {
    413     UErrorCode status = U_ZERO_ERROR;
    414     char name[100];
    415     printf("code    alpha extend alphanum type word sent line name\n");
    416     int j;
    417     for (j = 0; j < ustr.length(); j ++) {
    418         if (expectedcount > 0) {
    419             int k;
    420             for (k = 0; k < expectedcount; k ++) {
    421                 if (j == expected[k]) {
    422                     printf("------------------------------------------------ %d\n",
    423                            j);
    424                 }
    425             }
    426         }
    427         UChar32 c = ustr.char32At(j);
    428         if (c > 0xffff) {
    429             j ++;
    430         }
    431         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    432         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    433                            u_isUAlphabetic(c),
    434                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    435                            u_isalnum(c),
    436                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    437                                                   u_charType(c),
    438                                                   U_SHORT_PROPERTY_NAME),
    439                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    440                                                   u_getIntPropertyValue(c,
    441                                                           UCHAR_WORD_BREAK),
    442                                                   U_SHORT_PROPERTY_NAME),
    443                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    444                                    u_getIntPropertyValue(c,
    445                                            UCHAR_SENTENCE_BREAK),
    446                                    U_SHORT_PROPERTY_NAME),
    447                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    448                                    u_getIntPropertyValue(c,
    449                                            UCHAR_LINE_BREAK),
    450                                    U_SHORT_PROPERTY_NAME),
    451                            name);
    452     }
    453 }
    454 
    455 void RBBITest::TestThaiLineBreak() {
    456     UErrorCode status = U_ZERO_ERROR;
    457     BITestData thaiLineSelection(status);
    458 
    459     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    460     // represents elided letters at the end of a long word.  It should be bound to
    461     // the end of the word and not treated as an independent punctuation mark.
    462 
    463 
    464     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    465     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    466     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    467     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    468     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    469 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    470 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    471     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    472     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    473     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    474     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    475     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    476     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    477     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    478     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    479 
    480     // the one time where the paiyannoi occurs somewhere other than at the end
    481     // of a word is in the Thai abbrevation for "etc.", which both begins and
    482     // ends with a paiyannoi
    483     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    484     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    485     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    486 
    487     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    488         Locale("th"), status);
    489     if (U_FAILURE(status))
    490     {
    491         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    492         return;
    493     }
    494 
    495     generalIteratorTest(*e, thaiLineSelection);
    496     delete e;
    497 }
    498 
    499 
    500 
    501 void RBBITest::TestMixedThaiLineBreak()
    502 {
    503     UErrorCode   status = U_ZERO_ERROR;
    504     BITestData   thaiLineSelection(status);
    505 
    506     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    507 
    508 
    509     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    510     // start
    511 
    512     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    513     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    514     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    515     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    516     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    517     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    518     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    519     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    520     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    521     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    522     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    523     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    524     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    525     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    526     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    527     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    528 
    529     // @suwit - end of changes
    530 
    531 
    532     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    533     if (U_FAILURE(status))
    534     {
    535         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    536         return;
    537     }
    538 
    539 
    540     generalIteratorTest(*e, thaiLineSelection);
    541     delete e;
    542 }
    543 
    544 
    545 void RBBITest::TestMaiyamok()
    546 {
    547     UErrorCode status = U_ZERO_ERROR;
    548     BITestData   thaiLineSelection(status);
    549     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    550     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    551     // word".  Instead of appearing as a word unto itself, however, it's kept together
    552     // with the word before it
    553     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    554     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    555     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    556     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    557     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    558     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    559     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    560     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    561     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    562 
    563     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    564         Locale("th"), status);
    565 
    566     if (U_FAILURE(status))
    567     {
    568         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    569         return;
    570     }
    571     generalIteratorTest(*e, thaiLineSelection);
    572     delete e;
    573 }
    574 
    575 
    576 
    577 void RBBITest::TestBug3818() {
    578     UErrorCode  status = U_ZERO_ERROR;
    579 
    580     // Four Thai words...
    581     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    582                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    583     UnicodeString  thaiStr(thaiWordData);
    584 
    585     RuleBasedBreakIterator* bi =
    586         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    587     if (U_FAILURE(status) || bi == NULL) {
    588         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    589         return;
    590     }
    591     bi->setText(thaiStr);
    592 
    593     int32_t  startOfSecondWord = bi->following(1);
    594     if (startOfSecondWord != 4) {
    595         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    596             __FILE__, __LINE__, startOfSecondWord);
    597     }
    598     startOfSecondWord = bi->following(0);
    599     if (startOfSecondWord != 4) {
    600         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    601             __FILE__, __LINE__, startOfSecondWord);
    602     }
    603     delete bi;
    604 }
    605 
    606 
    607 void RBBITest::TestJapaneseWordBreak() {
    608     UErrorCode status = U_ZERO_ERROR;
    609     BITestData   japaneseWordSelection(status);
    610 
    611     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    612     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    613     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    614     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    615     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    616     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    617     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
    618 
    619     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
    620         Locale("ja"), status);
    621     if (U_FAILURE(status))
    622     {
    623         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
    624         return;
    625     }
    626 
    627     generalIteratorTest(*e, japaneseWordSelection);
    628     delete e;
    629 }
    630 
    631 void RBBITest::TestTrieDict() {
    632     UErrorCode      status  = U_ZERO_ERROR;
    633 
    634     //
    635     //  Open and read the test data file.
    636     //
    637     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    638     char testFileName[1000];
    639     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    640         errln("Can't open test data.  Path too long.");
    641         return;
    642     }
    643     strcpy(testFileName, testDataDirectory);
    644     strcat(testFileName, "riwords.txt");
    645 
    646     // Items needing deleting at the end
    647     MutableTrieDictionary *mutableDict = NULL;
    648     CompactTrieDictionary *compactDict = NULL;
    649     UnicodeSet            *breaks      = NULL;
    650     UChar                 *testFile    = NULL;
    651     StringEnumeration     *enumer1     = NULL;
    652     StringEnumeration     *enumer2     = NULL;
    653     MutableTrieDictionary *mutable2    = NULL;
    654     StringEnumeration     *cloneEnum   = NULL;
    655     CompactTrieDictionary *compact2    = NULL;
    656 
    657 
    658     const UnicodeString *originalWord = NULL;
    659     const UnicodeString *cloneWord    = NULL;
    660     UChar *current;
    661     UChar *word;
    662     UChar uc;
    663     int32_t wordLen;
    664     int32_t wordCount;
    665     int32_t testCount;
    666 
    667     int    len;
    668     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    669     if (U_FAILURE(status)) {
    670         goto cleanup; /* something went wrong, error already output */
    671     }
    672 
    673     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    674     if (U_FAILURE(status)) {
    675         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    676         goto cleanup;
    677     }
    678 
    679     breaks = new UnicodeSet;
    680     breaks->add(0x000A);     // Line Feed
    681     breaks->add(0x000D);     // Carriage Return
    682     breaks->add(0x2028);     // Line Separator
    683     breaks->add(0x2029);     // Paragraph Separator
    684 
    685     // Now add each non-comment line of the file as a word.
    686     current = testFile;
    687     word = current;
    688     uc = *current++;
    689     wordLen = 0;
    690     wordCount = 0;
    691 
    692     while (uc) {
    693         if (uc == 0x0023) {     // #comment line, skip
    694             while (uc && !breaks->contains(uc)) {
    695                 uc = *current++;
    696             }
    697         }
    698         else while (uc && !breaks->contains(uc)) {
    699             ++wordLen;
    700             uc = *current++;
    701         }
    702         if (wordLen > 0) {
    703             mutableDict->addWord(word, wordLen, status);
    704             if (U_FAILURE(status)) {
    705                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    706                 goto cleanup;
    707             }
    708             wordCount += 1;
    709         }
    710 
    711         // Find beginning of next line
    712         while (uc && breaks->contains(uc)) {
    713             uc = *current++;
    714         }
    715         word = current-1;
    716         wordLen = 0;
    717     }
    718 
    719     if (wordCount < 50) {
    720         errln("Word count (%d) unreasonably small\n", wordCount);
    721         goto cleanup;
    722     }
    723 
    724     enumer1 = mutableDict->openWords(status);
    725     if (U_FAILURE(status)) {
    726         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    727         goto cleanup;
    728     }
    729 
    730     testCount = 0;
    731     if (wordCount != (testCount = enumer1->count(status))) {
    732         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    733             testCount, wordCount, u_errorName(status));
    734         goto cleanup;
    735     }
    736 
    737     // Now compact it
    738     compactDict = new CompactTrieDictionary(*mutableDict, status);
    739     if (U_FAILURE(status)) {
    740         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    741         goto cleanup;
    742     }
    743 
    744     enumer2 = compactDict->openWords(status);
    745     if (U_FAILURE(status)) {
    746         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    747         goto cleanup;
    748     }
    749 
    750     if (wordCount != (testCount = enumer2->count(status))) {
    751         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    752             testCount, wordCount, u_errorName(status));
    753         goto cleanup;
    754     }
    755 
    756     if (typeid(*enumer1) == typeid(*enumer2)) {
    757         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
    758     }
    759     delete enumer1;
    760     enumer1 = NULL;
    761     delete enumer2;
    762     enumer2 = NULL;
    763 
    764     // Now un-compact it
    765     mutable2 = compactDict->cloneMutable(status);
    766     if (U_FAILURE(status)) {
    767         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    768         goto cleanup;
    769     }
    770 
    771     cloneEnum = mutable2->openWords(status);
    772     if (U_FAILURE(status)) {
    773         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    774         goto cleanup;
    775     }
    776 
    777     if (wordCount != (testCount = cloneEnum->count(status))) {
    778         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    779             testCount, wordCount, u_errorName(status));
    780         goto cleanup;
    781     }
    782 
    783     // Compact original dictionary to clone. Note that we can only compare the same kind of
    784     // dictionary as the order of the enumerators is not guaranteed to be the same between
    785     // different kinds
    786     enumer1 = mutableDict->openWords(status);
    787     if (U_FAILURE(status)) {
    788         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    789         goto cleanup;
    790      }
    791 
    792     originalWord = enumer1->snext(status);
    793     cloneWord = cloneEnum->snext(status);
    794     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    795         if (*originalWord != *cloneWord) {
    796             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    797             goto cleanup;
    798         }
    799         originalWord = enumer1->snext(status);
    800         cloneWord = cloneEnum->snext(status);
    801     }
    802 
    803     if (U_FAILURE(status)) {
    804         errln("Enumeration failed: %s\n", u_errorName(status));
    805         goto cleanup;
    806     }
    807 
    808     if (originalWord != cloneWord) {
    809         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    810         goto cleanup;
    811     }
    812 
    813     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    814     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    815     if (U_FAILURE(status)) {
    816         errln("CompactTrieDictionary(const void *,...) failed\n");
    817         goto cleanup;
    818     }
    819 
    820     if (compact2->dataSize() == 0) {
    821         errln("CompactTrieDictionary->dataSize() == 0\n");
    822         goto cleanup;
    823     }
    824 
    825     // Now count the words via the second dictionary
    826     delete enumer1;
    827     enumer1 = compact2->openWords(status);
    828     if (U_FAILURE(status)) {
    829         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    830         goto cleanup;
    831     }
    832 
    833     if (wordCount != (testCount = enumer1->count(status))) {
    834         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    835             testCount, wordCount, u_errorName(status));
    836         goto cleanup;
    837     }
    838 
    839 cleanup:
    840     delete compactDict;
    841     delete mutableDict;
    842     delete breaks;
    843     delete[] testFile;
    844     delete enumer1;
    845     delete mutable2;
    846     delete cloneEnum;
    847     delete compact2;
    848 }
    849 
    850 
    851 //----------------------------------------------------------------------------
    852 //
    853 // generalIteratorTest      Given a break iterator and a set of test data,
    854 //                          Run the tests and report the results.
    855 //
    856 //----------------------------------------------------------------------------
    857 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    858 {
    859 
    860     bi.setText(td.fDataToBreak);
    861 
    862     testFirstAndNext(bi, td);
    863 
    864     testLastAndPrevious(bi, td);
    865 
    866     testFollowing(bi, td);
    867     testPreceding(bi, td);
    868     testIsBoundary(bi, td);
    869     doMultipleSelectionTest(bi, td);
    870 }
    871 
    872 
    873 //
    874 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    875 //                       kind of loop.
    876 //
    877 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    878 {
    879     UErrorCode  status = U_ZERO_ERROR;
    880     int32_t     p;
    881     int32_t     lastP = -1;
    882     int32_t     tag;
    883 
    884     logln("Test first and next");
    885     bi.setText(td.fDataToBreak);
    886     td.clearResults();
    887 
    888     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    889         td.fActualBreakPositions.addElement(p, status);  // Save result.
    890         tag = bi.getRuleStatus();
    891         td.fActualTags.addElement(tag, status);
    892         if (p <= lastP) {
    893             // If the iterator is not making forward progress, stop.
    894             //  No need to raise an error here, it'll be detected in the normal check of results.
    895             break;
    896         }
    897         lastP = p;
    898     }
    899     td.checkResults("testFirstAndNext", this);
    900 }
    901 
    902 
    903 //
    904 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    905 //
    906 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    907 {
    908     UErrorCode  status = U_ZERO_ERROR;
    909     int32_t     p;
    910     int32_t     lastP  = 0x7ffffffe;
    911     int32_t     tag;
    912 
    913     logln("Test last and previous");
    914     bi.setText(td.fDataToBreak);
    915     td.clearResults();
    916 
    917     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    918         // Save break position.  Insert it at start of vector of results, shoving
    919         //    already-saved results further towards the end.
    920         td.fActualBreakPositions.insertElementAt(p, 0, status);
    921         // bi.previous();   // TODO:  Why does this fix things up????
    922         // bi.next();
    923         tag = bi.getRuleStatus();
    924         td.fActualTags.insertElementAt(tag, 0, status);
    925         if (p >= lastP) {
    926             // If the iterator is not making progress, stop.
    927             //  No need to raise an error here, it'll be detected in the normal check of results.
    928             break;
    929         }
    930         lastP = p;
    931     }
    932     td.checkResults("testLastAndPrevious", this);
    933 }
    934 
    935 
    936 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    937 {
    938     UErrorCode  status = U_ZERO_ERROR;
    939     int32_t     p;
    940     int32_t     tag;
    941     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    942                                  //   cannot be -1; that is returned for DONE.
    943     int         i;
    944 
    945     logln("testFollowing():");
    946     bi.setText(td.fDataToBreak);
    947     td.clearResults();
    948 
    949     // Save the starting point, since we won't get that out of following.
    950     p = bi.first();
    951     td.fActualBreakPositions.addElement(p, status);  // Save result.
    952     tag = bi.getRuleStatus();
    953     td.fActualTags.addElement(tag, status);
    954 
    955     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    956         p = bi.following(i);
    957         if (p != lastP) {
    958             if (p == RuleBasedBreakIterator::DONE) {
    959                 break;
    960             }
    961             // We've reached a new break position.  Save it.
    962             td.fActualBreakPositions.addElement(p, status);  // Save result.
    963             tag = bi.getRuleStatus();
    964             td.fActualTags.addElement(tag, status);
    965             lastP = p;
    966         }
    967     }
    968     // The loop normally exits by means of the break in the middle.
    969     // Make sure that the index was at the correct position for the break iterator to have
    970     //   returned DONE.
    971     if (i != td.fDataToBreak.length()) {
    972         errln("testFollowing():  iterator returned DONE prematurely.");
    973     }
    974 
    975     // Full check of all results.
    976     td.checkResults("testFollowing", this);
    977 }
    978 
    979 
    980 
    981 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    982     UErrorCode  status = U_ZERO_ERROR;
    983     int32_t     p;
    984     int32_t     tag;
    985     int32_t     lastP  = 0x7ffffffe;
    986     int         i;
    987 
    988     logln("testPreceding():");
    989     bi.setText(td.fDataToBreak);
    990     td.clearResults();
    991 
    992     p = bi.last();
    993     td.fActualBreakPositions.addElement(p, status);
    994     tag = bi.getRuleStatus();
    995     td.fActualTags.addElement(tag, status);
    996 
    997     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    998         p = bi.preceding(i);
    999         if (p != lastP) {
   1000             if (p == RuleBasedBreakIterator::DONE) {
   1001                 break;
   1002             }
   1003             // We've reached a new break position.  Save it.
   1004             td.fActualBreakPositions.insertElementAt(p, 0, status);
   1005             lastP = p;
   1006             tag = bi.getRuleStatus();
   1007             td.fActualTags.insertElementAt(tag, 0, status);
   1008         }
   1009     }
   1010     // The loop normally exits by means of the break in the middle.
   1011     // Make sure that the index was at the correct position for the break iterator to have
   1012     //   returned DONE.
   1013     if (i != 0) {
   1014         errln("testPreceding():  iterator returned DONE prematurely.");
   1015     }
   1016 
   1017     // Full check of all results.
   1018     td.checkResults("testPreceding", this);
   1019 }
   1020 
   1021 
   1022 
   1023 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1024     UErrorCode  status = U_ZERO_ERROR;
   1025     int         i;
   1026     int32_t     tag;
   1027 
   1028     logln("testIsBoundary():");
   1029     bi.setText(td.fDataToBreak);
   1030     td.clearResults();
   1031 
   1032     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1033         if (bi.isBoundary(i)) {
   1034             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1035             tag = bi.getRuleStatus();
   1036             td.fActualTags.addElement(tag, status);
   1037         }
   1038     }
   1039     td.checkResults("testIsBoundary: ", this);
   1040 }
   1041 
   1042 
   1043 
   1044 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1045 {
   1046     iterator.setText(td.fDataToBreak);
   1047 
   1048     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1049     int32_t offset = iterator.first();
   1050     int32_t testOffset;
   1051     int32_t count = 0;
   1052 
   1053     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1054 
   1055     if (*testIterator != iterator)
   1056         errln("clone() or operator!= failed: two clones compared unequal");
   1057 
   1058     do {
   1059         testOffset = testIterator->first();
   1060         testOffset = testIterator->next(count);
   1061         if (offset != testOffset)
   1062             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1063 
   1064         if (offset != RuleBasedBreakIterator::DONE) {
   1065             count++;
   1066             offset = iterator.next();
   1067 
   1068             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1069                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1070                 if (count > 10000 || offset == -1) {
   1071                     errln("operator== failed too many times. Stopping test.");
   1072                     if (offset == -1) {
   1073                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1074                     }
   1075                     return;
   1076                 }
   1077             }
   1078         }
   1079     } while (offset != RuleBasedBreakIterator::DONE);
   1080 
   1081     // now do it backwards...
   1082     offset = iterator.last();
   1083     count = 0;
   1084 
   1085     do {
   1086         testOffset = testIterator->last();
   1087         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1088         if (offset != testOffset)
   1089             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1090 
   1091         if (offset != RuleBasedBreakIterator::DONE) {
   1092             count--;
   1093             offset = iterator.previous();
   1094         }
   1095     } while (offset != RuleBasedBreakIterator::DONE);
   1096 
   1097     delete testIterator;
   1098 }
   1099 
   1100 
   1101 //---------------------------------------------
   1102 //
   1103 //     other tests
   1104 //
   1105 //---------------------------------------------
   1106 void RBBITest::TestEmptyString()
   1107 {
   1108     UnicodeString text = "";
   1109     UErrorCode status = U_ZERO_ERROR;
   1110 
   1111     BITestData x(status);
   1112     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1113     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1114     if (U_FAILURE(status))
   1115     {
   1116         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1117         return;
   1118     }
   1119     generalIteratorTest(*bi, x);
   1120     delete bi;
   1121 }
   1122 
   1123 void RBBITest::TestGetAvailableLocales()
   1124 {
   1125     int32_t locCount = 0;
   1126     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1127 
   1128     if (locCount == 0)
   1129         dataerrln("getAvailableLocales() returned an empty list!");
   1130     // Just make sure that it's returning good memory.
   1131     int32_t i;
   1132     for (i = 0; i < locCount; ++i) {
   1133         logln(locList[i].getName());
   1134     }
   1135 }
   1136 
   1137 //Testing the BreakIterator::getDisplayName() function
   1138 void RBBITest::TestGetDisplayName()
   1139 {
   1140     UnicodeString   result;
   1141 
   1142     BreakIterator::getDisplayName(Locale::getUS(), result);
   1143     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1144         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1145                 + result);
   1146 
   1147     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1148     if (result != "French (France)")
   1149         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1150                 + result);
   1151 }
   1152 /**
   1153  * Test End Behaviour
   1154  * @bug 4068137
   1155  */
   1156 void RBBITest::TestEndBehaviour()
   1157 {
   1158     UErrorCode status = U_ZERO_ERROR;
   1159     UnicodeString testString("boo.");
   1160     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1161     if (U_FAILURE(status))
   1162     {
   1163         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1164         return;
   1165     }
   1166     wb->setText(testString);
   1167 
   1168     if (wb->first() != 0)
   1169         errln("Didn't get break at beginning of string.");
   1170     if (wb->next() != 3)
   1171         errln("Didn't get break before period in \"boo.\"");
   1172     if (wb->current() != 4 && wb->next() != 4)
   1173         errln("Didn't get break at end of string.");
   1174     delete wb;
   1175 }
   1176 /*
   1177  * @bug 4153072
   1178  */
   1179 void RBBITest::TestBug4153072() {
   1180     UErrorCode status = U_ZERO_ERROR;
   1181     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1182     if (U_FAILURE(status))
   1183     {
   1184         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1185         return;
   1186     }
   1187     UnicodeString str("...Hello, World!...");
   1188     int32_t begin = 3;
   1189     int32_t end = str.length() - 3;
   1190     UBool onBoundary;
   1191 
   1192     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1193     iter->adoptText(textIterator);
   1194     int index;
   1195     // Note: with the switch to UText, there is no way to restrict the
   1196     //       iteration range to begin at an index other than zero.
   1197     //       String character iterators created with a non-zero bound are
   1198     //         treated by RBBI as being empty.
   1199     for (index = -1; index < begin + 1; ++index) {
   1200         onBoundary = iter->isBoundary(index);
   1201         if (index == 0?  !onBoundary : onBoundary) {
   1202             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1203                             " and begin index = " + begin);
   1204         }
   1205     }
   1206     delete iter;
   1207 }
   1208 
   1209 
   1210 //
   1211 // Test for problem reported by Ashok Matoria on 9 July 2007
   1212 //    One.<kSoftHyphen><kSpace>Two.
   1213 //
   1214 //    Sentence break at start (0) and then on calling next() it breaks at
   1215 //   'T' of "Two". Now, at this point if I do next() and
   1216 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1217 //
   1218 void RBBITest::TestBug5775() {
   1219     UErrorCode status = U_ZERO_ERROR;
   1220     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1221     TEST_ASSERT_SUCCESS(status);
   1222     if (U_FAILURE(status)) {
   1223         return;
   1224     }
   1225 // Check for status first for better handling of no data errors.
   1226     TEST_ASSERT(bi != NULL);
   1227     if (bi == NULL) {
   1228         return;
   1229     }
   1230 
   1231     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1232     //               01234      56789
   1233     s = s.unescape();
   1234     bi->setText(s);
   1235     int pos = bi->next();
   1236     TEST_ASSERT(pos == 6);
   1237     pos = bi->next();
   1238     TEST_ASSERT(pos == 10);
   1239     pos = bi->previous();
   1240     TEST_ASSERT(pos == 6);
   1241     delete bi;
   1242 }
   1243 
   1244 
   1245 
   1246 /**
   1247  * Test Japanese Line Break
   1248  * @bug 4095322
   1249  */
   1250 void RBBITest::TestJapaneseLineBreak()
   1251 {
   1252 #if 0
   1253     // Test needs updating some more...   Dump it for now.
   1254 
   1255 
   1256     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
   1257     //        as opening and closing punctuation for line breaking.
   1258     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
   1259     //        from these tests.    6-13-2002
   1260     //
   1261     UErrorCode status = U_ZERO_ERROR;
   1262     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
   1263     UnicodeString precedingChars = CharsToUnicodeString(
   1264         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
   1265         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
   1266     UnicodeString followingChars = CharsToUnicodeString(
   1267         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
   1268         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
   1269         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
   1270         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
   1271         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
   1272     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
   1273 
   1274     int32_t i;
   1275     if (U_FAILURE(status))
   1276     {
   1277         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
   1278         return;
   1279     }
   1280 
   1281     for (i = 0; i < precedingChars.length(); i++) {
   1282         testString.setCharAt(1, precedingChars[i]);
   1283         iter->setText(testString);
   1284         int32_t j = iter->first();
   1285         if (j != 0)
   1286             errln("ja line break failure: failed to start at 0");
   1287         j = iter->next();
   1288         if (j != 1)
   1289             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
   1290                         + "' (" + ((int)(precedingChars[i])) + ")");
   1291         j = iter->next();
   1292         if (j != 3)
   1293             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
   1294                         + "' (" + ((int)(precedingChars[i])) + ")");
   1295     }
   1296 
   1297     for (i = 0; i < followingChars.length(); i++) {
   1298         testString.setCharAt(1, followingChars[i]);
   1299         iter->setText(testString);
   1300         int j = iter->first();
   1301         if (j != 0)
   1302             errln("ja line break failure: failed to start at 0");
   1303         j = iter->next();
   1304         if (j != 2)
   1305             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
   1306                         + "' (" + ((int)(followingChars[i])) + ")");
   1307         j = iter->next();
   1308         if (j != 3)
   1309             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
   1310                         + "' (" + ((int)(followingChars[i])) + ")");
   1311     }
   1312     delete iter;
   1313 #endif
   1314 }
   1315 
   1316 
   1317 //------------------------------------------------------------------------------
   1318 //
   1319 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1320 //
   1321 //------------------------------------------------------------------------------
   1322 
   1323 struct TestParams {
   1324     BreakIterator   *bi;
   1325     UnicodeString    dataToBreak;
   1326     UVector32       *expectedBreaks;
   1327     UVector32       *srcLine;
   1328     UVector32       *srcCol;
   1329 };
   1330 
   1331 void RBBITest::executeTest(TestParams *t) {
   1332     int32_t    bp;
   1333     int32_t    prevBP;
   1334     int32_t    i;
   1335 
   1336     if (t->bi == NULL) {
   1337         return;
   1338     }
   1339 
   1340     t->bi->setText(t->dataToBreak);
   1341     //
   1342     //  Run the iterator forward
   1343     //
   1344     prevBP = -1;
   1345     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1346         if (prevBP ==  bp) {
   1347             // Fail for lack of forward progress.
   1348             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1349                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1350             break;
   1351         }
   1352 
   1353         // Check that there were we didn't miss an expected break between the last one
   1354         //  and this one.
   1355         for (i=prevBP+1; i<bp; i++) {
   1356             if (t->expectedBreaks->elementAti(i) != 0) {
   1357                 int expected[] = {0, i};
   1358                 printStringBreaks(t->dataToBreak, expected, 2);
   1359                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1360                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1361             }
   1362         }
   1363 
   1364         // Check that the break we did find was expected
   1365         if (t->expectedBreaks->elementAti(bp) == 0) {
   1366             int expected[] = {0, bp};
   1367             printStringBreaks(t->dataToBreak, expected, 2);
   1368             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1369                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1370         } else {
   1371             // The break was expected.
   1372             //   Check that the {nnn} tag value is correct.
   1373             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1374             if (expectedTagVal == -1) {
   1375                 expectedTagVal = 0;
   1376             }
   1377             int32_t line = t->srcLine->elementAti(bp);
   1378             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1379             if (rs != expectedTagVal) {
   1380                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1381                       "          Actual, Expected status = %4d, %4d",
   1382                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1383             }
   1384         }
   1385 
   1386 
   1387         prevBP = bp;
   1388     }
   1389 
   1390     // Verify that there were no missed expected breaks after the last one found
   1391     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1392         if (t->expectedBreaks->elementAti(i) != 0) {
   1393             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1394                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1395         }
   1396     }
   1397 
   1398     //
   1399     //  Run the iterator backwards, verify that the same breaks are found.
   1400     //
   1401     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1402     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1403         if (prevBP ==  bp) {
   1404             // Fail for lack of progress.
   1405             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1406                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1407             break;
   1408         }
   1409 
   1410         // Check that there were we didn't miss an expected break between the last one
   1411         //  and this one.  (UVector returns zeros for index out of bounds.)
   1412         for (i=prevBP-1; i>bp; i--) {
   1413             if (t->expectedBreaks->elementAti(i) != 0) {
   1414                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1415                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1416             }
   1417         }
   1418 
   1419         // Check that the break we did find was expected
   1420         if (t->expectedBreaks->elementAti(bp) == 0) {
   1421             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1422                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1423         } else {
   1424             // The break was expected.
   1425             //   Check that the {nnn} tag value is correct.
   1426             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1427             if (expectedTagVal == -1) {
   1428                 expectedTagVal = 0;
   1429             }
   1430             int line = t->srcLine->elementAti(bp);
   1431             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1432             if (rs != expectedTagVal) {
   1433                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1434                       "          Actual, Expected status = %4d, %4d",
   1435                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1436             }
   1437         }
   1438 
   1439         prevBP = bp;
   1440     }
   1441 
   1442     // Verify that there were no missed breaks prior to the last one found
   1443     for (i=prevBP-1; i>=0; i--) {
   1444         if (t->expectedBreaks->elementAti(i) != 0) {
   1445             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1446                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1447         }
   1448     }
   1449 }
   1450 
   1451 
   1452 void RBBITest::TestExtended() {
   1453 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1454     UErrorCode      status  = U_ZERO_ERROR;
   1455     Locale          locale("");
   1456 
   1457     UnicodeString       rules;
   1458     TestParams          tp;
   1459     tp.bi             = NULL;
   1460     tp.expectedBreaks = new UVector32(status);
   1461     tp.srcLine        = new UVector32(status);
   1462     tp.srcCol         = new UVector32(status);
   1463 
   1464     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1465     if (U_FAILURE(status)) {
   1466         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1467     }
   1468 
   1469 
   1470     //
   1471     //  Open and read the test data file.
   1472     //
   1473     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1474     char testFileName[1000];
   1475     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1476         errln("Can't open test data.  Path too long.");
   1477         return;
   1478     }
   1479     strcpy(testFileName, testDataDirectory);
   1480     strcat(testFileName, "rbbitst.txt");
   1481 
   1482     int    len;
   1483     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1484     if (U_FAILURE(status)) {
   1485         return; /* something went wrong, error already output */
   1486     }
   1487 
   1488 
   1489 
   1490 
   1491     //
   1492     //  Put the test data into a UnicodeString
   1493     //
   1494     UnicodeString testString(FALSE, testFile, len);
   1495 
   1496     enum EParseState{
   1497         PARSE_COMMENT,
   1498         PARSE_TAG,
   1499         PARSE_DATA,
   1500         PARSE_NUM
   1501     }
   1502     parseState = PARSE_TAG;
   1503 
   1504     EParseState savedState = PARSE_TAG;
   1505 
   1506     static const UChar CH_LF        = 0x0a;
   1507     static const UChar CH_CR        = 0x0d;
   1508     static const UChar CH_HASH      = 0x23;
   1509     /*static const UChar CH_PERIOD    = 0x2e;*/
   1510     static const UChar CH_LT        = 0x3c;
   1511     static const UChar CH_GT        = 0x3e;
   1512     static const UChar CH_BACKSLASH = 0x5c;
   1513     static const UChar CH_BULLET    = 0x2022;
   1514 
   1515     int32_t    lineNum  = 1;
   1516     int32_t    colStart = 0;
   1517     int32_t    column   = 0;
   1518     int32_t    charIdx  = 0;
   1519 
   1520     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1521 
   1522     for (charIdx = 0; charIdx < len; ) {
   1523         status = U_ZERO_ERROR;
   1524         UChar  c = testString.charAt(charIdx);
   1525         charIdx++;
   1526         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1527             // treat CRLF as a unit
   1528             c = CH_LF;
   1529             charIdx++;
   1530         }
   1531         if (c == CH_LF || c == CH_CR) {
   1532             lineNum++;
   1533             colStart = charIdx;
   1534         }
   1535         column = charIdx - colStart + 1;
   1536 
   1537         switch (parseState) {
   1538         case PARSE_COMMENT:
   1539             if (c == 0x0a || c == 0x0d) {
   1540                 parseState = savedState;
   1541             }
   1542             break;
   1543 
   1544         case PARSE_TAG:
   1545             {
   1546             if (c == CH_HASH) {
   1547                 parseState = PARSE_COMMENT;
   1548                 savedState = PARSE_TAG;
   1549                 break;
   1550             }
   1551             if (u_isUWhiteSpace(c)) {
   1552                 break;
   1553             }
   1554             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1555                 delete tp.bi;
   1556                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1557                 charIdx += 5;
   1558                 break;
   1559             }
   1560             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1561                 delete tp.bi;
   1562                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1563                 charIdx += 5;
   1564                 break;
   1565             }
   1566             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1567                 delete tp.bi;
   1568                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1569                 charIdx += 5;
   1570                 break;
   1571             }
   1572             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1573                 delete tp.bi;
   1574                 tp.bi = NULL;
   1575                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1576                 charIdx += 5;
   1577                 break;
   1578             }
   1579             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1580                 delete tp.bi;
   1581                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1582                 charIdx += 6;
   1583                 break;
   1584             }
   1585 
   1586             // <locale  loc_name>
   1587             localeMatcher.reset(testString);
   1588             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1589                 UnicodeString localeName = localeMatcher.group(1, status);
   1590                 char localeName8[100];
   1591                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1592                 locale = Locale::createFromName(localeName8);
   1593                 charIdx += localeMatcher.group(0, status).length();
   1594                 TEST_ASSERT_SUCCESS(status);
   1595                 break;
   1596             }
   1597             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1598                 parseState = PARSE_DATA;
   1599                 charIdx += 5;
   1600                 tp.dataToBreak = "";
   1601                 tp.expectedBreaks->removeAllElements();
   1602                 tp.srcCol ->removeAllElements();
   1603                 tp.srcLine->removeAllElements();
   1604                 break;
   1605             }
   1606 
   1607             errln("line %d: Tag expected in test file.", lineNum);
   1608             parseState = PARSE_COMMENT;
   1609             savedState = PARSE_DATA;
   1610             goto end_test; // Stop the test.
   1611             }
   1612             break;
   1613 
   1614         case PARSE_DATA:
   1615             if (c == CH_BULLET) {
   1616                 int32_t  breakIdx = tp.dataToBreak.length();
   1617                 tp.expectedBreaks->setSize(breakIdx+1);
   1618                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1619                 tp.srcLine->setSize(breakIdx+1);
   1620                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1621                 tp.srcCol ->setSize(breakIdx+1);
   1622                 tp.srcCol ->setElementAt(column, breakIdx);
   1623                 break;
   1624             }
   1625 
   1626             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1627                 // Add final entry to mappings from break location to source file position.
   1628                 //  Need one extra because last break position returned is after the
   1629                 //    last char in the data, not at the last char.
   1630                 tp.srcLine->addElement(lineNum, status);
   1631                 tp.srcCol ->addElement(column, status);
   1632 
   1633                 parseState = PARSE_TAG;
   1634                 charIdx += 6;
   1635 
   1636                 // RUN THE TEST!
   1637                 executeTest(&tp);
   1638                 break;
   1639             }
   1640 
   1641             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1642                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1643                 // Get the code point from the name and insert it into the test data.
   1644                 //   (Damn, no API takes names in Unicode  !!!
   1645                 //    we've got to take it back to char *)
   1646                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1647                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1648                 char charNameBuf[200];
   1649                 UChar32 theChar = -1;
   1650                 if (nameEndIdx != -1) {
   1651                     UErrorCode status = U_ZERO_ERROR;
   1652                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1653                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1654                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1655                     if (U_FAILURE(status)) {
   1656                         theChar = -1;
   1657                     }
   1658                 }
   1659                 if (theChar == -1) {
   1660                     errln("Error in named character in test file at line %d, col %d",
   1661                         lineNum, column);
   1662                 } else {
   1663                     // Named code point was recognized.  Insert it
   1664                     //   into the test data.
   1665                     tp.dataToBreak.append(theChar);
   1666                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1667                         tp.srcLine->addElement(lineNum, status);
   1668                         tp.srcCol ->addElement(column, status);
   1669                     }
   1670                 }
   1671                 if (nameEndIdx > charIdx) {
   1672                     charIdx = nameEndIdx+1;
   1673 
   1674                 }
   1675                 break;
   1676             }
   1677 
   1678 
   1679 
   1680 
   1681             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1682                 charIdx++;
   1683                 int32_t  breakIdx = tp.dataToBreak.length();
   1684                 tp.expectedBreaks->setSize(breakIdx+1);
   1685                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1686                 tp.srcLine->setSize(breakIdx+1);
   1687                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1688                 tp.srcCol ->setSize(breakIdx+1);
   1689                 tp.srcCol ->setElementAt(column, breakIdx);
   1690                 break;
   1691             }
   1692 
   1693             if (c == CH_LT) {
   1694                 tagValue   = 0;
   1695                 parseState = PARSE_NUM;
   1696                 break;
   1697             }
   1698 
   1699             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1700                 parseState = PARSE_COMMENT;
   1701                 savedState = PARSE_DATA;
   1702                 break;
   1703             }
   1704 
   1705             if (c == CH_BACKSLASH) {
   1706                 // Check for \ at end of line, a line continuation.
   1707                 //     Advance over (discard) the newline
   1708                 UChar32 cp = testString.char32At(charIdx);
   1709                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1710                     // We have a CR LF
   1711                     //  Need an extra increment of the input ptr to move over both of them
   1712                     charIdx++;
   1713                 }
   1714                 if (cp == CH_LF || cp == CH_CR) {
   1715                     lineNum++;
   1716                     colStart = charIdx;
   1717                     charIdx++;
   1718                     break;
   1719                 }
   1720 
   1721                 // Let unescape handle the back slash.
   1722                 cp = testString.unescapeAt(charIdx);
   1723                 if (cp != -1) {
   1724                     // Escape sequence was recognized.  Insert the char
   1725                     //   into the test data.
   1726                     tp.dataToBreak.append(cp);
   1727                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1728                         tp.srcLine->addElement(lineNum, status);
   1729                         tp.srcCol ->addElement(column, status);
   1730                     }
   1731                     break;
   1732                 }
   1733 
   1734 
   1735                 // Not a recognized backslash escape sequence.
   1736                 // Take the next char as a literal.
   1737                 //  TODO:  Should this be an error?
   1738                 c = testString.charAt(charIdx);
   1739                 charIdx = testString.moveIndex32(charIdx, 1);
   1740             }
   1741 
   1742             // Normal, non-escaped data char.
   1743             tp.dataToBreak.append(c);
   1744 
   1745             // Save the mapping from offset in the data to line/column numbers in
   1746             //   the original input file.  Will be used for better error messages only.
   1747             //   If there's an expected break before this char, the slot in the mapping
   1748             //     vector will already be set for this char; don't overwrite it.
   1749             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1750                 tp.srcLine->addElement(lineNum, status);
   1751                 tp.srcCol ->addElement(column, status);
   1752             }
   1753             break;
   1754 
   1755 
   1756         case PARSE_NUM:
   1757             // We are parsing an expected numeric tag value, like <1234>,
   1758             //   within a chunk of data.
   1759             if (u_isUWhiteSpace(c)) {
   1760                 break;
   1761             }
   1762 
   1763             if (c == CH_GT) {
   1764                 // Finished the number.  Add the info to the expected break data,
   1765                 //   and switch parse state back to doing plain data.
   1766                 parseState = PARSE_DATA;
   1767                 if (tagValue == 0) {
   1768                     tagValue = -1;
   1769                 }
   1770                 int32_t  breakIdx = tp.dataToBreak.length();
   1771                 tp.expectedBreaks->setSize(breakIdx+1);
   1772                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1773                 tp.srcLine->setSize(breakIdx+1);
   1774                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1775                 tp.srcCol ->setSize(breakIdx+1);
   1776                 tp.srcCol ->setElementAt(column, breakIdx);
   1777                 break;
   1778             }
   1779 
   1780             if (u_isdigit(c)) {
   1781                 tagValue = tagValue*10 + u_charDigitValue(c);
   1782                 break;
   1783             }
   1784 
   1785             errln("Syntax Error in test file at line %d, col %d",
   1786                 lineNum, column);
   1787             parseState = PARSE_COMMENT;
   1788             goto end_test; // Stop the test
   1789             break;
   1790         }
   1791 
   1792 
   1793         if (U_FAILURE(status)) {
   1794             dataerrln("ICU Error %s while parsing test file at line %d.",
   1795                 u_errorName(status), lineNum);
   1796             status = U_ZERO_ERROR;
   1797             goto end_test; // Stop the test
   1798         }
   1799 
   1800     }
   1801 
   1802 end_test:
   1803     delete tp.bi;
   1804     delete tp.expectedBreaks;
   1805     delete tp.srcLine;
   1806     delete tp.srcCol;
   1807     delete [] testFile;
   1808 #endif
   1809 }
   1810 
   1811 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
   1812 // Words don't include colon or period (cldrbug #1969).
   1813 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
   1814 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
   1815 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
   1816 
   1817 // UBreakIteratorType UBRK_WORD, Locale "ja"
   1818 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   1819 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   1820                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
   1821 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   1822 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   1823 
   1824 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   1825 // Add break after Greek question mark (cldrbug #2069).
   1826 static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
   1827                                         "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
   1828 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
   1829 static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
   1830 
   1831 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
   1832 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
   1833 static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
   1834                                         "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
   1835                                         "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
   1836 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
   1837                                           12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
   1838                                           29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
   1839 static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
   1840                                           12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
   1841                                           29,     32, 33, 35, 37, 38,     40, 41 };
   1842 
   1843 typedef struct {
   1844     UBreakIteratorType  type;
   1845     const char *        locale;
   1846     const char *        escapedText;
   1847     const int32_t *     tailoredOffsets;
   1848     int32_t             tailoredOffsetsCount;
   1849     const int32_t *     rootOffsets;
   1850     int32_t             rootOffsetsCount;
   1851 } TailoredBreakItem;
   1852 
   1853 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
   1854 
   1855 static const TailoredBreakItem tbItems[] = {
   1856     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
   1857     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
   1858     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
   1859     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
   1860     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
   1861 };
   1862 
   1863 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   1864     while (count-- > 0) {
   1865         int writeCount;
   1866         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   1867         buffer += writeCount;
   1868         buflen -= writeCount;
   1869     }
   1870 }
   1871 
   1872 enum { kMaxOffsetCount = 128 };
   1873 
   1874 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   1875     brkitr->setText( CharsToUnicodeString(escapedText) );
   1876     int32_t foundOffsets[kMaxOffsetCount];
   1877     int32_t offset, foundOffsetsCount = 0;
   1878     // do forwards iteration test
   1879     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   1880         foundOffsets[foundOffsetsCount++] = offset;
   1881     }
   1882     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   1883         // log error for forwards test
   1884         char formatExpect[512], formatFound[512];
   1885         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1886         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   1887         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   1888                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   1889     } else {
   1890         // do backwards iteration test
   1891         --foundOffsetsCount; // back off one from the end offset
   1892         while ( foundOffsetsCount > 0 ) {
   1893             offset = brkitr->previous();
   1894             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   1895                 // log error for backwards test
   1896                 char formatExpect[512];
   1897                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1898                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   1899                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   1900                 break;
   1901             }
   1902         }
   1903     }
   1904 }
   1905 
   1906 void RBBITest::TestTailoredBreaks() {
   1907     const TailoredBreakItem * tbItemPtr;
   1908     Locale rootLocale = Locale("root");
   1909     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   1910         Locale testLocale = Locale(tbItemPtr->locale);
   1911         BreakIterator * tailoredBrkiter = NULL;
   1912         BreakIterator * rootBrkiter = NULL;
   1913         UErrorCode status = U_ZERO_ERROR;
   1914         switch (tbItemPtr->type) {
   1915             case UBRK_CHARACTER:
   1916                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   1917                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   1918                 break;
   1919             case UBRK_WORD:
   1920                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   1921                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   1922                 break;
   1923             case UBRK_LINE:
   1924                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   1925                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   1926                 break;
   1927             case UBRK_SENTENCE:
   1928                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   1929                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   1930                 break;
   1931             default:
   1932                 status = U_UNSUPPORTED_ERROR;
   1933                 break;
   1934         }
   1935         if (U_FAILURE(status)) {
   1936             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   1937             continue;
   1938         }
   1939         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   1940         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   1941 
   1942         delete rootBrkiter;
   1943         delete tailoredBrkiter;
   1944     }
   1945 }
   1946 
   1947 
   1948 //-------------------------------------------------------------------------------
   1949 //
   1950 //  TestDictRules   create a break iterator from source rules that includes a
   1951 //                  dictionary range.   Regression for bug #7130.  Source rules
   1952 //                  do not declare a break iterator type (word, line, sentence, etc.
   1953 //                  but the dictionary code, without a type, would loop.
   1954 //
   1955 //-------------------------------------------------------------------------------
   1956 void RBBITest::TestDictRules() {
   1957     const char *rules =  "$dictionary = [a-z]; \n"
   1958                          "!!forward; \n"
   1959                          "$dictionary $dictionary; \n"
   1960                          "!!reverse; \n"
   1961                          "$dictionary $dictionary; \n";
   1962     const char *text = "aa";
   1963     UErrorCode status = U_ZERO_ERROR;
   1964     UParseError parseError;
   1965 
   1966     RuleBasedBreakIterator bi(rules, parseError, status);
   1967     if (U_SUCCESS(status)) {
   1968         UnicodeString utext = text;
   1969         bi.setText(utext);
   1970         int32_t position;
   1971         int32_t loops;
   1972         for (loops = 0; loops<10; loops++) {
   1973             position = bi.next();
   1974             if (position == RuleBasedBreakIterator::DONE) {
   1975                 break;
   1976             }
   1977         }
   1978         TEST_ASSERT(loops == 1);
   1979     } else {
   1980         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   1981     }
   1982 }
   1983 
   1984 
   1985 
   1986 //-------------------------------------------------------------------------------
   1987 //
   1988 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1989 //    return the datain one big UChar * buffer, which the caller must delete.
   1990 //
   1991 //    parameters:
   1992 //          fileName:   the name of the file, with no directory part.  The test data directory
   1993 //                      is assumed.
   1994 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1995 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1996 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1997 //                      Pass NULL for the system default encoding.
   1998 //          status
   1999 //    returns:
   2000 //                      The file data, converted to UChar.
   2001 //                      The caller must delete this when done with
   2002 //                           delete [] theBuffer;
   2003 //
   2004 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   2005 //           Move this function to some common place.
   2006 //
   2007 //--------------------------------------------------------------------------------
   2008 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2009     UChar       *retPtr  = NULL;
   2010     char        *fileBuf = NULL;
   2011     UConverter* conv     = NULL;
   2012     FILE        *f       = NULL;
   2013 
   2014     ulen = 0;
   2015     if (U_FAILURE(status)) {
   2016         return retPtr;
   2017     }
   2018 
   2019     //
   2020     //  Open the file.
   2021     //
   2022     f = fopen(fileName, "rb");
   2023     if (f == 0) {
   2024         dataerrln("Error opening test data file %s\n", fileName);
   2025         status = U_FILE_ACCESS_ERROR;
   2026         return NULL;
   2027     }
   2028     //
   2029     //  Read it in
   2030     //
   2031     int   fileSize;
   2032     int   amt_read;
   2033 
   2034     fseek( f, 0, SEEK_END);
   2035     fileSize = ftell(f);
   2036     fileBuf = new char[fileSize];
   2037     fseek(f, 0, SEEK_SET);
   2038     amt_read = fread(fileBuf, 1, fileSize, f);
   2039     if (amt_read != fileSize || fileSize <= 0) {
   2040         errln("Error reading test data file.");
   2041         goto cleanUpAndReturn;
   2042     }
   2043 
   2044     //
   2045     // Look for a Unicode Signature (BOM) on the data just read
   2046     //
   2047     int32_t        signatureLength;
   2048     const char *   fileBufC;
   2049     const char*    bomEncoding;
   2050 
   2051     fileBufC = fileBuf;
   2052     bomEncoding = ucnv_detectUnicodeSignature(
   2053         fileBuf, fileSize, &signatureLength, &status);
   2054     if(bomEncoding!=NULL ){
   2055         fileBufC  += signatureLength;
   2056         fileSize  -= signatureLength;
   2057         encoding = bomEncoding;
   2058     }
   2059 
   2060     //
   2061     // Open a converter to take the rule file to UTF-16
   2062     //
   2063     conv = ucnv_open(encoding, &status);
   2064     if (U_FAILURE(status)) {
   2065         goto cleanUpAndReturn;
   2066     }
   2067 
   2068     //
   2069     // Convert the rules to UChar.
   2070     //  Preflight first to determine required buffer size.
   2071     //
   2072     ulen = ucnv_toUChars(conv,
   2073         NULL,           //  dest,
   2074         0,              //  destCapacity,
   2075         fileBufC,
   2076         fileSize,
   2077         &status);
   2078     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2079         // Buffer Overflow is expected from the preflight operation.
   2080         status = U_ZERO_ERROR;
   2081 
   2082         retPtr = new UChar[ulen+1];
   2083         ucnv_toUChars(conv,
   2084             retPtr,       //  dest,
   2085             ulen+1,
   2086             fileBufC,
   2087             fileSize,
   2088             &status);
   2089     }
   2090 
   2091 cleanUpAndReturn:
   2092     fclose(f);
   2093     delete []fileBuf;
   2094     ucnv_close(conv);
   2095     if (U_FAILURE(status)) {
   2096         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2097         delete []retPtr;
   2098         retPtr = 0;
   2099         ulen   = 0;
   2100     };
   2101     return retPtr;
   2102 }
   2103 
   2104 
   2105 
   2106 //--------------------------------------------------------------------------------------------
   2107 //
   2108 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2109 //
   2110 //-------------------------------------------------------------------------------------------
   2111 void RBBITest::TestUnicodeFiles() {
   2112     RuleBasedBreakIterator  *bi;
   2113     UErrorCode               status = U_ZERO_ERROR;
   2114 
   2115     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2116     TEST_ASSERT_SUCCESS(status);
   2117     if (U_SUCCESS(status)) {
   2118         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2119     }
   2120     delete bi;
   2121 
   2122     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
   2123     TEST_ASSERT_SUCCESS(status);
   2124     if (U_SUCCESS(status)) {
   2125         runUnicodeTestData("WordBreakTest.txt", bi);
   2126     }
   2127     delete bi;
   2128 
   2129     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   2130     TEST_ASSERT_SUCCESS(status);
   2131     if (U_SUCCESS(status)) {
   2132         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2133     }
   2134     delete bi;
   2135 
   2136     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
   2137     TEST_ASSERT_SUCCESS(status);
   2138     if (U_SUCCESS(status)) {
   2139         runUnicodeTestData("LineBreakTest.txt", bi);
   2140     }
   2141     delete bi;
   2142 }
   2143 
   2144 
   2145 //--------------------------------------------------------------------------------------------
   2146 //
   2147 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2148 //
   2149 //-------------------------------------------------------------------------------------------
   2150 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2151 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2152 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
   2153   UVersionInfo icu49 = { 4, 9, 0, 0 };
   2154 UBool isICUVersionPast48 = isICUVersionAtLeast(icu49);
   2155 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
   2156     UErrorCode  status = U_ZERO_ERROR;
   2157 
   2158     //
   2159     //  Open and read the test data file, put it into a UnicodeString.
   2160     //
   2161     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2162     char testFileName[1000];
   2163     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2164         dataerrln("Can't open test data.  Path too long.");
   2165         return;
   2166     }
   2167     strcpy(testFileName, testDataDirectory);
   2168     strcat(testFileName, fileName);
   2169 
   2170     logln("Opening data file %s\n", fileName);
   2171 
   2172     int    len;
   2173     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2174     if (status != U_FILE_ACCESS_ERROR) {
   2175         TEST_ASSERT_SUCCESS(status);
   2176         TEST_ASSERT(testFile != NULL);
   2177     }
   2178     if (U_FAILURE(status) || testFile == NULL) {
   2179         return; /* something went wrong, error already output */
   2180     }
   2181     UnicodeString testFileAsString(TRUE, testFile, len);
   2182 
   2183     //
   2184     //  Parse the test data file using a regular expression.
   2185     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2186     //     is identified by which group had a match.
   2187     //
   2188     //    Caputure Group #                  1          2            3            4           5
   2189     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2190     //
   2191     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2192     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2193     UnicodeString   testString;
   2194     UVector32       breakPositions(status);
   2195     int             lineNumber = 1;
   2196     TEST_ASSERT_SUCCESS(status);
   2197     if (U_FAILURE(status)) {
   2198         return;
   2199     }
   2200 
   2201     //
   2202     //  Scan through each test case, building up the string to be broken in testString,
   2203     //   and the positions that should be boundaries in the breakPositions vector.
   2204     //
   2205     int spin = 0;
   2206     while (tokenMatcher.find()) {
   2207       	if(tokenMatcher.hitEnd()) {
   2208           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
   2209              This occurred when the text file was corrupt (wasn't marked as UTF-8)
   2210              and caused an infinite loop here on EBCDIC systems!
   2211           */
   2212           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
   2213           //	   return;
   2214       	}
   2215         if (tokenMatcher.start(1, status) >= 0) {
   2216             // Scanned a divide sign, indicating a break position in the test data.
   2217             if (testString.length()>0) {
   2218                 breakPositions.addElement(testString.length(), status);
   2219             }
   2220         }
   2221         else if (tokenMatcher.start(2, status) >= 0) {
   2222             // Scanned an 'x', meaning no break at this position in the test data
   2223             //   Nothing to be done here.
   2224             }
   2225         else if (tokenMatcher.start(3, status) >= 0) {
   2226             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2227             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2228             int length = hexNumber.length();
   2229             if (length<=8) {
   2230                 char buf[10];
   2231                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2232                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2233                 if (c<=0x10ffff) {
   2234                     testString.append(c);
   2235                 } else {
   2236                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2237                        fileName, lineNumber);
   2238                 }
   2239             } else {
   2240                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2241                        fileName, lineNumber);
   2242              }
   2243         }
   2244         else if (tokenMatcher.start(4, status) >= 0) {
   2245             // Scanned to end of a line, possibly skipping over a comment in the process.
   2246             //   If the line from the file contained test data, run the test now.
   2247             //
   2248             if (testString.length() > 0) {
   2249 // TODO(andy): Remove this time bomb code.
   2250 if (!isLineBreak || isICUVersionPast48 || !(4658 <= lineNumber && lineNumber <= 4758)) {
   2251                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2252 }
   2253             }
   2254 
   2255             // Clear out this test case.
   2256             //    The string and breakPositions vector will be refilled as the next
   2257             //       test case is parsed.
   2258             testString.remove();
   2259             breakPositions.removeAllElements();
   2260             lineNumber++;
   2261         } else {
   2262             // Scanner catchall.  Something unrecognized appeared on the line.
   2263             char token[16];
   2264             UnicodeString uToken = tokenMatcher.group(0, status);
   2265             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2266             token[sizeof(token)-1] = 0;
   2267             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2268 
   2269             // Clean up, in preparation for continuing with the next line.
   2270             testString.remove();
   2271             breakPositions.removeAllElements();
   2272             lineNumber++;
   2273         }
   2274         TEST_ASSERT_SUCCESS(status);
   2275         if (U_FAILURE(status)) {
   2276             break;
   2277         }
   2278     }
   2279 
   2280     delete [] testFile;
   2281  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2282 }
   2283 
   2284 //--------------------------------------------------------------------------------------------
   2285 //
   2286 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2287 //                            test data files.  Do only a simple, forward-only check -
   2288 //                            this test is mostly to check that ICU and the Unicode
   2289 //                            data agree with each other.
   2290 //
   2291 //--------------------------------------------------------------------------------------------
   2292 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2293                          const UnicodeString &testString,   // Text data to be broken
   2294                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2295                          RuleBasedBreakIterator *bi) {
   2296     int32_t pos;                 // Break Position in the test string
   2297     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2298     int32_t expectedPos;         // Expected break position (index into test string)
   2299 
   2300     bi->setText(testString);
   2301     pos = bi->first();
   2302     pos = bi->next();
   2303 
   2304     while (pos != BreakIterator::DONE) {
   2305         if (expectedI >= breakPositions->size()) {
   2306             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2307                 testFileName, lineNumber, pos);
   2308             break;
   2309         }
   2310         expectedPos = breakPositions->elementAti(expectedI);
   2311         if (pos < expectedPos) {
   2312             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2313                 testFileName, lineNumber, pos);
   2314             break;
   2315         }
   2316         if (pos > expectedPos) {
   2317             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2318                 testFileName, lineNumber, expectedPos);
   2319             break;
   2320         }
   2321         pos = bi->next();
   2322         expectedI++;
   2323     }
   2324 
   2325     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2326         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2327             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2328     }
   2329 }
   2330 
   2331 
   2332 
   2333 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2334 //---------------------------------------------------------------------------------------
   2335 //
//   class RBBIMonkeyKind
   2337 //
   2338 //      Monkey Test for Break Iteration
   2339 //      Abstract interface class.   Concrete derived classes independently
   2340 //      implement the break rules for different iterator types.
   2341 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
   2344 //
   2345 //---------------------------------------------------------------------------------------
   2346 class RBBIMonkeyKind {
   2347 public:
   2348     // Return a UVector of UnicodeSets, representing the character classes used
   2349     //   for this type of iterator.
   2350     virtual  UVector  *charClasses() = 0;
   2351 
   2352     // Set the test text on which subsequent calls to next() will operate
   2353     virtual  void      setText(const UnicodeString &s) = 0;
   2354 
   2355     // Find the next break postion, starting from the prev break position, or from zero.
   2356     // Return -1 after reaching end of string.
   2357     virtual  int32_t   next(int32_t i) = 0;
   2358 
   2359     virtual ~RBBIMonkeyKind();
   2360     UErrorCode       deferredStatus;
   2361 
   2362 
   2363 protected:
   2364     RBBIMonkeyKind();
   2365 
   2366 private:
   2367 };
   2368 
   2369 RBBIMonkeyKind::RBBIMonkeyKind() {
   2370     deferredStatus = U_ZERO_ERROR;
   2371 }
   2372 
   2373 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2374 }
   2375 
   2376 
   2377 //----------------------------------------------------------------------------------------
   2378 //
   2379 //   Random Numbers.  Similar to standard lib rand() and srand()
   2380 //                    Not using library to
   2381 //                      1.  Get same results on all platforms.
   2382 //                      2.  Get access to current seed, to more easily reproduce failures.
   2383 //
   2384 //---------------------------------------------------------------------------------------
   2385 static uint32_t m_seed = 1;
   2386 
   2387 static uint32_t m_rand()
   2388 {
   2389     m_seed = m_seed * 1103515245 + 12345;
   2390     return (uint32_t)(m_seed/65536) % 32768;
   2391 }
   2392 
   2393 
   2394 //------------------------------------------------------------------------------------------
   2395 //
   2396 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2397 //                             of RBBIMonkeyKind.
   2398 //
   2399 //------------------------------------------------------------------------------------------
   2400 class RBBICharMonkey: public RBBIMonkeyKind {
   2401 public:
   2402     RBBICharMonkey();
   2403     virtual          ~RBBICharMonkey();
   2404     virtual  UVector *charClasses();
   2405     virtual  void     setText(const UnicodeString &s);
   2406     virtual  int32_t  next(int32_t i);
   2407 private:
   2408     UVector   *fSets;
   2409 
   2410     UnicodeSet  *fCRLFSet;
   2411     UnicodeSet  *fControlSet;
   2412     UnicodeSet  *fExtendSet;
   2413     UnicodeSet  *fPrependSet;
   2414     UnicodeSet  *fSpacingSet;
   2415     UnicodeSet  *fLSet;
   2416     UnicodeSet  *fVSet;
   2417     UnicodeSet  *fTSet;
   2418     UnicodeSet  *fLVSet;
   2419     UnicodeSet  *fLVTSet;
   2420     UnicodeSet  *fHangulSet;
   2421     UnicodeSet  *fAnySet;
   2422 
   2423     const UnicodeString *fText;
   2424 };
   2425 
   2426 
   2427 RBBICharMonkey::RBBICharMonkey() {
   2428     UErrorCode  status = U_ZERO_ERROR;
   2429 
   2430     fText = NULL;
   2431 
   2432     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2433     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2434     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2435     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2436     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2437     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2438     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2439     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2440     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2441     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2442     fHangulSet  = new UnicodeSet();
   2443     fHangulSet->addAll(*fLSet);
   2444     fHangulSet->addAll(*fVSet);
   2445     fHangulSet->addAll(*fTSet);
   2446     fHangulSet->addAll(*fLVSet);
   2447     fHangulSet->addAll(*fLVTSet);
   2448     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2449 
   2450     fSets       = new UVector(status);
   2451     fSets->addElement(fCRLFSet,    status);
   2452     fSets->addElement(fControlSet, status);
   2453     fSets->addElement(fExtendSet,  status);
   2454     fSets->addElement(fPrependSet, status);
   2455     fSets->addElement(fSpacingSet, status);
   2456     fSets->addElement(fHangulSet,  status);
   2457     fSets->addElement(fAnySet,     status);
   2458     if (U_FAILURE(status)) {
   2459         deferredStatus = status;
   2460     }
   2461 }
   2462 
   2463 
   2464 void RBBICharMonkey::setText(const UnicodeString &s) {
   2465     fText = &s;
   2466 }
   2467 
   2468 
   2469 
   2470 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2471     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2472                               //   break position being tested.  The candidate break
   2473                               //   location is before p2.
   2474 
   2475     int     breakPos = -1;
   2476 
   2477     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2478 
   2479     if (U_FAILURE(deferredStatus)) {
   2480         return -1;
   2481     }
   2482 
   2483     // Previous break at end of string.  return DONE.
   2484     if (prevPos >= fText->length()) {
   2485         return -1;
   2486     }
   2487     p0 = p1 = p2 = p3 = prevPos;
   2488     c3 =  fText->char32At(prevPos);
   2489     c0 = c1 = c2 = 0;
   2490 
   2491     // Loop runs once per "significant" character position in the input text.
   2492     for (;;) {
   2493         // Move all of the positions forward in the input string.
   2494         p0 = p1;  c0 = c1;
   2495         p1 = p2;  c1 = c2;
   2496         p2 = p3;  c2 = c3;
   2497 
   2498         // Advancd p3 by one codepoint
   2499         p3 = fText->moveIndex32(p3, 1);
   2500         c3 = fText->char32At(p3);
   2501 
   2502         if (p1 == p2) {
   2503             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2504             continue;
   2505         }
   2506         if (p2 == fText->length()) {
   2507             // Reached end of string.  Always a break position.
   2508             break;
   2509         }
   2510 
   2511         // Rule  GB3   CR x LF
   2512         //     No Extend or Format characters may appear between the CR and LF,
   2513         //     which requires the additional check for p2 immediately following p1.
   2514         //
   2515         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2516             continue;
   2517         }
   2518 
   2519         // Rule (GB4).   ( Control | CR | LF ) <break>
   2520         if (fControlSet->contains(c1) ||
   2521             c1 == 0x0D ||
   2522             c1 == 0x0A)  {
   2523             break;
   2524         }
   2525 
   2526         // Rule (GB5)    <break>  ( Control | CR | LF )
   2527         //
   2528         if (fControlSet->contains(c2) ||
   2529             c2 == 0x0D ||
   2530             c2 == 0x0A)  {
   2531             break;
   2532         }
   2533 
   2534 
   2535         // Rule (GB6)  L x ( L | V | LV | LVT )
   2536         if (fLSet->contains(c1) &&
   2537                (fLSet->contains(c2)  ||
   2538                 fVSet->contains(c2)  ||
   2539                 fLVSet->contains(c2) ||
   2540                 fLVTSet->contains(c2))) {
   2541             continue;
   2542         }
   2543 
   2544         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2545         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2546             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2547             continue;
   2548         }
   2549 
   2550         // Rule (GB8)    ( LVT | T)  x T
   2551         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2552             fTSet->contains(c2))  {
   2553             continue;
   2554         }
   2555 
   2556         // Rule (GB9)    Numeric x ALetter
   2557         if (fExtendSet->contains(c2))  {
   2558             continue;
   2559         }
   2560 
   2561         // Rule (GB9a)   x  SpacingMark
   2562         if (fSpacingSet->contains(c2)) {
   2563             continue;
   2564         }
   2565 
   2566         // Rule (GB9b)   Prepend x
   2567         if (fPrependSet->contains(c1)) {
   2568             continue;
   2569         }
   2570 
   2571         // Rule (GB10)  Any  <break>  Any
   2572         break;
   2573     }
   2574 
   2575     breakPos = p2;
   2576     return breakPos;
   2577 }
   2578 
   2579 
   2580 
   2581 UVector  *RBBICharMonkey::charClasses() {
   2582     return fSets;
   2583 }
   2584 
   2585 
   2586 RBBICharMonkey::~RBBICharMonkey() {
   2587     delete fSets;
   2588     delete fCRLFSet;
   2589     delete fControlSet;
   2590     delete fExtendSet;
   2591     delete fPrependSet;
   2592     delete fSpacingSet;
   2593     delete fLSet;
   2594     delete fVSet;
   2595     delete fTSet;
   2596     delete fLVSet;
   2597     delete fLVTSet;
   2598     delete fHangulSet;
   2599     delete fAnySet;
   2600 }
   2601 
   2602 //------------------------------------------------------------------------------------------
   2603 //
   2604 //   class RBBIWordMonkey      Word Break specific implementation
   2605 //                             of RBBIMonkeyKind.
   2606 //
   2607 //------------------------------------------------------------------------------------------
   2608 class RBBIWordMonkey: public RBBIMonkeyKind {
   2609 public:
   2610     RBBIWordMonkey();
   2611     virtual          ~RBBIWordMonkey();
   2612     virtual  UVector *charClasses();
   2613     virtual  void     setText(const UnicodeString &s);
   2614     virtual int32_t   next(int32_t i);
   2615 private:
   2616     UVector      *fSets;
   2617 
   2618     UnicodeSet  *fCRSet;
   2619     UnicodeSet  *fLFSet;
   2620     UnicodeSet  *fNewlineSet;
   2621     UnicodeSet  *fKatakanaSet;
   2622     UnicodeSet  *fALetterSet;
   2623     UnicodeSet  *fMidNumLetSet;
   2624     UnicodeSet  *fMidLetterSet;
   2625     UnicodeSet  *fMidNumSet;
   2626     UnicodeSet  *fNumericSet;
   2627     UnicodeSet  *fFormatSet;
   2628     UnicodeSet  *fOtherSet;
   2629     UnicodeSet  *fExtendSet;
   2630     UnicodeSet  *fExtendNumLetSet;
   2631 
   2632     RegexMatcher  *fMatcher;
   2633 
   2634     const UnicodeString  *fText;
   2635 };
   2636 
   2637 
   2638 RBBIWordMonkey::RBBIWordMonkey()
   2639 {
   2640     UErrorCode  status = U_ZERO_ERROR;
   2641 
   2642     fSets            = new UVector(status);
   2643 
   2644     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2645     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2646     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2647     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
   2648     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2649     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2650     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2651     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2652     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2653     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2654     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2655     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2656 
   2657     fOtherSet        = new UnicodeSet();
   2658     if(U_FAILURE(status)) {
   2659       deferredStatus = status;
   2660       return;
   2661     }
   2662 
   2663     fOtherSet->complement();
   2664     fOtherSet->removeAll(*fCRSet);
   2665     fOtherSet->removeAll(*fLFSet);
   2666     fOtherSet->removeAll(*fNewlineSet);
   2667     fOtherSet->removeAll(*fKatakanaSet);
   2668     fOtherSet->removeAll(*fALetterSet);
   2669     fOtherSet->removeAll(*fMidLetterSet);
   2670     fOtherSet->removeAll(*fMidNumSet);
   2671     fOtherSet->removeAll(*fNumericSet);
   2672     fOtherSet->removeAll(*fExtendNumLetSet);
   2673     fOtherSet->removeAll(*fFormatSet);
   2674     fOtherSet->removeAll(*fExtendSet);
   2675     // Inhibit dictionary characters from being tested at all.
   2676     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2677 
   2678     fSets->addElement(fCRSet,        status);
   2679     fSets->addElement(fLFSet,        status);
   2680     fSets->addElement(fNewlineSet,   status);
   2681     fSets->addElement(fALetterSet,   status);
   2682     fSets->addElement(fKatakanaSet,  status);
   2683     fSets->addElement(fMidLetterSet, status);
   2684     fSets->addElement(fMidNumLetSet, status);
   2685     fSets->addElement(fMidNumSet,    status);
   2686     fSets->addElement(fNumericSet,   status);
   2687     fSets->addElement(fFormatSet,    status);
   2688     fSets->addElement(fExtendSet,    status);
   2689     fSets->addElement(fOtherSet,     status);
   2690     fSets->addElement(fExtendNumLetSet, status);
   2691 
   2692     if (U_FAILURE(status)) {
   2693         deferredStatus = status;
   2694     }
   2695 }
   2696 
   2697 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2698     fText       = &s;
   2699 }
   2700 
   2701 
   2702 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2703     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2704                               //   break position being tested.  The candidate break
   2705                               //   location is before p2.
   2706 
   2707     int     breakPos = -1;
   2708 
   2709     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2710 
   2711     if (U_FAILURE(deferredStatus)) {
   2712         return -1;
   2713     }
   2714 
   2715     // Prev break at end of string.  return DONE.
   2716     if (prevPos >= fText->length()) {
   2717         return -1;
   2718     }
   2719     p0 = p1 = p2 = p3 = prevPos;
   2720     c3 =  fText->char32At(prevPos);
   2721     c0 = c1 = c2 = 0;
   2722 
   2723     // Loop runs once per "significant" character position in the input text.
   2724     for (;;) {
   2725         // Move all of the positions forward in the input string.
   2726         p0 = p1;  c0 = c1;
   2727         p1 = p2;  c1 = c2;
   2728         p2 = p3;  c2 = c3;
   2729 
   2730         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2731         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2732         do {
   2733             p3 = fText->moveIndex32(p3, 1);
   2734             c3 = fText->char32At(p3);
   2735             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2736                break;
   2737             };
   2738         }
   2739         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2740 
   2741 
   2742         if (p1 == p2) {
   2743             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2744             continue;
   2745         }
   2746         if (p2 == fText->length()) {
   2747             // Reached end of string.  Always a break position.
   2748             break;
   2749         }
   2750 
   2751         // Rule  (3)   CR x LF
   2752         //     No Extend or Format characters may appear between the CR and LF,
   2753         //     which requires the additional check for p2 immediately following p1.
   2754         //
   2755         if (c1==0x0D && c2==0x0A) {
   2756             continue;
   2757         }
   2758 
   2759         // Rule (3a)  Break before and after newlines (including CR and LF)
   2760         //
   2761         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2762             break;
   2763         };
   2764         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2765             break;
   2766         };
   2767 
   2768         // Rule (5).   ALetter x ALetter
   2769         if (fALetterSet->contains(c1) &&
   2770             fALetterSet->contains(c2))  {
   2771             continue;
   2772         }
   2773 
   2774         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2775         //
   2776         if ( fALetterSet->contains(c1)   &&
   2777              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2778              fALetterSet->contains(c3)) {
   2779             continue;
   2780         }
   2781 
   2782 
   2783         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2784         if (fALetterSet->contains(c0) &&
   2785             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2786             fALetterSet->contains(c2)) {
   2787             continue;
   2788         }
   2789 
   2790         // Rule (8)    Numeric x Numeric
   2791         if (fNumericSet->contains(c1) &&
   2792             fNumericSet->contains(c2))  {
   2793             continue;
   2794         }
   2795 
   2796         // Rule (9)    ALetter x Numeric
   2797         if (fALetterSet->contains(c1) &&
   2798             fNumericSet->contains(c2))  {
   2799             continue;
   2800         }
   2801 
   2802         // Rule (10)    Numeric x ALetter
   2803         if (fNumericSet->contains(c1) &&
   2804             fALetterSet->contains(c2))  {
   2805             continue;
   2806         }
   2807 
   2808         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2809         if (fNumericSet->contains(c0) &&
   2810             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2811             fNumericSet->contains(c2)) {
   2812             continue;
   2813         }
   2814 
   2815         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2816         if (fNumericSet->contains(c1) &&
   2817             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2818             fNumericSet->contains(c3)) {
   2819             continue;
   2820         }
   2821 
   2822         // Rule (13)  Katakana x Katakana
   2823         if (fKatakanaSet->contains(c1) &&
   2824             fKatakanaSet->contains(c2))  {
   2825             continue;
   2826         }
   2827 
   2828         // Rule 13a
   2829         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2830              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2831              fExtendNumLetSet->contains(c2)) {
   2832                 continue;
   2833              }
   2834 
   2835         // Rule 13b
   2836         if (fExtendNumLetSet->contains(c1) &&
   2837                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2838                 fKatakanaSet->contains(c2)))  {
   2839                 continue;
   2840              }
   2841 
   2842         // Rule 14.  Break found here.
   2843         break;
   2844     }
   2845 
   2846     breakPos = p2;
   2847     return breakPos;
   2848 }
   2849 
   2850 
   2851 UVector  *RBBIWordMonkey::charClasses() {
   2852     return fSets;
   2853 }
   2854 
   2855 
   2856 RBBIWordMonkey::~RBBIWordMonkey() {
   2857     delete fSets;
   2858     delete fCRSet;
   2859     delete fLFSet;
   2860     delete fNewlineSet;
   2861     delete fKatakanaSet;
   2862     delete fALetterSet;
   2863     delete fMidNumLetSet;
   2864     delete fMidLetterSet;
   2865     delete fMidNumSet;
   2866     delete fNumericSet;
   2867     delete fFormatSet;
   2868     delete fExtendSet;
   2869     delete fExtendNumLetSet;
   2870     delete fOtherSet;
   2871 }
   2872 
   2873 
   2874 
   2875 
   2876 //------------------------------------------------------------------------------------------
   2877 //
   2878 //   class RBBISentMonkey      Sentence Break specific implementation
   2879 //                             of RBBIMonkeyKind.
   2880 //
   2881 //------------------------------------------------------------------------------------------
   2882 class RBBISentMonkey: public RBBIMonkeyKind {
   2883 public:
   2884     RBBISentMonkey();
   2885     virtual          ~RBBISentMonkey();
   2886     virtual  UVector *charClasses();
   2887     virtual  void     setText(const UnicodeString &s);
   2888     virtual int32_t   next(int32_t i);
   2889 private:
   2890     int               moveBack(int posFrom);
   2891     int               moveForward(int posFrom);
   2892     UChar32           cAt(int pos);
   2893 
   2894     UVector      *fSets;
   2895 
   2896     UnicodeSet  *fSepSet;
   2897     UnicodeSet  *fFormatSet;
   2898     UnicodeSet  *fSpSet;
   2899     UnicodeSet  *fLowerSet;
   2900     UnicodeSet  *fUpperSet;
   2901     UnicodeSet  *fOLetterSet;
   2902     UnicodeSet  *fNumericSet;
   2903     UnicodeSet  *fATermSet;
   2904     UnicodeSet  *fSContinueSet;
   2905     UnicodeSet  *fSTermSet;
   2906     UnicodeSet  *fCloseSet;
   2907     UnicodeSet  *fOtherSet;
   2908     UnicodeSet  *fExtendSet;
   2909 
   2910     const UnicodeString  *fText;
   2911 
   2912 };
   2913 
   2914 RBBISentMonkey::RBBISentMonkey()
   2915 {
   2916     UErrorCode  status = U_ZERO_ERROR;
   2917 
   2918     fSets            = new UVector(status);
   2919 
   2920     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2921     //                       set and made into character classes of their own.  For the monkey impl,
   2922     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2923     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2924     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2925     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2926     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2927     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2928     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2929     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2930     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2931     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2932     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2933     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2934     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2935     fOtherSet        = new UnicodeSet();
   2936 
   2937     if(U_FAILURE(status)) {
   2938       deferredStatus = status;
   2939       return;
   2940     }
   2941 
   2942     fOtherSet->complement();
   2943     fOtherSet->removeAll(*fSepSet);
   2944     fOtherSet->removeAll(*fFormatSet);
   2945     fOtherSet->removeAll(*fSpSet);
   2946     fOtherSet->removeAll(*fLowerSet);
   2947     fOtherSet->removeAll(*fUpperSet);
   2948     fOtherSet->removeAll(*fOLetterSet);
   2949     fOtherSet->removeAll(*fNumericSet);
   2950     fOtherSet->removeAll(*fATermSet);
   2951     fOtherSet->removeAll(*fSContinueSet);
   2952     fOtherSet->removeAll(*fSTermSet);
   2953     fOtherSet->removeAll(*fCloseSet);
   2954     fOtherSet->removeAll(*fExtendSet);
   2955 
   2956     fSets->addElement(fSepSet,       status);
   2957     fSets->addElement(fFormatSet,    status);
   2958     fSets->addElement(fSpSet,        status);
   2959     fSets->addElement(fLowerSet,     status);
   2960     fSets->addElement(fUpperSet,     status);
   2961     fSets->addElement(fOLetterSet,   status);
   2962     fSets->addElement(fNumericSet,   status);
   2963     fSets->addElement(fATermSet,     status);
   2964     fSets->addElement(fSContinueSet, status);
   2965     fSets->addElement(fSTermSet,     status);
   2966     fSets->addElement(fCloseSet,     status);
   2967     fSets->addElement(fOtherSet,     status);
   2968     fSets->addElement(fExtendSet,    status);
   2969 
   2970     if (U_FAILURE(status)) {
   2971         deferredStatus = status;
   2972     }
   2973 }
   2974 
   2975 
   2976 
   2977 void RBBISentMonkey::setText(const UnicodeString &s) {
   2978     fText       = &s;
   2979 }
   2980 
   2981 UVector  *RBBISentMonkey::charClasses() {
   2982     return fSets;
   2983 }
   2984 
   2985 
   2986 //  moveBack()   Find the "significant" code point preceding the index i.
   2987 //               Skips over ($Extend | $Format)* .
   2988 //
   2989 int RBBISentMonkey::moveBack(int i) {
   2990     if (i <= 0) {
   2991         return -1;
   2992     }
   2993     UChar32   c;
   2994     int32_t   j = i;
   2995     do {
   2996         j = fText->moveIndex32(j, -1);
   2997         c = fText->char32At(j);
   2998     }
   2999     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   3000     return j;
   3001 
   3002  }
   3003 
   3004 
   3005 int RBBISentMonkey::moveForward(int i) {
   3006     if (i>=fText->length()) {
   3007         return fText->length();
   3008     }
   3009     UChar32   c;
   3010     int32_t   j = i;
   3011     do {
   3012         j = fText->moveIndex32(j, 1);
   3013         c = cAt(j);
   3014     }
   3015     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   3016     return j;
   3017 }
   3018 
   3019 UChar32 RBBISentMonkey::cAt(int pos) {
   3020     if (pos<0 || pos>=fText->length()) {
   3021         return -1;
   3022     } else {
   3023         return fText->char32At(pos);
   3024     }
   3025 }
   3026 
   3027 int32_t RBBISentMonkey::next(int32_t prevPos) {
   3028     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3029                               //   break position being tested.  The candidate break
   3030                               //   location is before p2.
   3031 
   3032     int     breakPos = -1;
   3033 
   3034     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3035     UChar32 c;
   3036 
   3037     if (U_FAILURE(deferredStatus)) {
   3038         return -1;
   3039     }
   3040 
   3041     // Prev break at end of string.  return DONE.
   3042     if (prevPos >= fText->length()) {
   3043         return -1;
   3044     }
   3045     p0 = p1 = p2 = p3 = prevPos;
   3046     c3 =  fText->char32At(prevPos);
   3047     c0 = c1 = c2 = 0;
   3048 
   3049     // Loop runs once per "significant" character position in the input text.
   3050     for (;;) {
   3051         // Move all of the positions forward in the input string.
   3052         p0 = p1;  c0 = c1;
   3053         p1 = p2;  c1 = c2;
   3054         p2 = p3;  c2 = c3;
   3055 
   3056         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3057         p3 = moveForward(p3);
   3058         c3 = cAt(p3);
   3059 
   3060         // Rule (3)  CR x LF
   3061         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   3062             continue;
   3063         }
   3064 
   3065         // Rule (4).   Sep  <break>
   3066         if (fSepSet->contains(c1)) {
   3067             p2 = p1+1;   // Separators don't combine with Extend or Format.
   3068             break;
   3069         }
   3070 
   3071         if (p2 >= fText->length()) {
   3072             // Reached end of string.  Always a break position.
   3073             break;
   3074         }
   3075 
   3076         if (p2 == prevPos) {
   3077             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3078             continue;
   3079         }
   3080 
   3081         // Rule (6).   ATerm x Numeric
   3082         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   3083             continue;
   3084         }
   3085 
   3086         // Rule (7).  Upper ATerm  x  Uppper
   3087         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   3088             continue;
   3089         }
   3090 
   3091         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   3092         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   3093         //                  note to the Unicode 5.0 documents.
   3094         int p8 = p1;
   3095         while (fSpSet->contains(cAt(p8))) {
   3096             p8 = moveBack(p8);
   3097         }
   3098         while (fCloseSet->contains(cAt(p8))) {
   3099             p8 = moveBack(p8);
   3100         }
   3101         if (fATermSet->contains(cAt(p8))) {
   3102             p8=p2;
   3103             for (;;) {
   3104                 c = cAt(p8);
   3105                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   3106                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   3107                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   3108                     break;
   3109                 }
   3110                 p8 = moveForward(p8);
   3111             }
   3112             if (fLowerSet->contains(cAt(p8))) {
   3113                 continue;
   3114             }
   3115         }
   3116 
   3117         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   3118         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   3119             p8 = p1;
   3120             while (fSpSet->contains(cAt(p8))) {
   3121                 p8 = moveBack(p8);
   3122             }
   3123             while (fCloseSet->contains(cAt(p8))) {
   3124                 p8 = moveBack(p8);
   3125             }
   3126             c = cAt(p8);
   3127             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   3128                 continue;
   3129             }
   3130         }
   3131 
   3132         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   3133         int p9 = p1;
   3134         while (fCloseSet->contains(cAt(p9))) {
   3135             p9 = moveBack(p9);
   3136         }
   3137         c = cAt(p9);
   3138         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   3139             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3140                 continue;
   3141             }
   3142         }
   3143 
   3144         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   3145         int p10 = p1;
   3146         while (fSpSet->contains(cAt(p10))) {
   3147             p10 = moveBack(p10);
   3148         }
   3149         while (fCloseSet->contains(cAt(p10))) {
   3150             p10 = moveBack(p10);
   3151         }
   3152         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   3153             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3154                 continue;
   3155             }
   3156         }
   3157 
   3158         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   3159         int p11 = p1;
   3160         if (fSepSet->contains(cAt(p11))) {
   3161             p11 = moveBack(p11);
   3162         }
   3163         while (fSpSet->contains(cAt(p11))) {
   3164             p11 = moveBack(p11);
   3165         }
   3166         while (fCloseSet->contains(cAt(p11))) {
   3167             p11 = moveBack(p11);
   3168         }
   3169         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   3170             break;
   3171         }
   3172 
   3173         //  Rule (12)  Any x Any
   3174         continue;
   3175     }
   3176     breakPos = p2;
   3177     return breakPos;
   3178 }
   3179 
   3180 RBBISentMonkey::~RBBISentMonkey() {
   3181     delete fSets;
   3182     delete fSepSet;
   3183     delete fFormatSet;
   3184     delete fSpSet;
   3185     delete fLowerSet;
   3186     delete fUpperSet;
   3187     delete fOLetterSet;
   3188     delete fNumericSet;
   3189     delete fATermSet;
   3190     delete fSContinueSet;
   3191     delete fSTermSet;
   3192     delete fCloseSet;
   3193     delete fOtherSet;
   3194     delete fExtendSet;
   3195 }
   3196 
   3197 
   3198 
   3199 //-------------------------------------------------------------------------------------------
   3200 //
   3201 //  RBBILineMonkey
   3202 //
   3203 //-------------------------------------------------------------------------------------------
   3204 
   3205 class RBBILineMonkey: public RBBIMonkeyKind {
   3206 public:
   3207     RBBILineMonkey();
   3208     virtual          ~RBBILineMonkey();
   3209     virtual  UVector *charClasses();
   3210     virtual  void     setText(const UnicodeString &s);
   3211     virtual  int32_t  next(int32_t i);
   3212     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   3213 private:
   3214     UVector      *fSets;
   3215 
   3216     UnicodeSet  *fBK;
   3217     UnicodeSet  *fCR;
   3218     UnicodeSet  *fLF;
   3219     UnicodeSet  *fCM;
   3220     UnicodeSet  *fNL;
   3221     UnicodeSet  *fSG;
   3222     UnicodeSet  *fWJ;
   3223     UnicodeSet  *fZW;
   3224     UnicodeSet  *fGL;
   3225     UnicodeSet  *fCB;
   3226     UnicodeSet  *fSP;
   3227     UnicodeSet  *fB2;
   3228     UnicodeSet  *fBA;
   3229     UnicodeSet  *fBB;
   3230     UnicodeSet  *fHY;
   3231     UnicodeSet  *fH2;
   3232     UnicodeSet  *fH3;
   3233     UnicodeSet  *fCL;
   3234     UnicodeSet  *fCP;
   3235     UnicodeSet  *fEX;
   3236     UnicodeSet  *fIN;
   3237     UnicodeSet  *fJL;
   3238     UnicodeSet  *fJV;
   3239     UnicodeSet  *fJT;
   3240     UnicodeSet  *fNS;
   3241     UnicodeSet  *fOP;
   3242     UnicodeSet  *fQU;
   3243     UnicodeSet  *fIS;
   3244     UnicodeSet  *fNU;
   3245     UnicodeSet  *fPO;
   3246     UnicodeSet  *fPR;
   3247     UnicodeSet  *fSY;
   3248     UnicodeSet  *fAI;
   3249     UnicodeSet  *fAL;
   3250     UnicodeSet  *fID;
   3251     UnicodeSet  *fSA;
   3252     UnicodeSet  *fXX;
   3253 
   3254     BreakIterator  *fCharBI;
   3255 
   3256     const UnicodeString  *fText;
   3257     int32_t              *fOrigPositions;
   3258 
   3259     RegexMatcher         *fNumberMatcher;
   3260     RegexMatcher         *fLB11Matcher;
   3261 };
   3262 
   3263 
   3264 RBBILineMonkey::RBBILineMonkey()
   3265 {
   3266     UErrorCode  status = U_ZERO_ERROR;
   3267 
   3268     fSets  = new UVector(status);
   3269 
   3270     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3271     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3272     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3273     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3274     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3275     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3276     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3277     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3278     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3279     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3280     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3281     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3282     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3283     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3284     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3285     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3286     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3287     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3288     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3289     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3290     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3291     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3292     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3293     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3294     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3295     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3296     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3297     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3298     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3299     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3300     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3301     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3302     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3303     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3304     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3305     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3306     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3307 
   3308     if (U_FAILURE(status)) {
   3309         deferredStatus = status;
   3310         fCharBI = NULL;
   3311         fNumberMatcher = NULL;
   3312         return;
   3313     }
   3314 
   3315     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3316     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3317     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3318     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3319 
   3320     fSets->addElement(fBK, status);
   3321     fSets->addElement(fCR, status);
   3322     fSets->addElement(fLF, status);
   3323     fSets->addElement(fCM, status);
   3324     fSets->addElement(fNL, status);
   3325     fSets->addElement(fWJ, status);
   3326     fSets->addElement(fZW, status);
   3327     fSets->addElement(fGL, status);
   3328     fSets->addElement(fCB, status);
   3329     fSets->addElement(fSP, status);
   3330     fSets->addElement(fB2, status);
   3331     fSets->addElement(fBA, status);
   3332     fSets->addElement(fBB, status);
   3333     fSets->addElement(fHY, status);
   3334     fSets->addElement(fH2, status);
   3335     fSets->addElement(fH3, status);
   3336     fSets->addElement(fCL, status);
   3337     fSets->addElement(fCP, status);
   3338     fSets->addElement(fEX, status);
   3339     fSets->addElement(fIN, status);
   3340     fSets->addElement(fJL, status);
   3341     fSets->addElement(fJT, status);
   3342     fSets->addElement(fJV, status);
   3343     fSets->addElement(fNS, status);
   3344     fSets->addElement(fOP, status);
   3345     fSets->addElement(fQU, status);
   3346     fSets->addElement(fIS, status);
   3347     fSets->addElement(fNU, status);
   3348     fSets->addElement(fPO, status);
   3349     fSets->addElement(fPR, status);
   3350     fSets->addElement(fSY, status);
   3351     fSets->addElement(fAI, status);
   3352     fSets->addElement(fAL, status);
   3353     fSets->addElement(fID, status);
   3354     fSets->addElement(fWJ, status);
   3355     fSets->addElement(fSA, status);
   3356     fSets->addElement(fSG, status);
   3357 
   3358     const char *rules =
   3359             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3360             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3361             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3362             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3363             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3364             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3365 
   3366     fNumberMatcher = new RegexMatcher(
   3367         UnicodeString(rules, -1, US_INV), 0, status);
   3368 
   3369     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3370 
   3371     if (U_FAILURE(status)) {
   3372         deferredStatus = status;
   3373     }
   3374 }
   3375 
   3376 
   3377 void RBBILineMonkey::setText(const UnicodeString &s) {
   3378     fText       = &s;
   3379     fCharBI->setText(s);
   3380     fNumberMatcher->reset(s);
   3381 }
   3382 
   3383 //
   3384 //  rule9Adjust
   3385 //     Line Break TR rules 9 and 10 implementation.
   3386 //     This deals with combining marks and other sequences that
   3387 //     that must be treated as if they were something other than what they actually are.
   3388 //
   3389 //     This is factored out into a separate function because it must be applied twice for
   3390 //     each potential break, once to the chars before the position being checked, then
   3391 //     again to the text following the possible break.
   3392 //
   3393 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3394     if (pos == -1) {
   3395         // Invalid initial position.  Happens during the warmup iteration of the
   3396         //   main loop in next().
   3397         return;
   3398     }
   3399 
   3400     int32_t  nPos = *nextPos;
   3401 
   3402     // LB 9  Keep combining sequences together.
   3403     //  advance over any CM class chars.  Note that Line Break CM is different
   3404     //  from the normal Grapheme Extend property.
   3405     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3406           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3407         for (;;) {
   3408             *nextChar = fText->char32At(nPos);
   3409             if (!fCM->contains(*nextChar)) {
   3410                 break;
   3411             }
   3412             nPos = fText->moveIndex32(nPos, 1);
   3413         }
   3414     }
   3415 
   3416 
   3417     // LB 9 Treat X CM* as if it were x.
   3418     //       No explicit action required.
   3419 
   3420     // LB 10  Treat any remaining combining mark as AL
   3421     if (fCM->contains(*posChar)) {
   3422         *posChar = 0x41;   // thisChar = 'A';
   3423     }
   3424 
   3425     // Push the updated nextPos and nextChar back to our caller.
   3426     // This only makes a difference if posChar got bigger by consuming a
   3427     // combining sequence.
   3428     *nextPos  = nPos;
   3429     *nextChar = fText->char32At(nPos);
   3430 }
   3431 
   3432 
   3433 
   3434 int32_t RBBILineMonkey::next(int32_t startPos) {
   3435     UErrorCode status = U_ZERO_ERROR;
   3436     int32_t    pos;       //  Index of the char following a potential break position
   3437     UChar32    thisChar;  //  Character at above position "pos"
   3438 
   3439     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3440     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3441                           //   and thisChar may not be adjacent because combining
   3442                           //   characters between them will be ignored.
   3443 
   3444     int32_t    nextPos;   //  Index of the next character following pos.
   3445                           //     Usually skips over combining marks.
   3446     int32_t    nextCPPos; //  Index of the code point following "pos."
   3447                           //     May point to a combining mark.
   3448     int32_t    tPos;      //  temp value.
   3449     UChar32    c;
   3450 
   3451     if (U_FAILURE(deferredStatus)) {
   3452         return -1;
   3453     }
   3454 
   3455     if (startPos >= fText->length()) {
   3456         return -1;
   3457     }
   3458 
   3459 
   3460     // Initial values for loop.  Loop will run the first time without finding breaks,
   3461     //                           while the invalid values shift out and the "this" and
   3462     //                           "prev" positions are filled in with good values.
   3463     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3464     thisChar = prevChar  = 0;
   3465     nextPos  = nextCPPos = startPos;
   3466 
   3467 
   3468     // Loop runs once per position in the test text, until a break position
   3469     //  is found.
   3470     for (;;) {
   3471         prevPos   = pos;
   3472         prevChar  = thisChar;
   3473 
   3474         pos       = nextPos;
   3475         thisChar  = fText->char32At(pos);
   3476 
   3477         nextCPPos = fText->moveIndex32(pos, 1);
   3478         nextPos   = nextCPPos;
   3479 
   3480         // Rule LB2 - Break at end of text.
   3481         if (pos >= fText->length()) {
   3482             break;
   3483         }
   3484 
   3485         // Rule LB 9 - adjust for combining sequences.
   3486         //             We do this one out-of-order because the adjustment does not change anything
   3487         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3488         //             be applied.
   3489         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3490         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3491         c = fText->char32At(nextPos);
   3492         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3493 
   3494         // If the loop is still warming up - if we haven't shifted the initial
   3495         //   -1 positions out of prevPos yet - loop back to advance the
   3496         //    position in the input without any further looking for breaks.
   3497         if (prevPos == -1) {
   3498             continue;
   3499         }
   3500 
   3501         // LB 4  Always break after hard line breaks,
   3502         if (fBK->contains(prevChar)) {
   3503             break;
   3504         }
   3505 
   3506         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3507         if (prevChar == 0x0d && thisChar == 0x0a) {
   3508             continue;
   3509         }
   3510         if (prevChar == 0x0d ||
   3511             prevChar == 0x0a ||
   3512             prevChar == 0x85)  {
   3513             break;
   3514         }
   3515 
   3516         // LB 6  Don't break before hard line breaks
   3517         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3518             fBK->contains(thisChar)) {
   3519                 continue;
   3520         }
   3521 
   3522 
   3523         // LB 7  Don't break before spaces or zero-width space.
   3524         if (fSP->contains(thisChar)) {
   3525             continue;
   3526         }
   3527 
   3528         if (fZW->contains(thisChar)) {
   3529             continue;
   3530         }
   3531 
   3532         // LB 8  Break after zero width space
   3533         if (fZW->contains(prevChar)) {
   3534             break;
   3535         }
   3536 
   3537         // LB 9, 10  Already done, at top of loop.
   3538         //
   3539 
   3540 
   3541         // LB 11  Do not break before or after WORD JOINER and related characters.
   3542         //    x  WJ
   3543         //    WJ  x
   3544         //
   3545         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3546             continue;
   3547         }
   3548 
   3549         // LB 12
   3550         //    GL  x
   3551         if (fGL->contains(prevChar)) {
   3552             continue;
   3553         }
   3554 
   3555         // LB 12a
   3556         //    [^SP BA HY] x GL
   3557         if (!(fSP->contains(prevChar) ||
   3558               fBA->contains(prevChar) ||
   3559               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3560             continue;
   3561         }
   3562 
   3563 
   3564 
   3565         // LB 13  Don't break before closings.
   3566         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3567         //        fall into LB 17 and the more general number regular expression.
   3568         //
   3569         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
   3570             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
   3571                                          fEX->contains(thisChar)  ||
   3572             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
   3573             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
   3574             continue;
   3575         }
   3576 
   3577         // LB 14 Don't break after OP SP*
   3578         //       Scan backwards, checking for this sequence.
   3579         //       The OP char could include combining marks, so we actually check for
   3580         //           OP CM* SP*
   3581         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3582         //       sequence into a ID char, so before scanning back through spaces,
   3583         //       verify that prevChar is indeed a space.  The prevChar variable
   3584         //       may differ from fText[prevPos]
   3585         tPos = prevPos;
   3586         if (fSP->contains(prevChar)) {
   3587             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3588                 tPos=fText->moveIndex32(tPos, -1);
   3589             }
   3590         }
   3591         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3592             tPos=fText->moveIndex32(tPos, -1);
   3593         }
   3594         if (fOP->contains(fText->char32At(tPos))) {
   3595             continue;
   3596         }
   3597 
   3598 
   3599         // LB 15    QU SP* x OP
   3600         if (fOP->contains(thisChar)) {
   3601             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3602             int tPos = prevPos;
   3603             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3604                 tPos = fText->moveIndex32(tPos, -1);
   3605             }
   3606             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3607                 tPos = fText->moveIndex32(tPos, -1);
   3608             }
   3609             if (fQU->contains(fText->char32At(tPos))) {
   3610                 continue;
   3611             }
   3612         }
   3613 
   3614 
   3615 
   3616         // LB 16   (CL | CP) SP* x NS
   3617         //    Scan backwards for SP* CM* (CL | CP)
   3618         if (fNS->contains(thisChar)) {
   3619             int tPos = prevPos;
   3620             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3621                 tPos = fText->moveIndex32(tPos, -1);
   3622             }
   3623             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3624                 tPos = fText->moveIndex32(tPos, -1);
   3625             }
   3626             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3627                 continue;
   3628             }
   3629         }
   3630 
   3631 
   3632         // LB 17        B2 SP* x B2
   3633         if (fB2->contains(thisChar)) {
   3634             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3635             tPos = prevPos;
   3636             if (fSP->contains(prevChar)) {
   3637                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3638                     tPos=fText->moveIndex32(tPos, -1);
   3639                 }
   3640             }
   3641             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3642                 tPos=fText->moveIndex32(tPos, -1);
   3643             }
   3644             if (fB2->contains(fText->char32At(tPos))) {
   3645                 continue;
   3646             }
   3647         }
   3648 
   3649 
   3650         // LB 18    break after space
   3651         if (fSP->contains(prevChar)) {
   3652             break;
   3653         }
   3654 
   3655         // LB 19
   3656         //    x   QU
   3657         //    QU  x
   3658         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3659             continue;
   3660         }
   3661 
   3662         // LB 20  Break around a CB
   3663         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3664             break;
   3665         }
   3666 
   3667         // LB 21
   3668         if (fBA->contains(thisChar) ||
   3669             fHY->contains(thisChar) ||
   3670             fNS->contains(thisChar) ||
   3671             fBB->contains(prevChar) )   {
   3672             continue;
   3673         }
   3674 
   3675         // LB 22
   3676         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
   3677             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
   3678             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
   3679             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
   3680             continue;
   3681         }
   3682 
   3683 
   3684         // LB 23    ID x PO
   3685         //          AL x NU
   3686         //          NU x AL
   3687         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
   3688             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
   3689             (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
   3690             continue;
   3691         }
   3692 
   3693         // LB 24  Do not break between prefix and letters or ideographs.
   3694         //        PR x ID
   3695         //        PR x AL
   3696         //        PO x AL
   3697         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
   3698             (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
   3699             (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
   3700             continue;
   3701         }
   3702 
   3703 
   3704 
   3705         // LB 25    Numbers
   3706         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3707             if (U_FAILURE(status)) {
   3708                 break;
   3709             }
   3710             // Matched a number.  But could have been just a single digit, which would
   3711             //    not represent a "no break here" between prevChar and thisChar
   3712             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3713             if (numEndIdx > pos) {
   3714                 // Number match includes at least our two chars being checked
   3715                 if (numEndIdx > nextPos) {
   3716                     // Number match includes additional chars.  Update pos and nextPos
   3717                     //   so that next loop iteration will continue at the end of the number,
   3718                     //   checking for breaks between last char in number & whatever follows.
   3719                     pos = nextPos = numEndIdx;
   3720                     do {
   3721                         pos = fText->moveIndex32(pos, -1);
   3722                         thisChar = fText->char32At(pos);
   3723                     } while (fCM->contains(thisChar));
   3724                 }
   3725                 continue;
   3726             }
   3727         }
   3728 
   3729 
   3730         // LB 26 Do not break a Korean syllable.
   3731         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3732                                         fJV->contains(thisChar) ||
   3733                                         fH2->contains(thisChar) ||
   3734                                         fH3->contains(thisChar))) {
   3735                                             continue;
   3736                                         }
   3737 
   3738         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3739             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3740                 continue;
   3741         }
   3742 
   3743         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3744             fJT->contains(thisChar)) {
   3745                 continue;
   3746         }
   3747 
   3748         // LB 27 Treat a Korean Syllable Block the same as ID.
   3749         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3750             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3751             fIN->contains(thisChar)) {
   3752                 continue;
   3753             }
   3754         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3755             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3756             fPO->contains(thisChar)) {
   3757                 continue;
   3758             }
   3759         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3760             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3761                 continue;
   3762             }
   3763 
   3764 
   3765 
   3766         // LB 28  Do not break between alphabetics ("at").
   3767         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   3768             continue;
   3769         }
   3770 
   3771         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3772         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   3773             continue;
   3774         }
   3775 
   3776         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3777         //          (AL | NU) x OP
   3778         //          CP x (AL | NU)
   3779         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3780             continue;
   3781         }
   3782         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
   3783             continue;
   3784         }
   3785 
   3786         // LB 31    Break everywhere else
   3787         break;
   3788 
   3789     }
   3790 
   3791     return pos;
   3792 }
   3793 
   3794 
   3795 UVector  *RBBILineMonkey::charClasses() {
   3796     return fSets;
   3797 }
   3798 
   3799 
   3800 RBBILineMonkey::~RBBILineMonkey() {
   3801     delete fSets;
   3802 
   3803     delete fBK;
   3804     delete fCR;
   3805     delete fLF;
   3806     delete fCM;
   3807     delete fNL;
   3808     delete fWJ;
   3809     delete fZW;
   3810     delete fGL;
   3811     delete fCB;
   3812     delete fSP;
   3813     delete fB2;
   3814     delete fBA;
   3815     delete fBB;
   3816     delete fHY;
   3817     delete fH2;
   3818     delete fH3;
   3819     delete fCL;
   3820     delete fCP;
   3821     delete fEX;
   3822     delete fIN;
   3823     delete fJL;
   3824     delete fJV;
   3825     delete fJT;
   3826     delete fNS;
   3827     delete fOP;
   3828     delete fQU;
   3829     delete fIS;
   3830     delete fNU;
   3831     delete fPO;
   3832     delete fPR;
   3833     delete fSY;
   3834     delete fAI;
   3835     delete fAL;
   3836     delete fID;
   3837     delete fSA;
   3838     delete fSG;
   3839     delete fXX;
   3840 
   3841     delete fCharBI;
   3842     delete fNumberMatcher;
   3843 }
   3844 
   3845 
   3846 //-------------------------------------------------------------------------------------------
   3847 //
   3848 //   TestMonkey
   3849 //
   3850 //     params
   3851 //       seed=nnnnn        Random number starting seed.
   3852 //                         Setting the seed allows errors to be reproduced.
   3853 //       loop=nnn          Looping count.  Controls running time.
   3854 //                         -1:  run forever.
   3855 //                          0 or greater:  run length.
   3856 //
   3857 //       type = char | word | line | sent | title
   3858 //
   3859 //-------------------------------------------------------------------------------------------
   3860 
   3861 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3862     int32_t val = defaultVal;
   3863     name.append(" *= *(-?\\d+)");
   3864     UErrorCode status = U_ZERO_ERROR;
   3865     RegexMatcher m(name, params, 0, status);
   3866     if (m.find()) {
   3867         // The param exists.  Convert the string to an int.
   3868         char valString[100];
   3869         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3870         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3871             paramLength = (int32_t)(sizeof(valString)-2);
   3872         }
   3873         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3874         val = strtol(valString,  NULL, 10);
   3875 
   3876         // Delete this parameter from the params string.
   3877         m.reset();
   3878         params = m.replaceFirst("", status);
   3879     }
   3880     U_ASSERT(U_SUCCESS(status));
   3881     return val;
   3882 }
   3883 #endif
   3884 
   3885 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3886                                     BreakIterator *bi,
   3887                                     int expected[],
   3888                                     int expectedcount)
   3889 {
   3890     int count = 0;
   3891     int i = 0;
   3892     int forward[50];
   3893     bi->setText(ustr);
   3894     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3895         forward[count] = i;
   3896         if (count < expectedcount && expected[count] != i) {
   3897             test->errln("break forward test failed: expected %d but got %d",
   3898                         expected[count], i);
   3899             break;
   3900         }
   3901         count ++;
   3902     }
   3903     if (count != expectedcount) {
   3904         printStringBreaks(ustr, expected, expectedcount);
   3905         test->errln("break forward test failed: missed %d match",
   3906                     expectedcount - count);
   3907         return;
   3908     }
   3909     // testing boundaries
   3910     for (i = 1; i < expectedcount; i ++) {
   3911         int j = expected[i - 1];
   3912         if (!bi->isBoundary(j)) {
   3913             printStringBreaks(ustr, expected, expectedcount);
   3914             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3915             return;
   3916         }
   3917         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3918             if (bi->isBoundary(j)) {
   3919                 printStringBreaks(ustr, expected, expectedcount);
   3920                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3921                 return;
   3922             }
   3923         }
   3924     }
   3925 
   3926     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3927         count --;
   3928         if (forward[count] != i) {
   3929             test->errln("happy break test previous() failed: expected %d but got %d",
   3930                         forward[count], i);
   3931             break;
   3932         }
   3933     }
   3934     if (count != 0) {
   3935         printStringBreaks(ustr, expected, expectedcount);
   3936         test->errln("break test previous() failed: missed a match");
   3937         return;
   3938     }
   3939 
   3940     // testing preceding
   3941     for (i = 0; i < expectedcount - 1; i ++) {
   3942         // int j = expected[i] + 1;
   3943         int j = ustr.moveIndex32(expected[i], 1);
   3944         for (; j <= expected[i + 1]; j ++) {
   3945             if (bi->preceding(j) != expected[i]) {
   3946                 printStringBreaks(ustr, expected, expectedcount);
   3947                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3948                 return;
   3949             }
   3950         }
   3951     }
   3952 }
   3953 
   3954 void RBBITest::TestWordBreaks(void)
   3955 {
   3956 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3957 
   3958     Locale        locale("en");
   3959     UErrorCode    status = U_ZERO_ERROR;
   3960     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3961     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3962     static const char *strlist[] =
   3963     {
   3964     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3965     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
   3966     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3967     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3968     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
   3969     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3970     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3971     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
   3972     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3973     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3974     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3975     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3976     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3977     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3978     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3979     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3980     "\\u0027\\u11af\\U000e0057\\u0602",
   3981     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3982     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3983     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3984     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3985     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3986     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3987     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3988     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3989     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3990     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3991     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3992     "\\ua183\\u102d\\u0bec\\u003a",
   3993     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3994     "\\u003a\\u0e57\\u0fad\\u002e",
   3995     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3996     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3997     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3998     "\\u003a\\u0664\\u00b7\\u1fba",
   3999     "\\u003b\\u0027\\u00b7\\u47a3",
   4000     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
   4001     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   4002     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   4003     };
   4004     int loop;
   4005     if (U_FAILURE(status)) {
   4006         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4007         return;
   4008     }
   4009     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4010         // printf("looping %d\n", loop);
   4011         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   4012         // RBBICharMonkey monkey;
   4013         RBBIWordMonkey monkey;
   4014 
   4015         int expected[50];
   4016         int expectedcount = 0;
   4017 
   4018         monkey.setText(ustr);
   4019         int i;
   4020         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4021             expected[expectedcount ++] = i;
   4022         }
   4023 
   4024         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4025     }
   4026     delete bi;
   4027 #endif
   4028 }
   4029 
   4030 void RBBITest::TestWordBoundary(void)
   4031 {
   4032     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   4033     Locale        locale("en");
   4034     UErrorCode    status = U_ZERO_ERROR;
   4035     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4036     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4037     UChar         str[50];
   4038     static const char *strlist[] =
   4039     {
   4040     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4041     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4042     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4043     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4044     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4045     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4046     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4047     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4048     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4049     "\\u0027\\u11af\\U000e0057\\u0602",
   4050     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4051     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4052     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4053     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4054     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4055     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4056     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4057     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4058     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4059     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4060     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4061     "\\ua183\\u102d\\u0bec\\u003a",
   4062     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4063     "\\u003a\\u0e57\\u0fad\\u002e",
   4064     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4065     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4066     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4067     "\\u003a\\u0664\\u00b7\\u1fba",
   4068     "\\u003b\\u0027\\u00b7\\u47a3",
   4069     };
   4070     int loop;
   4071     if (U_FAILURE(status)) {
   4072         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4073         return;
   4074     }
   4075     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4076         // printf("looping %d\n", loop);
   4077         u_unescape(strlist[loop], str, 20);
   4078         UnicodeString ustr(str);
   4079         int forward[50];
   4080         int count = 0;
   4081 
   4082         bi->setText(ustr);
   4083         int prev = 0;
   4084         int i;
   4085         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4086             forward[count ++] = i;
   4087             if (i > prev) {
   4088                 int j;
   4089                 for (j = prev + 1; j < i; j ++) {
   4090                     if (bi->isBoundary(j)) {
   4091                         printStringBreaks(ustr, forward, count);
   4092                         errln("happy boundary test failed: expected %d not a boundary",
   4093                                j);
   4094                         return;
   4095                     }
   4096                 }
   4097             }
   4098             if (!bi->isBoundary(i)) {
   4099                 printStringBreaks(ustr, forward, count);
   4100                 errln("happy boundary test failed: expected %d a boundary",
   4101                        i);
   4102                 return;
   4103             }
   4104             prev = i;
   4105         }
   4106     }
   4107     delete bi;
   4108 }
   4109 
   4110 void RBBITest::TestLineBreaks(void)
   4111 {
   4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4113     Locale        locale("en");
   4114     UErrorCode    status = U_ZERO_ERROR;
   4115     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4116     const int32_t  STRSIZE = 50;
   4117     UChar         str[STRSIZE];
   4118     static const char *strlist[] =
   4119     {
   4120      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4121      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4122              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4123      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4124              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4125      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4126      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4127      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4128      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4129      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4130      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4131      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4132      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4133      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4134      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4135      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4136      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4137      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4138      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4139      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4140      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4141      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4142      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4143      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4144      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4145      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4146      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4147      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4148      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4149      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4150      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4151      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4152      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4153      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4154      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4155      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4156      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4157      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4158      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4159      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4160      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4161      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4162          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4163          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4164          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4165      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4166          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4167     };
   4168     int loop;
   4169     TEST_ASSERT_SUCCESS(status);
   4170     if (U_FAILURE(status)) {
   4171         return;
   4172     }
   4173     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4174         // printf("looping %d\n", loop);
   4175         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4176         if (t >= STRSIZE) {
   4177             TEST_ASSERT(FALSE);
   4178             continue;
   4179         }
   4180 
   4181 
   4182         UnicodeString ustr(str);
   4183         RBBILineMonkey monkey;
   4184         if (U_FAILURE(monkey.deferredStatus)) {
   4185             continue;
   4186         }
   4187 
   4188         const int EXPECTEDSIZE = 50;
   4189         int expected[EXPECTEDSIZE];
   4190         int expectedcount = 0;
   4191 
   4192         monkey.setText(ustr);
   4193         int i;
   4194         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4195             if (expectedcount >= EXPECTEDSIZE) {
   4196                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4197                 return;
   4198             }
   4199             expected[expectedcount ++] = i;
   4200         }
   4201 
   4202         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4203     }
   4204     delete bi;
   4205 #endif
   4206 }
   4207 
   4208 void RBBITest::TestSentBreaks(void)
   4209 {
   4210 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4211     Locale        locale("en");
   4212     UErrorCode    status = U_ZERO_ERROR;
   4213     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4214     UChar         str[200];
   4215     static const char *strlist[] =
   4216     {
   4217      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4218      "This\n",
   4219      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4220      "\"Sentence ending with a quote.\" Bye.",
   4221      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4222      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4223      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4224      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4225      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4226      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4227      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4228              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4229              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4230              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4231      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4232              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4233              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4234              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4235              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4236              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4237     };
   4238     int loop;
   4239     if (U_FAILURE(status)) {
   4240         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4241         return;
   4242     }
   4243     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4244         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4245         UnicodeString ustr(str);
   4246 
   4247         RBBISentMonkey monkey;
   4248         if (U_FAILURE(monkey.deferredStatus)) {
   4249             continue;
   4250         }
   4251 
   4252         const int EXPECTEDSIZE = 50;
   4253         int expected[EXPECTEDSIZE];
   4254         int expectedcount = 0;
   4255 
   4256         monkey.setText(ustr);
   4257         int i;
   4258         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4259             if (expectedcount >= EXPECTEDSIZE) {
   4260                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4261                 return;
   4262             }
   4263             expected[expectedcount ++] = i;
   4264         }
   4265 
   4266         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4267     }
   4268     delete bi;
   4269 #endif
   4270 }
   4271 
   4272 void RBBITest::TestMonkey(char *params) {
   4273 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4274 
   4275     UErrorCode     status    = U_ZERO_ERROR;
   4276     int32_t        loopCount = 500;
   4277     int32_t        seed      = 1;
   4278     UnicodeString  breakType = "all";
   4279     Locale         locale("en");
   4280     UBool          useUText  = FALSE;
   4281 
   4282     if (quick == FALSE) {
   4283         loopCount = 10000;
   4284     }
   4285 
   4286     if (params) {
   4287         UnicodeString p(params);
   4288         loopCount = getIntParam("loop", p, loopCount);
   4289         seed      = getIntParam("seed", p, seed);
   4290 
   4291         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4292         if (m.find()) {
   4293             breakType = m.group(1, status);
   4294             m.reset();
   4295             p = m.replaceFirst("", status);
   4296         }
   4297 
   4298         RegexMatcher u(" *utext", p, 0, status);
   4299         if (u.find()) {
   4300             useUText = TRUE;
   4301             u.reset();
   4302             p = u.replaceFirst("", status);
   4303         }
   4304 
   4305 
   4306         // m.reset(p);
   4307         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4308             // Each option is stripped out of the option string as it is processed.
   4309             // All options have been checked.  The option string should have been completely emptied..
   4310             char buf[100];
   4311             p.extract(buf, sizeof(buf), NULL, status);
   4312             buf[sizeof(buf)-1] = 0;
   4313             errln("Unrecognized or extra parameter:  %s\n", buf);
   4314             return;
   4315         }
   4316 
   4317     }
   4318 
   4319     if (breakType == "char" || breakType == "all") {
   4320         RBBICharMonkey  m;
   4321         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4322         if (U_SUCCESS(status)) {
   4323             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4324             if (breakType == "all" && useUText==FALSE) {
   4325                 // Also run a quick test with UText when "all" is specified
   4326                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4327             }
   4328         }
   4329         else {
   4330             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4331         }
   4332         delete bi;
   4333     }
   4334 
   4335     if (breakType == "word" || breakType == "all") {
   4336         logln("Word Break Monkey Test");
   4337         RBBIWordMonkey  m;
   4338         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4339         if (U_SUCCESS(status)) {
   4340             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4341         }
   4342         else {
   4343             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4344         }
   4345         delete bi;
   4346     }
   4347 
   4348     if (breakType == "line" || breakType == "all") {
   4349         logln("Line Break Monkey Test");
   4350         RBBILineMonkey  m;
   4351         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4352         if (loopCount >= 10) {
   4353             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4354         }
   4355         if (U_SUCCESS(status)) {
   4356             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4357         }
   4358         else {
   4359             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4360         }
   4361         delete bi;
   4362     }
   4363 
   4364     if (breakType == "sent" || breakType == "all"  ) {
   4365         logln("Sentence Break Monkey Test");
   4366         RBBISentMonkey  m;
   4367         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4368         if (loopCount >= 10) {
   4369             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4370         }
   4371         if (U_SUCCESS(status)) {
   4372             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4373         }
   4374         else {
   4375             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4376         }
   4377         delete bi;
   4378     }
   4379 
   4380 #endif
   4381 }
   4382 
   4383 //
   4384 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4385 //    Parameters:
   4386 //       bi      - the break iterator to use
   4387 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4388 //       name    - Name of test (char, word, etc.) for use in error messages
   4389 //       seed    - Seed for starting random number generator (parameter from user)
   4390 //       numIterations
   4391 //
   4392 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4393                          int32_t numIterations, UBool useUText) {
   4394 
   4395 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4396 
   4397     const int32_t    TESTSTRINGLEN = 500;
   4398     UnicodeString    testText;
   4399     int32_t          numCharClasses;
   4400     UVector          *chClasses;
   4401     int              expected[TESTSTRINGLEN*2 + 1];
   4402     int              expectedCount = 0;
   4403     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4404     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4405     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4406     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4407     char             followingBreaks[TESTSTRINGLEN*2+1];
   4408     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4409     int              i;
   4410     int              loopCount = 0;
   4411 
   4412     m_seed = seed;
   4413 
   4414     numCharClasses = mk.charClasses()->size();
   4415     chClasses      = mk.charClasses();
   4416 
   4417     // Check for errors that occured during the construction of the MonkeyKind object.
   4418     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4419     //  and is not visible outside of RBBITest :-(
   4420     if (U_FAILURE(mk.deferredStatus)) {
   4421         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4422         return;
   4423     }
   4424 
   4425     // Verify that the character classes all have at least one member.
   4426     for (i=0; i<numCharClasses; i++) {
   4427         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4428         if (s == NULL || s->size() == 0) {
   4429             errln("Character Class #%d is null or of zero size.", i);
   4430             return;
   4431         }
   4432     }
   4433 
   4434     while (loopCount < numIterations || numIterations == -1) {
   4435         if (numIterations == -1 && loopCount % 10 == 0) {
   4436             // If test is running in an infinite loop, display a periodic tic so
   4437             //   we can tell that it is making progress.
   4438             fprintf(stderr, ".");
   4439         }
   4440         // Save current random number seed, so that we can recreate the random numbers
   4441         //   for this loop iteration in event of an error.
   4442         seed = m_seed;
   4443 
   4444         // Populate a test string with data.
   4445         testText.truncate(0);
   4446         for (i=0; i<TESTSTRINGLEN; i++) {
   4447             int32_t  aClassNum = m_rand() % numCharClasses;
   4448             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4449             int32_t   charIdx = m_rand() % classSet->size();
   4450             UChar32   c = classSet->charAt(charIdx);
   4451             if (c < 0) {   // TODO:  deal with sets containing strings.
   4452                 errln("c < 0");
   4453                 break;
   4454             }
   4455             testText.append(c);
   4456         }
   4457 
   4458         // Calculate the expected results for this test string.
   4459         mk.setText(testText);
   4460         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4461         expectedBreaks[0] = 1;
   4462         int32_t breakPos = 0;
   4463         expectedCount = 0;
   4464         for (;;) {
   4465             breakPos = mk.next(breakPos);
   4466             if (breakPos == -1) {
   4467                 break;
   4468             }
   4469             if (breakPos > testText.length()) {
   4470                 errln("breakPos > testText.length()");
   4471             }
   4472             expectedBreaks[breakPos] = 1;
   4473             U_ASSERT(expectedCount<testText.length());
   4474             expected[expectedCount ++] = breakPos;
   4475         }
   4476 
   4477         // Find the break positions using forward iteration
   4478         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4479         if (useUText) {
   4480             UErrorCode status = U_ZERO_ERROR;
   4481             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4482             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4483             bi->setText(testUText, status);
   4484             TEST_ASSERT_SUCCESS(status);
   4485             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4486                                       //  This UText can be closed immediately, so long as the
   4487                                       //  testText string continues to exist.
   4488         } else {
   4489             bi->setText(testText);
   4490         }
   4491 
   4492         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4493             if (i < 0 || i > testText.length()) {
   4494                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4495                 break;
   4496             }
   4497             forwardBreaks[i] = 1;
   4498         }
   4499 
   4500         // Find the break positions using reverse iteration
   4501         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4502         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4503             if (i < 0 || i > testText.length()) {
   4504                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4505                 break;
   4506             }
   4507             reverseBreaks[i] = 1;
   4508         }
   4509 
   4510         // Find the break positions using isBoundary() tests.
   4511         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4512         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4513         for (i=0; i<=testText.length(); i++) {
   4514             isBoundaryBreaks[i] = bi->isBoundary(i);
   4515         }
   4516 
   4517 
   4518         // Find the break positions using the following() function.
   4519         // printf(".");
   4520         memset(followingBreaks, 0, sizeof(followingBreaks));
   4521         int32_t   lastBreakPos = 0;
   4522         followingBreaks[0] = 1;
   4523         for (i=0; i<testText.length(); i++) {
   4524             breakPos = bi->following(i);
   4525             if (breakPos <= i ||
   4526                 breakPos < lastBreakPos ||
   4527                 breakPos > testText.length() ||
   4528                 (breakPos > lastBreakPos && lastBreakPos > i)) {
   4529                 errln("%s break monkey test: "
   4530                     "Out of range value returned by BreakIterator::following().\n"
   4531                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4532                          name, seed, i, breakPos, lastBreakPos);
   4533                 break;
   4534             }
   4535             followingBreaks[breakPos] = 1;
   4536             lastBreakPos = breakPos;
   4537         }
   4538 
   4539         // Find the break positions using the preceding() function.
   4540         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4541         lastBreakPos = testText.length();
   4542         precedingBreaks[testText.length()] = 1;
   4543         for (i=testText.length(); i>0; i--) {
   4544             breakPos = bi->preceding(i);
   4545             if (breakPos >= i ||
   4546                 breakPos > lastBreakPos ||
   4547                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
   4548                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
   4549                 errln("%s break monkey test: "
   4550                     "Out of range value returned by BreakIterator::preceding().\n"
   4551                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4552                     name,  i, breakPos, lastBreakPos);
   4553                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4554                     precedingBreaks[i] = 2;   // Forces an error.
   4555                 }
   4556             } else {
   4557                 if (breakPos >= 0) {
   4558                     precedingBreaks[breakPos] = 1;
   4559                 }
   4560                 lastBreakPos = breakPos;
   4561             }
   4562         }
   4563 
   4564         // Compare the expected and actual results.
   4565         for (i=0; i<=testText.length(); i++) {
   4566             const char *errorType = NULL;
   4567             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4568                 errorType = "next()";
   4569             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4570                 errorType = "previous()";
   4571             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4572                 errorType = "isBoundary()";
   4573             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4574                 errorType = "following()";
   4575             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4576                 errorType = "preceding()";
   4577             }
   4578 
   4579 
   4580             if (errorType != NULL) {
   4581                 // Format a range of the test text that includes the failure as
   4582                 //  a data item that can be included in the rbbi test data file.
   4583 
   4584                 // Start of the range is the last point where expected and actual results
   4585                 //   both agreed that there was a break position.
   4586                 int startContext = i;
   4587                 int32_t count = 0;
   4588                 for (;;) {
   4589                     if (startContext==0) { break; }
   4590                     startContext --;
   4591                     if (expectedBreaks[startContext] != 0) {
   4592                         if (count == 2) break;
   4593                         count ++;
   4594                     }
   4595                 }
   4596 
   4597                 // End of range is two expected breaks past the start position.
   4598                 int endContext = i + 1;
   4599                 int ci;
   4600                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4601                     for (;;) {
   4602                         if (endContext >= testText.length()) {break;}
   4603                         if (expectedBreaks[endContext-1] != 0) {
   4604                             if (count == 0) break;
   4605                             count --;
   4606                         }
   4607                         endContext ++;
   4608                     }
   4609                 }
   4610 
   4611                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4612                 UnicodeString errorText = "<data>";
   4613                 /***if (strcmp(errorType, "next()") == 0) {
   4614                     startContext = 0;
   4615                     endContext = testText.length();
   4616 
   4617                     printStringBreaks(testText, expected, expectedCount);
   4618                 }***/
   4619 
   4620                 for (ci=startContext; ci<endContext;) {
   4621                     UnicodeString hexChars("0123456789abcdef");
   4622                     UChar32  c;
   4623                     int      bn;
   4624                     c = testText.char32At(ci);
   4625                     if (ci == i) {
   4626                         // This is the location of the error.
   4627                         errorText.append("<?>");
   4628                     } else if (expectedBreaks[ci] != 0) {
   4629                         // This a non-error expected break position.
   4630                         errorText.append("\\");
   4631                     }
   4632                     if (c < 0x10000) {
   4633                         errorText.append("\\u");
   4634                         for (bn=12; bn>=0; bn-=4) {
   4635                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4636                         }
   4637                     } else {
   4638                         errorText.append("\\U");
   4639                         for (bn=28; bn>=0; bn-=4) {
   4640                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4641                         }
   4642                     }
   4643                     ci = testText.moveIndex32(ci, 1);
   4644                 }
   4645                 errorText.append("\\");
   4646                 errorText.append("</data>\n");
   4647 
   4648                 // Output the error
   4649                 char  charErrorTxt[500];
   4650                 UErrorCode status = U_ZERO_ERROR;
   4651                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4652                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4653                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4654                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4655                     errorType, seed, i, charErrorTxt);
   4656                 break;
   4657             }
   4658         }
   4659 
   4660         loopCount++;
   4661     }
   4662 #endif
   4663 }
   4664 
   4665 
   4666 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
   4667 //             This test checks the initial patch,
   4668 //             which is to just keep it from crashing.  Correct word boundaries
   4669 //             await a proper fix to the dictionary code.
   4670 //
   4671 void RBBITest::TestBug5532(void)  {
   4672    // Text includes a mixture of Thai and Latin.
   4673    const unsigned char utf8Data[] = {
   4674            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
   4675            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
   4676            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
   4677            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
   4678            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
   4679            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
   4680            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
   4681            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
   4682            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
   4683            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
   4684            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
   4685 
   4686     UErrorCode status = U_ZERO_ERROR;
   4687     UText utext=UTEXT_INITIALIZER;
   4688     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
   4689     TEST_ASSERT_SUCCESS(status);
   4690 
   4691     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
   4692     TEST_ASSERT_SUCCESS(status);
   4693     if (U_SUCCESS(status)) {
   4694         bi->setText(&utext, status);
   4695         TEST_ASSERT_SUCCESS(status);
   4696 
   4697         int32_t breakCount = 0;
   4698         int32_t previousBreak = -1;
   4699         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
   4700             // For now, just make sure that the break iterator doesn't hang.
   4701             TEST_ASSERT(previousBreak < bi->current());
   4702             previousBreak = bi->current();
   4703         }
   4704         TEST_ASSERT(breakCount > 0);
   4705     }
   4706     delete bi;
   4707     utext_close(&utext);
   4708 }
   4709 
   4710 
   4711 //
   4712 //  TestDebug    -  A place-holder test for debugging purposes.
   4713 //                  For putting in fragments of other tests that can be invoked
   4714 //                  for tracing  without a lot of unwanted extra stuff happening.
   4715 //
   4716 void RBBITest::TestDebug(void) {
   4717 #if 0
   4718     UErrorCode   status = U_ZERO_ERROR;
   4719     int pos = 0;
   4720     int ruleStatus = 0;
   4721 
   4722     RuleBasedBreakIterator* bi =
   4723        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4724        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4725        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4726     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4727     // UnicodeString s("Aaa.  Bcd");
   4728     s = s.unescape();
   4729     bi->setText(s);
   4730     UBool r = bi->isBoundary(8);
   4731     printf("%s", r?"true":"false");
   4732     return;
   4733     pos = bi->last();
   4734     do {
   4735         // ruleStatus = bi->getRuleStatus();
   4736         printf("%d\t%d\n", pos, ruleStatus);
   4737         pos = bi->previous();
   4738     } while (pos != BreakIterator::DONE);
   4739 #endif
   4740 }
   4741 
   4742 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4743