Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2009, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/utypes.h"
     17 #include "unicode/brkiter.h"
     18 #include "unicode/rbbi.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/utf16.h"
     21 #include "unicode/ucnv.h"
     22 #include "unicode/schriter.h"
     23 #include "unicode/uniset.h"
     24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     25 #include "unicode/ustring.h"
     26 #include "unicode/utext.h"
     27 #include "intltest.h"
     28 #include "rbbitst.h"
     29 #include <string.h>
     30 #include "uvector.h"
     31 #include "uvectr32.h"
     32 #include "triedict.h"
     33 #include <string.h>
     34 #include <stdio.h>
     35 #include <stdlib.h>
     36 
     37 #define TEST_ASSERT(x) {if (!(x)) { \
     38     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     39 
     40 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     41     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     42 
     43 
     44 //---------------------------------------------
     45 // runIndexedTest
     46 //---------------------------------------------
     47 
     48 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     49 {
     50     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     51 
     52     switch (index) {
     53 #if !UCONFIG_NO_FILE_IO
     54         case 0: name = "TestBug4153072";
     55             if(exec) TestBug4153072();                         break;
     56 #else
     57         case 0: name = "skip";
     58             break;
     59 #endif
     60 
     61         case 1: name = "TestJapaneseLineBreak";
     62             if(exec) TestJapaneseLineBreak();                  break;
     63         case 2: name = "TestStatusReturn";
     64             if(exec) TestStatusReturn();                       break;
     65 
     66 #if !UCONFIG_NO_FILE_IO
     67         case 3: name = "TestUnicodeFiles";
     68             if(exec) TestUnicodeFiles();                       break;
     69         case 4: name = "TestEmptyString";
     70             if(exec) TestEmptyString();                        break;
     71 #else
     72         case 3: case 4: name = "skip";
     73             break;
     74 #endif
     75 
     76         case 5: name = "TestGetAvailableLocales";
     77             if(exec) TestGetAvailableLocales();                break;
     78 
     79         case 6: name = "TestGetDisplayName";
     80             if(exec) TestGetDisplayName();                     break;
     81 
     82 #if !UCONFIG_NO_FILE_IO
     83         case 7: name = "TestEndBehaviour";
     84             if(exec) TestEndBehaviour();                       break;
     85         case 8: name = "TestMixedThaiLineBreak";
     86              if(exec) TestMixedThaiLineBreak();                break;
     87         case 9: name = "TestThaiLineBreak";
     88              if(exec) TestThaiLineBreak();                     break;
     89         case 10: name = "TestMaiyamok";
     90              if(exec) TestMaiyamok();                          break;
     91         case 11: name = "TestWordBreaks";
     92              if(exec) TestWordBreaks();                        break;
     93         case 12: name = "TestWordBoundary";
     94              if(exec) TestWordBoundary();                      break;
     95         case 13: name = "TestLineBreaks";
     96              if(exec) TestLineBreaks();                        break;
     97         case 14: name = "TestSentBreaks";
     98              if(exec) TestSentBreaks();                        break;
     99         case 15: name = "TestExtended";
    100              if(exec) TestExtended();                          break;
    101 #else
    102         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
    103              break;
    104 #endif
    105 
    106         case 16:
    107              if(exec) {
    108  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    109                name = "TestMonkey";
    110                TestMonkey(params);
    111  #else
    112                name = "skip";
    113  #endif
    114              }
    115                                                                break;
    116 
    117 #if !UCONFIG_NO_FILE_IO
    118         case 17: name = "TestBug3818";
    119             if(exec) TestBug3818();                            break;
    120         case 18: name = "TestJapaneseWordBreak";
    121             if(exec) TestJapaneseWordBreak();                  break;
    122 #else
    123         case 17: case 18: name = "skip";
    124             break;
    125 #endif
    126 
    127         case 19: name = "TestDebug";
    128             if(exec) TestDebug();                              break;
    129         case 20: name = "TestTrieDict";
    130             if(exec) TestTrieDict();                           break;
    131 
    132 #if !UCONFIG_NO_FILE_IO
    133         case 21: name = "TestBug5775";
    134             if (exec) TestBug5775();                           break;
    135         case 22: name = "TestThaiBreaks";
    136             if (exec) TestThaiBreaks();                        break;
    137         case 23: name = "TestTailoredBreaks";
    138             if (exec) TestTailoredBreaks();                    break;
    139 #else
    140         case 21: case 22: case 23: name = "skip";
    141             break;
    142 #endif
    143         case 24: name = "TestDictRules";
    144             if (exec) TestDictRules();                         break;
    145 
    146         default: name = ""; break; //needed to end loop
    147     }
    148 }
    149 
    150 
    151 //---------------------------------------------------------------------------
    152 //
    153 //   class BITestData   Holds a set of Break iterator test data and results
    154 //                      Includes
    155 //                         - the string data to be broken
    156 //                         - a vector of the expected break positions.
    157 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
    159 //                         - The expected break tag values.
    160 //                         - Vectors of actual break positions and tag values.
    161 //                         - Functions for comparing actual with expected and
    162 //                            reporting errors.
    163 //
    164 //----------------------------------------------------------------------------
    165 class BITestData {
    166 public:
    167     UnicodeString    fDataToBreak;
    168     UVector          fExpectedBreakPositions;
    169     UVector          fExpectedTags;
    170     UVector          fLineNum;
    171     UVector          fActualBreakPositions;   // Test Results.
    172     UVector          fActualTags;
    173 
    174     BITestData(UErrorCode &status);
    175     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    176     void             checkResults(const char *heading, RBBITest *test);
    177     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    178     void             clearResults();
    179 };
    180 
    181 //
    182 // Constructor.
    183 //
    184 BITestData::BITestData(UErrorCode &status)
    185 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    186   fActualTags(status)
    187 {
    188 }
    189 
    190 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
    192 //                 The macro form collects the line number, which is helpful
    193 //                 when tracking down failures.
    194 //
    195 //                 A null data item is inserted at the start of each test's data
    196 //                  to put the starting zero into the data list.  The position saved for
    197 //                  each non-null item is its ending position.
    198 //
    199 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    200 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    201     if (U_FAILURE(status)) {return;}
    202     if (data != NULL) {
    203         fDataToBreak.append(CharsToUnicodeString(data));
    204     }
    205     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    206     fExpectedTags.addElement(tag, status);
    207     fLineNum.addElement(lineNum, status);
    208 }
    209 
    210 
    211 //
    212 //  checkResults.   Compare the actual and expected break positions, report any differences.
    213 //
    214 void BITestData::checkResults(const char *heading, RBBITest *test) {
    215     int32_t   expectedIndex = 0;
    216     int32_t   actualIndex = 0;
    217 
    218     for (;;) {
    219         // If we've run through both the expected and actual results vectors, we're done.
    220         //   break out of the loop.
    221         if (expectedIndex >= fExpectedBreakPositions.size() &&
    222             actualIndex   >= fActualBreakPositions.size()) {
    223             break;
    224         }
    225 
    226 
    227         if (expectedIndex >= fExpectedBreakPositions.size()) {
    228             err(heading, test, expectedIndex-1, actualIndex);
    229             actualIndex++;
    230             continue;
    231         }
    232 
    233         if (actualIndex >= fActualBreakPositions.size()) {
    234             err(heading, test, expectedIndex, actualIndex-1);
    235             expectedIndex++;
    236             continue;
    237         }
    238 
    239         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    240             err(heading, test, expectedIndex, actualIndex);
    241             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    242             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    243                 actualIndex++;
    244             } else {
    245                 expectedIndex++;
    246             }
    247             continue;
    248         }
    249 
    250         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    251             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    252                 heading, fLineNum.elementAt(expectedIndex),
    253                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    254         }
    255 
    256         actualIndex++;
    257         expectedIndex++;
    258     }
    259 }
    260 
    261 //
    262 //  err   -  An error was found.  Report it, along with information about where the
    263 //                                incorrectly broken test data appeared in the source file.
    264 //
    265 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    266 {
    267     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    268     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    269     int32_t   o        = 0;
    270     int32_t   line     = fLineNum.elementAti(expectedIdx);
    271     if (expectedIdx > 0) {
    272         // The line numbers are off by one because a premature break occurs somewhere
    273         //    within the previous item, rather than at the start of the current (expected) item.
    274         //    We want to report the offset of the unexpected break from the start of
    275         //      this previous item.
    276         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    277     }
    278     if (actual < expected) {
    279         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    280     } else {
    281         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    282     }
    283 }
    284 
    285 
    286 void BITestData::clearResults() {
    287     fActualBreakPositions.removeAllElements();
    288     fActualTags.removeAllElements();
    289 }
    290 
    291 
    292 //-----------------------------------------------------------------------------------
    293 //
//    Canned Test Characters
    295 //
    296 //-----------------------------------------------------------------------------------
    297 
    298 static const UChar cannedTestArray[] = {
    299     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    300     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    301     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    302     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    303     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    304     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    305     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    306     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
    307 };
    308 
    309 static UnicodeString* cannedTestChars = 0;
    310 
    311 #define  halfNA     "\\u0928\\u094d\\u200d"
    312 #define  halfSA     "\\u0938\\u094d\\u200d"
    313 #define  halfCHA    "\\u091a\\u094d\\u200d"
    314 #define  halfKA     "\\u0915\\u094d\\u200d"
    315 #define  deadTA     "\\u0924\\u094d"
    316 
    317 //--------------------------------------------------------------------------------------
    318 //
    319 //    RBBITest    constructor and destructor
    320 //
    321 //--------------------------------------------------------------------------------------
    322 
    323 RBBITest::RBBITest() {
    324     UnicodeString temp(cannedTestArray);
    325     cannedTestChars = new UnicodeString();
    326     *cannedTestChars += (UChar)0x0000;
    327     *cannedTestChars += temp;
    328 }
    329 
    330 
    331 RBBITest::~RBBITest() {
    332     delete cannedTestChars;
    333 }
    334 
    335 
    336 static const int T_NUMBER = 100;
    337 static const int T_LETTER = 200;
    338 static const int T_H_OR_K = 300;
    339 static const int T_IDEO   = 400;
    340 
    341 
    342 
    343 
    344 
    345 
    346 //--------------------------------------------------------------------
    347 //Testing the BreakIterator for devanagari script
    348 //--------------------------------------------------------------------
    349 
    350 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
    351 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
    352 #define deadTTHA "\\u0920\\u094d"
    353 #define deadPA   "\\u092a\\u094d"
    354 #define deadSA   "\\u0938\\u094d"
    355 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    356 
    357 
    358 
    359 
    360 
    361 
    362 //-----------------------------------------------------------------------------------
    363 //
    364 //   Test for status {tag} return value from break rules.
    365 //        TODO:  a more thorough test.
    366 //
    367 //-----------------------------------------------------------------------------------
    368 void RBBITest::TestStatusReturn() {
    369      UnicodeString rulesString1("$Letters = [:L:];\n"
    370                                   "$Numbers = [:N:];\n"
    371                                   "$Letters+{1};\n"
    372                                   "$Numbers+{2};\n"
    373                                   "Help\\ {4}/me\\!;\n"
    374                                   "[^$Letters $Numbers];\n"
    375                                   "!.*;\n", -1, US_INV);
    376      UnicodeString testString1  = "abc123..abc Help me Help me!";
    377                                 // 01234567890123456789012345678
    378      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    379      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    380 
    381      UErrorCode status=U_ZERO_ERROR;
    382      UParseError    parseError;
    383 
    384      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    385      if(U_FAILURE(status)) {
    386          dataerrln("FAIL : in construction - %s", u_errorName(status));
    387      } else {
    388          int32_t  pos;
    389          int32_t  i = 0;
    390          bi->setText(testString1);
    391          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    392              if (pos != bounds1[i]) {
    393                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    394                  break;
    395              }
    396 
    397              int tag = bi->getRuleStatus();
    398              if (tag != brkStatus[i]) {
    399                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    400                  break;
    401              }
    402              i++;
    403          }
    404      }
    405      delete bi;
    406 }
    407 
    408 
    409 static void printStringBreaks(UnicodeString ustr, int expected[],
    410                               int expectedcount)
    411 {
    412     UErrorCode status = U_ZERO_ERROR;
    413     char name[100];
    414     printf("code    alpha extend alphanum type word sent line name\n");
    415     int j;
    416     for (j = 0; j < ustr.length(); j ++) {
    417         if (expectedcount > 0) {
    418             int k;
    419             for (k = 0; k < expectedcount; k ++) {
    420                 if (j == expected[k]) {
    421                     printf("------------------------------------------------ %d\n",
    422                            j);
    423                 }
    424             }
    425         }
    426         UChar32 c = ustr.char32At(j);
    427         if (c > 0xffff) {
    428             j ++;
    429         }
    430         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    431         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    432                            u_isUAlphabetic(c),
    433                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    434                            u_isalnum(c),
    435                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    436                                                   u_charType(c),
    437                                                   U_SHORT_PROPERTY_NAME),
    438                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    439                                                   u_getIntPropertyValue(c,
    440                                                           UCHAR_WORD_BREAK),
    441                                                   U_SHORT_PROPERTY_NAME),
    442                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    443                                    u_getIntPropertyValue(c,
    444                                            UCHAR_SENTENCE_BREAK),
    445                                    U_SHORT_PROPERTY_NAME),
    446                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    447                                    u_getIntPropertyValue(c,
    448                                            UCHAR_LINE_BREAK),
    449                                    U_SHORT_PROPERTY_NAME),
    450                            name);
    451     }
    452 }
    453 
    454 void RBBITest::TestThaiLineBreak() {
    455     UErrorCode status = U_ZERO_ERROR;
    456     BITestData thaiLineSelection(status);
    457 
    458     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    459     // represents elided letters at the end of a long word.  It should be bound to
    460     // the end of the word and not treated as an independent punctuation mark.
    461 
    462 
    463     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    464     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    465     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    466     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    467     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    468 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    469 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    470     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    471     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    472     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    473     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    474     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    475     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    476     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    477     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    478 
    479     // the one time where the paiyannoi occurs somewhere other than at the end
    480     // of a word is in the Thai abbrevation for "etc.", which both begins and
    481     // ends with a paiyannoi
    482     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    483     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    484     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    485 
    486     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    487         Locale("th"), status);
    488     if (U_FAILURE(status))
    489     {
    490         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    491         return;
    492     }
    493 
    494     generalIteratorTest(*e, thaiLineSelection);
    495     delete e;
    496 }
    497 
    498 
    499 
    500 void RBBITest::TestMixedThaiLineBreak()
    501 {
    502     UErrorCode   status = U_ZERO_ERROR;
    503     BITestData   thaiLineSelection(status);
    504 
    505     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    506 
    507 
    508     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    509     // start
    510 
    511     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    512     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    513     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    514     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    515     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    516     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    517     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    518     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    519     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    520     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    521     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    522     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    523     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    524     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    525     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    526     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    527 
    528     // @suwit - end of changes
    529 
    530 
    531     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    532     if (U_FAILURE(status))
    533     {
    534         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    535         return;
    536     }
    537 
    538 
    539     generalIteratorTest(*e, thaiLineSelection);
    540     delete e;
    541 }
    542 
    543 
    544 void RBBITest::TestMaiyamok()
    545 {
    546     UErrorCode status = U_ZERO_ERROR;
    547     BITestData   thaiLineSelection(status);
    548     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    549     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    550     // word".  Instead of appearing as a word unto itself, however, it's kept together
    551     // with the word before it
    552     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    553     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    554     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    555     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    556     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    557     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    558     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    559     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    560     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    561 
    562     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    563         Locale("th"), status);
    564 
    565     if (U_FAILURE(status))
    566     {
    567         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    568         return;
    569     }
    570     generalIteratorTest(*e, thaiLineSelection);
    571     delete e;
    572 }
    573 
    574 
    575 
    576 void RBBITest::TestBug3818() {
    577     UErrorCode  status = U_ZERO_ERROR;
    578 
    579     // Four Thai words...
    580     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    581                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    582     UnicodeString  thaiStr(thaiWordData);
    583 
    584     RuleBasedBreakIterator* bi =
    585         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    586     if (U_FAILURE(status) || bi == NULL) {
    587         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    588         return;
    589     }
    590     bi->setText(thaiStr);
    591 
    592     int32_t  startOfSecondWord = bi->following(1);
    593     if (startOfSecondWord != 4) {
    594         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    595             __FILE__, __LINE__, startOfSecondWord);
    596     }
    597     startOfSecondWord = bi->following(0);
    598     if (startOfSecondWord != 4) {
    599         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    600             __FILE__, __LINE__, startOfSecondWord);
    601     }
    602     delete bi;
    603 }
    604 
    605 
    606 void RBBITest::TestJapaneseWordBreak() {
    607     UErrorCode status = U_ZERO_ERROR;
    608     BITestData   japaneseWordSelection(status);
    609 
    610     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    611     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    612     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    613     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    614     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    615     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    616     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
    617 
    618     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
    619         Locale("ja"), status);
    620     if (U_FAILURE(status))
    621     {
    622         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
    623         return;
    624     }
    625 
    626     generalIteratorTest(*e, japaneseWordSelection);
    627     delete e;
    628 }
    629 
    630 void RBBITest::TestTrieDict() {
    631     UErrorCode      status  = U_ZERO_ERROR;
    632 
    633     //
    634     //  Open and read the test data file.
    635     //
    636     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    637     char testFileName[1000];
    638     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    639         errln("Can't open test data.  Path too long.");
    640         return;
    641     }
    642     strcpy(testFileName, testDataDirectory);
    643     strcat(testFileName, "riwords.txt");
    644 
    645     // Items needing deleting at the end
    646     MutableTrieDictionary *mutableDict = NULL;
    647     CompactTrieDictionary *compactDict = NULL;
    648     UnicodeSet            *breaks      = NULL;
    649     UChar                 *testFile    = NULL;
    650     StringEnumeration     *enumer1     = NULL;
    651     StringEnumeration     *enumer2     = NULL;
    652     MutableTrieDictionary *mutable2    = NULL;
    653     StringEnumeration     *cloneEnum   = NULL;
    654     CompactTrieDictionary *compact2    = NULL;
    655 
    656 
    657     const UnicodeString *originalWord = NULL;
    658     const UnicodeString *cloneWord    = NULL;
    659     UChar *current;
    660     UChar *word;
    661     UChar uc;
    662     int32_t wordLen;
    663     int32_t wordCount;
    664     int32_t testCount;
    665 
    666     int    len;
    667     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    668     if (U_FAILURE(status)) {
    669         goto cleanup; /* something went wrong, error already output */
    670     }
    671 
    672     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    673     if (U_FAILURE(status)) {
    674         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    675         goto cleanup;
    676     }
    677 
    678     breaks = new UnicodeSet;
    679     breaks->add(0x000A);     // Line Feed
    680     breaks->add(0x000D);     // Carriage Return
    681     breaks->add(0x2028);     // Line Separator
    682     breaks->add(0x2029);     // Paragraph Separator
    683 
    684     // Now add each non-comment line of the file as a word.
    685     current = testFile;
    686     word = current;
    687     uc = *current++;
    688     wordLen = 0;
    689     wordCount = 0;
    690 
    691     while (uc) {
    692         if (uc == 0x0023) {     // #comment line, skip
    693             while (uc && !breaks->contains(uc)) {
    694                 uc = *current++;
    695             }
    696         }
    697         else while (uc && !breaks->contains(uc)) {
    698             ++wordLen;
    699             uc = *current++;
    700         }
    701         if (wordLen > 0) {
    702             mutableDict->addWord(word, wordLen, status);
    703             if (U_FAILURE(status)) {
    704                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    705                 goto cleanup;
    706             }
    707             wordCount += 1;
    708         }
    709 
    710         // Find beginning of next line
    711         while (uc && breaks->contains(uc)) {
    712             uc = *current++;
    713         }
    714         word = current-1;
    715         wordLen = 0;
    716     }
    717 
    718     if (wordCount < 50) {
    719         errln("Word count (%d) unreasonably small\n", wordCount);
    720         goto cleanup;
    721     }
    722 
    723     enumer1 = mutableDict->openWords(status);
    724     if (U_FAILURE(status)) {
    725         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    726         goto cleanup;
    727     }
    728 
    729     testCount = 0;
    730     if (wordCount != (testCount = enumer1->count(status))) {
    731         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    732             testCount, wordCount, u_errorName(status));
    733         goto cleanup;
    734     }
    735 
    736     // Now compact it
    737     compactDict = new CompactTrieDictionary(*mutableDict, status);
    738     if (U_FAILURE(status)) {
    739         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    740         goto cleanup;
    741     }
    742 
    743     enumer2 = compactDict->openWords(status);
    744     if (U_FAILURE(status)) {
    745         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    746         goto cleanup;
    747     }
    748 
    749     if (wordCount != (testCount = enumer2->count(status))) {
    750         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    751             testCount, wordCount, u_errorName(status));
    752         goto cleanup;
    753     }
    754 
    755     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
    756         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
    757     }
    758     delete enumer1;
    759     enumer1 = NULL;
    760     delete enumer2;
    761     enumer2 = NULL;
    762 
    763     // Now un-compact it
    764     mutable2 = compactDict->cloneMutable(status);
    765     if (U_FAILURE(status)) {
    766         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    767         goto cleanup;
    768     }
    769 
    770     cloneEnum = mutable2->openWords(status);
    771     if (U_FAILURE(status)) {
    772         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    773         goto cleanup;
    774     }
    775 
    776     if (wordCount != (testCount = cloneEnum->count(status))) {
    777         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    778             testCount, wordCount, u_errorName(status));
    779         goto cleanup;
    780     }
    781 
    782     // Compact original dictionary to clone. Note that we can only compare the same kind of
    783     // dictionary as the order of the enumerators is not guaranteed to be the same between
    784     // different kinds
    785     enumer1 = mutableDict->openWords(status);
    786     if (U_FAILURE(status)) {
    787         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    788         goto cleanup;
    789      }
    790 
    791     originalWord = enumer1->snext(status);
    792     cloneWord = cloneEnum->snext(status);
    793     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    794         if (*originalWord != *cloneWord) {
    795             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    796             goto cleanup;
    797         }
    798         originalWord = enumer1->snext(status);
    799         cloneWord = cloneEnum->snext(status);
    800     }
    801 
    802     if (U_FAILURE(status)) {
    803         errln("Enumeration failed: %s\n", u_errorName(status));
    804         goto cleanup;
    805     }
    806 
    807     if (originalWord != cloneWord) {
    808         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    809         goto cleanup;
    810     }
    811 
    812     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    813     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    814     if (U_FAILURE(status)) {
    815         errln("CompactTrieDictionary(const void *,...) failed\n");
    816         goto cleanup;
    817     }
    818 
    819     if (compact2->dataSize() == 0) {
    820         errln("CompactTrieDictionary->dataSize() == 0\n");
    821         goto cleanup;
    822     }
    823 
    824     // Now count the words via the second dictionary
    825     delete enumer1;
    826     enumer1 = compact2->openWords(status);
    827     if (U_FAILURE(status)) {
    828         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    829         goto cleanup;
    830     }
    831 
    832     if (wordCount != (testCount = enumer1->count(status))) {
    833         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    834             testCount, wordCount, u_errorName(status));
    835         goto cleanup;
    836     }
    837 
    838 cleanup:
    839     delete compactDict;
    840     delete mutableDict;
    841     delete breaks;
    842     delete[] testFile;
    843     delete enumer1;
    844     delete mutable2;
    845     delete cloneEnum;
    846     delete compact2;
    847 }
    848 
    849 
    850 //----------------------------------------------------------------------------
    851 //
    852 // generalIteratorTest      Given a break iterator and a set of test data,
    853 //                          Run the tests and report the results.
    854 //
    855 //----------------------------------------------------------------------------
    856 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    857 {
    858 
    859     bi.setText(td.fDataToBreak);
    860 
    861     testFirstAndNext(bi, td);
    862 
    863     testLastAndPrevious(bi, td);
    864 
    865     testFollowing(bi, td);
    866     testPreceding(bi, td);
    867     testIsBoundary(bi, td);
    868     doMultipleSelectionTest(bi, td);
    869 }
    870 
    871 
    872 //
    873 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    874 //                       kind of loop.
    875 //
    876 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    877 {
    878     UErrorCode  status = U_ZERO_ERROR;
    879     int32_t     p;
    880     int32_t     lastP = -1;
    881     int32_t     tag;
    882 
    883     logln("Test first and next");
    884     bi.setText(td.fDataToBreak);
    885     td.clearResults();
    886 
    887     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    888         td.fActualBreakPositions.addElement(p, status);  // Save result.
    889         tag = bi.getRuleStatus();
    890         td.fActualTags.addElement(tag, status);
    891         if (p <= lastP) {
    892             // If the iterator is not making forward progress, stop.
    893             //  No need to raise an error here, it'll be detected in the normal check of results.
    894             break;
    895         }
    896         lastP = p;
    897     }
    898     td.checkResults("testFirstAndNext", this);
    899 }
    900 
    901 
    902 //
    903 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    904 //
    905 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    906 {
    907     UErrorCode  status = U_ZERO_ERROR;
    908     int32_t     p;
    909     int32_t     lastP  = 0x7ffffffe;
    910     int32_t     tag;
    911 
    912     logln("Test last and previous");
    913     bi.setText(td.fDataToBreak);
    914     td.clearResults();
    915 
    916     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    917         // Save break position.  Insert it at start of vector of results, shoving
    918         //    already-saved results further towards the end.
    919         td.fActualBreakPositions.insertElementAt(p, 0, status);
    920         // bi.previous();   // TODO:  Why does this fix things up????
    921         // bi.next();
    922         tag = bi.getRuleStatus();
    923         td.fActualTags.insertElementAt(tag, 0, status);
    924         if (p >= lastP) {
    925             // If the iterator is not making progress, stop.
    926             //  No need to raise an error here, it'll be detected in the normal check of results.
    927             break;
    928         }
    929         lastP = p;
    930     }
    931     td.checkResults("testLastAndPrevious", this);
    932 }
    933 
    934 
    935 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    936 {
    937     UErrorCode  status = U_ZERO_ERROR;
    938     int32_t     p;
    939     int32_t     tag;
    940     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    941                                  //   cannot be -1; that is returned for DONE.
    942     int         i;
    943 
    944     logln("testFollowing():");
    945     bi.setText(td.fDataToBreak);
    946     td.clearResults();
    947 
    948     // Save the starting point, since we won't get that out of following.
    949     p = bi.first();
    950     td.fActualBreakPositions.addElement(p, status);  // Save result.
    951     tag = bi.getRuleStatus();
    952     td.fActualTags.addElement(tag, status);
    953 
    954     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    955         p = bi.following(i);
    956         if (p != lastP) {
    957             if (p == RuleBasedBreakIterator::DONE) {
    958                 break;
    959             }
    960             // We've reached a new break position.  Save it.
    961             td.fActualBreakPositions.addElement(p, status);  // Save result.
    962             tag = bi.getRuleStatus();
    963             td.fActualTags.addElement(tag, status);
    964             lastP = p;
    965         }
    966     }
    967     // The loop normally exits by means of the break in the middle.
    968     // Make sure that the index was at the correct position for the break iterator to have
    969     //   returned DONE.
    970     if (i != td.fDataToBreak.length()) {
    971         errln("testFollowing():  iterator returned DONE prematurely.");
    972     }
    973 
    974     // Full check of all results.
    975     td.checkResults("testFollowing", this);
    976 }
    977 
    978 
    979 
    980 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    981     UErrorCode  status = U_ZERO_ERROR;
    982     int32_t     p;
    983     int32_t     tag;
    984     int32_t     lastP  = 0x7ffffffe;
    985     int         i;
    986 
    987     logln("testPreceding():");
    988     bi.setText(td.fDataToBreak);
    989     td.clearResults();
    990 
    991     p = bi.last();
    992     td.fActualBreakPositions.addElement(p, status);
    993     tag = bi.getRuleStatus();
    994     td.fActualTags.addElement(tag, status);
    995 
    996     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    997         p = bi.preceding(i);
    998         if (p != lastP) {
    999             if (p == RuleBasedBreakIterator::DONE) {
   1000                 break;
   1001             }
   1002             // We've reached a new break position.  Save it.
   1003             td.fActualBreakPositions.insertElementAt(p, 0, status);
   1004             lastP = p;
   1005             tag = bi.getRuleStatus();
   1006             td.fActualTags.insertElementAt(tag, 0, status);
   1007         }
   1008     }
   1009     // The loop normally exits by means of the break in the middle.
   1010     // Make sure that the index was at the correct position for the break iterator to have
   1011     //   returned DONE.
   1012     if (i != 0) {
   1013         errln("testPreceding():  iterator returned DONE prematurely.");
   1014     }
   1015 
   1016     // Full check of all results.
   1017     td.checkResults("testPreceding", this);
   1018 }
   1019 
   1020 
   1021 
   1022 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1023     UErrorCode  status = U_ZERO_ERROR;
   1024     int         i;
   1025     int32_t     tag;
   1026 
   1027     logln("testIsBoundary():");
   1028     bi.setText(td.fDataToBreak);
   1029     td.clearResults();
   1030 
   1031     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1032         if (bi.isBoundary(i)) {
   1033             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1034             tag = bi.getRuleStatus();
   1035             td.fActualTags.addElement(tag, status);
   1036         }
   1037     }
   1038     td.checkResults("testIsBoundary: ", this);
   1039 }
   1040 
   1041 
   1042 
   1043 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1044 {
   1045     iterator.setText(td.fDataToBreak);
   1046 
   1047     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1048     int32_t offset = iterator.first();
   1049     int32_t testOffset;
   1050     int32_t count = 0;
   1051 
   1052     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1053 
   1054     if (*testIterator != iterator)
   1055         errln("clone() or operator!= failed: two clones compared unequal");
   1056 
   1057     do {
   1058         testOffset = testIterator->first();
   1059         testOffset = testIterator->next(count);
   1060         if (offset != testOffset)
   1061             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1062 
   1063         if (offset != RuleBasedBreakIterator::DONE) {
   1064             count++;
   1065             offset = iterator.next();
   1066 
   1067             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1068                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1069                 if (count > 10000 || offset == -1) {
   1070                     errln("operator== failed too many times. Stopping test.");
   1071                     if (offset == -1) {
   1072                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1073                     }
   1074                     return;
   1075                 }
   1076             }
   1077         }
   1078     } while (offset != RuleBasedBreakIterator::DONE);
   1079 
   1080     // now do it backwards...
   1081     offset = iterator.last();
   1082     count = 0;
   1083 
   1084     do {
   1085         testOffset = testIterator->last();
   1086         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1087         if (offset != testOffset)
   1088             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1089 
   1090         if (offset != RuleBasedBreakIterator::DONE) {
   1091             count--;
   1092             offset = iterator.previous();
   1093         }
   1094     } while (offset != RuleBasedBreakIterator::DONE);
   1095 
   1096     delete testIterator;
   1097 }
   1098 
   1099 
   1100 //---------------------------------------------
   1101 //
   1102 //     other tests
   1103 //
   1104 //---------------------------------------------
   1105 void RBBITest::TestEmptyString()
   1106 {
   1107     UnicodeString text = "";
   1108     UErrorCode status = U_ZERO_ERROR;
   1109 
   1110     BITestData x(status);
   1111     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1112     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1113     if (U_FAILURE(status))
   1114     {
   1115         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1116         return;
   1117     }
   1118     generalIteratorTest(*bi, x);
   1119     delete bi;
   1120 }
   1121 
   1122 void RBBITest::TestGetAvailableLocales()
   1123 {
   1124     int32_t locCount = 0;
   1125     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1126 
   1127     if (locCount == 0)
   1128         dataerrln("getAvailableLocales() returned an empty list!");
   1129     // Just make sure that it's returning good memory.
   1130     int32_t i;
   1131     for (i = 0; i < locCount; ++i) {
   1132         logln(locList[i].getName());
   1133     }
   1134 }
   1135 
   1136 //Testing the BreakIterator::getDisplayName() function
   1137 void RBBITest::TestGetDisplayName()
   1138 {
   1139     UnicodeString   result;
   1140 
   1141     BreakIterator::getDisplayName(Locale::getUS(), result);
   1142     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1143         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1144                 + result);
   1145 
   1146     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1147     if (result != "French (France)")
   1148         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1149                 + result);
   1150 }
   1151 /**
   1152  * Test End Behaviour
   1153  * @bug 4068137
   1154  */
   1155 void RBBITest::TestEndBehaviour()
   1156 {
   1157     UErrorCode status = U_ZERO_ERROR;
   1158     UnicodeString testString("boo.");
   1159     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1160     if (U_FAILURE(status))
   1161     {
   1162         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1163         return;
   1164     }
   1165     wb->setText(testString);
   1166 
   1167     if (wb->first() != 0)
   1168         errln("Didn't get break at beginning of string.");
   1169     if (wb->next() != 3)
   1170         errln("Didn't get break before period in \"boo.\"");
   1171     if (wb->current() != 4 && wb->next() != 4)
   1172         errln("Didn't get break at end of string.");
   1173     delete wb;
   1174 }
   1175 /*
   1176  * @bug 4153072
   1177  */
   1178 void RBBITest::TestBug4153072() {
   1179     UErrorCode status = U_ZERO_ERROR;
   1180     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1181     if (U_FAILURE(status))
   1182     {
   1183         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1184         return;
   1185     }
   1186     UnicodeString str("...Hello, World!...");
   1187     int32_t begin = 3;
   1188     int32_t end = str.length() - 3;
   1189     UBool onBoundary;
   1190 
   1191     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1192     iter->adoptText(textIterator);
   1193     int index;
   1194     // Note: with the switch to UText, there is no way to restrict the
   1195     //       iteration range to begin at an index other than zero.
   1196     //       String character iterators created with a non-zero bound are
   1197     //         treated by RBBI as being empty.
   1198     for (index = -1; index < begin + 1; ++index) {
   1199         onBoundary = iter->isBoundary(index);
   1200         if (index == 0?  !onBoundary : onBoundary) {
   1201             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1202                             " and begin index = " + begin);
   1203         }
   1204     }
   1205     delete iter;
   1206 }
   1207 
   1208 
   1209 //
   1210 // Test for problem reported by Ashok Matoria on 9 July 2007
   1211 //    One.<kSoftHyphen><kSpace>Two.
   1212 //
   1213 //    Sentence break at start (0) and then on calling next() it breaks at
   1214 //   'T' of "Two". Now, at this point if I do next() and
   1215 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1216 //
   1217 void RBBITest::TestBug5775() {
   1218     UErrorCode status = U_ZERO_ERROR;
   1219     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1220     TEST_ASSERT_SUCCESS(status);
   1221     if (U_FAILURE(status)) {
   1222         return;
   1223     }
   1224 // Check for status first for better handling of no data errors.
   1225     TEST_ASSERT(bi != NULL);
   1226     if (bi == NULL) {
   1227         return;
   1228     }
   1229 
   1230     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1231     //               01234      56789
   1232     s = s.unescape();
   1233     bi->setText(s);
   1234     int pos = bi->next();
   1235     TEST_ASSERT(pos == 6);
   1236     pos = bi->next();
   1237     TEST_ASSERT(pos == 10);
   1238     pos = bi->previous();
   1239     TEST_ASSERT(pos == 6);
   1240     delete bi;
   1241 }
   1242 
   1243 
   1244 
   1245 /**
   1246  * Test Japanese Line Break
   1247  * @bug 4095322
   1248  */
   1249 void RBBITest::TestJapaneseLineBreak()
   1250 {
   1251 #if 0
   1252     // Test needs updating some more...   Dump it for now.
   1253 
   1254 
   1255     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
   1256     //        as opening and closing punctuation for line breaking.
   1257     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
   1258     //        from these tests.    6-13-2002
   1259     //
   1260     UErrorCode status = U_ZERO_ERROR;
   1261     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
   1262     UnicodeString precedingChars = CharsToUnicodeString(
   1263         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
   1264         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
   1265     UnicodeString followingChars = CharsToUnicodeString(
   1266         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
   1267         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
   1268         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
   1269         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
   1270         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
   1271     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
   1272 
   1273     int32_t i;
   1274     if (U_FAILURE(status))
   1275     {
   1276         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
   1277         return;
   1278     }
   1279 
   1280     for (i = 0; i < precedingChars.length(); i++) {
   1281         testString.setCharAt(1, precedingChars[i]);
   1282         iter->setText(testString);
   1283         int32_t j = iter->first();
   1284         if (j != 0)
   1285             errln("ja line break failure: failed to start at 0");
   1286         j = iter->next();
   1287         if (j != 1)
   1288             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
   1289                         + "' (" + ((int)(precedingChars[i])) + ")");
   1290         j = iter->next();
   1291         if (j != 3)
   1292             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
   1293                         + "' (" + ((int)(precedingChars[i])) + ")");
   1294     }
   1295 
   1296     for (i = 0; i < followingChars.length(); i++) {
   1297         testString.setCharAt(1, followingChars[i]);
   1298         iter->setText(testString);
   1299         int j = iter->first();
   1300         if (j != 0)
   1301             errln("ja line break failure: failed to start at 0");
   1302         j = iter->next();
   1303         if (j != 2)
   1304             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
   1305                         + "' (" + ((int)(followingChars[i])) + ")");
   1306         j = iter->next();
   1307         if (j != 3)
   1308             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
   1309                         + "' (" + ((int)(followingChars[i])) + ")");
   1310     }
   1311     delete iter;
   1312 #endif
   1313 }
   1314 
   1315 
   1316 //------------------------------------------------------------------------------
   1317 //
   1318 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1319 //
   1320 //------------------------------------------------------------------------------
   1321 
   1322 struct TestParams {
   1323     BreakIterator   *bi;
   1324     UnicodeString    dataToBreak;
   1325     UVector32       *expectedBreaks;
   1326     UVector32       *srcLine;
   1327     UVector32       *srcCol;
   1328 };
   1329 
   1330 void RBBITest::executeTest(TestParams *t) {
   1331     int32_t    bp;
   1332     int32_t    prevBP;
   1333     int32_t    i;
   1334 
   1335     if (t->bi == NULL) {
   1336         return;
   1337     }
   1338 
   1339     t->bi->setText(t->dataToBreak);
   1340     //
   1341     //  Run the iterator forward
   1342     //
   1343     prevBP = -1;
   1344     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1345         if (prevBP ==  bp) {
   1346             // Fail for lack of forward progress.
   1347             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1348                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1349             break;
   1350         }
   1351 
   1352         // Check that there were we didn't miss an expected break between the last one
   1353         //  and this one.
   1354         for (i=prevBP+1; i<bp; i++) {
   1355             if (t->expectedBreaks->elementAti(i) != 0) {
   1356                 int expected[] = {0, i};
   1357                 printStringBreaks(t->dataToBreak, expected, 2);
   1358                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1359                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1360             }
   1361         }
   1362 
   1363         // Check that the break we did find was expected
   1364         if (t->expectedBreaks->elementAti(bp) == 0) {
   1365             int expected[] = {0, bp};
   1366             printStringBreaks(t->dataToBreak, expected, 2);
   1367             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1368                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1369         } else {
   1370             // The break was expected.
   1371             //   Check that the {nnn} tag value is correct.
   1372             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1373             if (expectedTagVal == -1) {
   1374                 expectedTagVal = 0;
   1375             }
   1376             int32_t line = t->srcLine->elementAti(bp);
   1377             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1378             if (rs != expectedTagVal) {
   1379                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1380                       "          Actual, Expected status = %4d, %4d",
   1381                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1382             }
   1383         }
   1384 
   1385 
   1386         prevBP = bp;
   1387     }
   1388 
   1389     // Verify that there were no missed expected breaks after the last one found
   1390     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1391         if (t->expectedBreaks->elementAti(i) != 0) {
   1392             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1393                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1394         }
   1395     }
   1396 
   1397     //
   1398     //  Run the iterator backwards, verify that the same breaks are found.
   1399     //
   1400     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1401     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1402         if (prevBP ==  bp) {
   1403             // Fail for lack of progress.
   1404             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1405                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1406             break;
   1407         }
   1408 
   1409         // Check that there were we didn't miss an expected break between the last one
   1410         //  and this one.  (UVector returns zeros for index out of bounds.)
   1411         for (i=prevBP-1; i>bp; i--) {
   1412             if (t->expectedBreaks->elementAti(i) != 0) {
   1413                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1414                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1415             }
   1416         }
   1417 
   1418         // Check that the break we did find was expected
   1419         if (t->expectedBreaks->elementAti(bp) == 0) {
   1420             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1421                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1422         } else {
   1423             // The break was expected.
   1424             //   Check that the {nnn} tag value is correct.
   1425             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1426             if (expectedTagVal == -1) {
   1427                 expectedTagVal = 0;
   1428             }
   1429             int line = t->srcLine->elementAti(bp);
   1430             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1431             if (rs != expectedTagVal) {
   1432                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1433                       "          Actual, Expected status = %4d, %4d",
   1434                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1435             }
   1436         }
   1437 
   1438         prevBP = bp;
   1439     }
   1440 
   1441     // Verify that there were no missed breaks prior to the last one found
   1442     for (i=prevBP-1; i>=0; i--) {
   1443         if (t->expectedBreaks->elementAti(i) != 0) {
   1444             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1445                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1446         }
   1447     }
   1448 }
   1449 
   1450 
   1451 void RBBITest::TestExtended() {
   1452 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1453     UErrorCode      status  = U_ZERO_ERROR;
   1454     Locale          locale("");
   1455 
   1456     UnicodeString       rules;
   1457     TestParams          tp;
   1458     tp.bi             = NULL;
   1459     tp.expectedBreaks = new UVector32(status);
   1460     tp.srcLine        = new UVector32(status);
   1461     tp.srcCol         = new UVector32(status);
   1462 
   1463     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1464     if (U_FAILURE(status)) {
   1465         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1466     }
   1467 
   1468 
   1469     //
   1470     //  Open and read the test data file.
   1471     //
   1472     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1473     char testFileName[1000];
   1474     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1475         errln("Can't open test data.  Path too long.");
   1476         return;
   1477     }
   1478     strcpy(testFileName, testDataDirectory);
   1479     strcat(testFileName, "rbbitst.txt");
   1480 
   1481     int    len;
   1482     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1483     if (U_FAILURE(status)) {
   1484         return; /* something went wrong, error already output */
   1485     }
   1486 
   1487 
   1488 
   1489 
   1490     //
   1491     //  Put the test data into a UnicodeString
   1492     //
   1493     UnicodeString testString(FALSE, testFile, len);
   1494 
   1495     enum EParseState{
   1496         PARSE_COMMENT,
   1497         PARSE_TAG,
   1498         PARSE_DATA,
   1499         PARSE_NUM
   1500     }
   1501     parseState = PARSE_TAG;
   1502 
   1503     EParseState savedState = PARSE_TAG;
   1504 
   1505     static const UChar CH_LF        = 0x0a;
   1506     static const UChar CH_CR        = 0x0d;
   1507     static const UChar CH_HASH      = 0x23;
   1508     /*static const UChar CH_PERIOD    = 0x2e;*/
   1509     static const UChar CH_LT        = 0x3c;
   1510     static const UChar CH_GT        = 0x3e;
   1511     static const UChar CH_BACKSLASH = 0x5c;
   1512     static const UChar CH_BULLET    = 0x2022;
   1513 
   1514     int32_t    lineNum  = 1;
   1515     int32_t    colStart = 0;
   1516     int32_t    column   = 0;
   1517     int32_t    charIdx  = 0;
   1518 
   1519     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1520 
   1521     for (charIdx = 0; charIdx < len; ) {
   1522         status = U_ZERO_ERROR;
   1523         UChar  c = testString.charAt(charIdx);
   1524         charIdx++;
   1525         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1526             // treat CRLF as a unit
   1527             c = CH_LF;
   1528             charIdx++;
   1529         }
   1530         if (c == CH_LF || c == CH_CR) {
   1531             lineNum++;
   1532             colStart = charIdx;
   1533         }
   1534         column = charIdx - colStart + 1;
   1535 
   1536         switch (parseState) {
   1537         case PARSE_COMMENT:
   1538             if (c == 0x0a || c == 0x0d) {
   1539                 parseState = savedState;
   1540             }
   1541             break;
   1542 
   1543         case PARSE_TAG:
   1544             {
   1545             if (c == CH_HASH) {
   1546                 parseState = PARSE_COMMENT;
   1547                 savedState = PARSE_TAG;
   1548                 break;
   1549             }
   1550             if (u_isUWhiteSpace(c)) {
   1551                 break;
   1552             }
   1553             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1554                 delete tp.bi;
   1555                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1556                 charIdx += 5;
   1557                 break;
   1558             }
   1559             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1560                 delete tp.bi;
   1561                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1562                 charIdx += 5;
   1563                 break;
   1564             }
   1565             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1566                 delete tp.bi;
   1567                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1568                 charIdx += 5;
   1569                 break;
   1570             }
   1571             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1572                 delete tp.bi;
   1573                 tp.bi = NULL;
   1574                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1575                 charIdx += 5;
   1576                 break;
   1577             }
   1578             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1579                 delete tp.bi;
   1580                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1581                 charIdx += 6;
   1582                 break;
   1583             }
   1584 
   1585             // <locale  loc_name>
   1586             localeMatcher.reset(testString);
   1587             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1588                 UnicodeString localeName = localeMatcher.group(1, status);
   1589                 char localeName8[100];
   1590                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1591                 locale = Locale::createFromName(localeName8);
   1592                 charIdx += localeMatcher.group(0, status).length();
   1593                 TEST_ASSERT_SUCCESS(status);
   1594                 break;
   1595             }
   1596             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1597                 parseState = PARSE_DATA;
   1598                 charIdx += 5;
   1599                 tp.dataToBreak = "";
   1600                 tp.expectedBreaks->removeAllElements();
   1601                 tp.srcCol ->removeAllElements();
   1602                 tp.srcLine->removeAllElements();
   1603                 break;
   1604             }
   1605 
   1606             errln("line %d: Tag expected in test file.", lineNum);
   1607             parseState = PARSE_COMMENT;
   1608             savedState = PARSE_DATA;
   1609             goto end_test; // Stop the test.
   1610             }
   1611             break;
   1612 
   1613         case PARSE_DATA:
   1614             if (c == CH_BULLET) {
   1615                 int32_t  breakIdx = tp.dataToBreak.length();
   1616                 tp.expectedBreaks->setSize(breakIdx+1);
   1617                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1618                 tp.srcLine->setSize(breakIdx+1);
   1619                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1620                 tp.srcCol ->setSize(breakIdx+1);
   1621                 tp.srcCol ->setElementAt(column, breakIdx);
   1622                 break;
   1623             }
   1624 
   1625             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1626                 // Add final entry to mappings from break location to source file position.
   1627                 //  Need one extra because last break position returned is after the
   1628                 //    last char in the data, not at the last char.
   1629                 tp.srcLine->addElement(lineNum, status);
   1630                 tp.srcCol ->addElement(column, status);
   1631 
   1632                 parseState = PARSE_TAG;
   1633                 charIdx += 6;
   1634 
   1635                 // RUN THE TEST!
   1636                 executeTest(&tp);
   1637                 break;
   1638             }
   1639 
   1640             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1641                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1642                 // Get the code point from the name and insert it into the test data.
   1643                 //   (Damn, no API takes names in Unicode  !!!
   1644                 //    we've got to take it back to char *)
   1645                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1646                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1647                 char charNameBuf[200];
   1648                 UChar32 theChar = -1;
   1649                 if (nameEndIdx != -1) {
   1650                     UErrorCode status = U_ZERO_ERROR;
   1651                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1652                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1653                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1654                     if (U_FAILURE(status)) {
   1655                         theChar = -1;
   1656                     }
   1657                 }
   1658                 if (theChar == -1) {
   1659                     errln("Error in named character in test file at line %d, col %d",
   1660                         lineNum, column);
   1661                 } else {
   1662                     // Named code point was recognized.  Insert it
   1663                     //   into the test data.
   1664                     tp.dataToBreak.append(theChar);
   1665                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1666                         tp.srcLine->addElement(lineNum, status);
   1667                         tp.srcCol ->addElement(column, status);
   1668                     }
   1669                 }
   1670                 if (nameEndIdx > charIdx) {
   1671                     charIdx = nameEndIdx+1;
   1672 
   1673                 }
   1674                 break;
   1675             }
   1676 
   1677 
   1678 
   1679 
   1680             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1681                 charIdx++;
   1682                 int32_t  breakIdx = tp.dataToBreak.length();
   1683                 tp.expectedBreaks->setSize(breakIdx+1);
   1684                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1685                 tp.srcLine->setSize(breakIdx+1);
   1686                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1687                 tp.srcCol ->setSize(breakIdx+1);
   1688                 tp.srcCol ->setElementAt(column, breakIdx);
   1689                 break;
   1690             }
   1691 
   1692             if (c == CH_LT) {
   1693                 tagValue   = 0;
   1694                 parseState = PARSE_NUM;
   1695                 break;
   1696             }
   1697 
   1698             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1699                 parseState = PARSE_COMMENT;
   1700                 savedState = PARSE_DATA;
   1701                 break;
   1702             }
   1703 
   1704             if (c == CH_BACKSLASH) {
   1705                 // Check for \ at end of line, a line continuation.
   1706                 //     Advance over (discard) the newline
   1707                 UChar32 cp = testString.char32At(charIdx);
   1708                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1709                     // We have a CR LF
   1710                     //  Need an extra increment of the input ptr to move over both of them
   1711                     charIdx++;
   1712                 }
   1713                 if (cp == CH_LF || cp == CH_CR) {
   1714                     lineNum++;
   1715                     colStart = charIdx;
   1716                     charIdx++;
   1717                     break;
   1718                 }
   1719 
   1720                 // Let unescape handle the back slash.
   1721                 cp = testString.unescapeAt(charIdx);
   1722                 if (cp != -1) {
   1723                     // Escape sequence was recognized.  Insert the char
   1724                     //   into the test data.
   1725                     tp.dataToBreak.append(cp);
   1726                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1727                         tp.srcLine->addElement(lineNum, status);
   1728                         tp.srcCol ->addElement(column, status);
   1729                     }
   1730                     break;
   1731                 }
   1732 
   1733 
   1734                 // Not a recognized backslash escape sequence.
   1735                 // Take the next char as a literal.
   1736                 //  TODO:  Should this be an error?
   1737                 c = testString.charAt(charIdx);
   1738                 charIdx = testString.moveIndex32(charIdx, 1);
   1739             }
   1740 
   1741             // Normal, non-escaped data char.
   1742             tp.dataToBreak.append(c);
   1743 
   1744             // Save the mapping from offset in the data to line/column numbers in
   1745             //   the original input file.  Will be used for better error messages only.
   1746             //   If there's an expected break before this char, the slot in the mapping
   1747             //     vector will already be set for this char; don't overwrite it.
   1748             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1749                 tp.srcLine->addElement(lineNum, status);
   1750                 tp.srcCol ->addElement(column, status);
   1751             }
   1752             break;
   1753 
   1754 
   1755         case PARSE_NUM:
   1756             // We are parsing an expected numeric tag value, like <1234>,
   1757             //   within a chunk of data.
   1758             if (u_isUWhiteSpace(c)) {
   1759                 break;
   1760             }
   1761 
   1762             if (c == CH_GT) {
   1763                 // Finished the number.  Add the info to the expected break data,
   1764                 //   and switch parse state back to doing plain data.
   1765                 parseState = PARSE_DATA;
   1766                 if (tagValue == 0) {
   1767                     tagValue = -1;
   1768                 }
   1769                 int32_t  breakIdx = tp.dataToBreak.length();
   1770                 tp.expectedBreaks->setSize(breakIdx+1);
   1771                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1772                 tp.srcLine->setSize(breakIdx+1);
   1773                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1774                 tp.srcCol ->setSize(breakIdx+1);
   1775                 tp.srcCol ->setElementAt(column, breakIdx);
   1776                 break;
   1777             }
   1778 
   1779             if (u_isdigit(c)) {
   1780                 tagValue = tagValue*10 + u_charDigitValue(c);
   1781                 break;
   1782             }
   1783 
   1784             errln("Syntax Error in test file at line %d, col %d",
   1785                 lineNum, column);
   1786             parseState = PARSE_COMMENT;
   1787             goto end_test; // Stop the test
   1788             break;
   1789         }
   1790 
   1791 
   1792         if (U_FAILURE(status)) {
   1793             errln("ICU Error %s while parsing test file at line %d.",
   1794                 u_errorName(status), lineNum);
   1795             status = U_ZERO_ERROR;
   1796             goto end_test; // Stop the test
   1797         }
   1798 
   1799     }
   1800 
   1801 end_test:
   1802     delete tp.bi;
   1803     delete tp.expectedBreaks;
   1804     delete tp.srcLine;
   1805     delete tp.srcCol;
   1806     delete [] testFile;
   1807 #endif
   1808 }
   1809 
   1810 void RBBITest::TestThaiBreaks() {
   1811     UErrorCode status=U_ZERO_ERROR;
   1812     BreakIterator* b;
   1813     Locale locale = Locale("th");
   1814     int32_t p, index;
   1815     UChar c[]= {
   1816             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
   1817             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
   1818             0x0E16, 0x0E49, 0x0E33, 0x0000
   1819     };
   1820     int32_t expectedWordResult[] = {
   1821             2, 3, 6, 10, 11, 15, 17, 20, 22
   1822     };
   1823     int32_t expectedLineResult[] = {
   1824             3, 6, 11, 15, 17, 20, 22
   1825     };
   1826 
   1827     int32_t size = u_strlen(c);
   1828     UnicodeString text=UnicodeString(c);
   1829 
   1830     b = BreakIterator::createWordInstance(locale, status);
   1831     if (U_FAILURE(status)) {
   1832         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
   1833         return;
   1834     }
   1835     b->setText(text);
   1836     p = index = 0;
   1837     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1838         if (p != expectedWordResult[index++]) {
   1839             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
   1840         }
   1841     }
   1842     delete b;
   1843 
   1844     b = BreakIterator::createLineInstance(locale, status);
   1845     if (U_FAILURE(status)) {
   1846         printf("Unable to create thai line break iterator.\n");
   1847         return;
   1848     }
   1849     b->setText(text);
   1850     p = index = 0;
   1851     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1852         if (p != expectedLineResult[index++]) {
   1853             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
   1854         }
   1855     }
   1856 
   1857     delete b;
   1858 }
   1859 
   1860 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
   1861 // Words don't include colon or period (cldrbug #1969).
   1862 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
   1863 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
   1864 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
   1865 
   1866 // UBreakIteratorType UBRK_WORD, Locale "ja"
   1867 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   1868 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   1869                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
   1870 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   1871 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   1872 
   1873 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   1874 // Add break after Greek question mark (cldrbug #2069).
   1875 static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
   1876                                         "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
   1877 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
   1878 static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
   1879 
   1880 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
   1881 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
   1882 static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
   1883                                         "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
   1884                                         "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
   1885 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
   1886                                           12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
   1887                                           29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
   1888 static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
   1889                                           12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
   1890                                           29,     32, 33, 35, 37, 38,     40, 41 };
   1891 
   1892 typedef struct {
   1893     UBreakIteratorType  type;
   1894     const char *        locale;
   1895     const char *        escapedText;
   1896     const int32_t *     tailoredOffsets;
   1897     int32_t             tailoredOffsetsCount;
   1898     const int32_t *     rootOffsets;
   1899     int32_t             rootOffsetsCount;
   1900 } TailoredBreakItem;
   1901 
   1902 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
   1903 
   1904 static const TailoredBreakItem tbItems[] = {
   1905     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
   1906     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
   1907     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
   1908     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
   1909     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
   1910 };
   1911 
   1912 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   1913     while (count-- > 0) {
   1914         int writeCount;
   1915         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   1916         buffer += writeCount;
   1917         buflen -= writeCount;
   1918     }
   1919 }
   1920 
   1921 enum { kMaxOffsetCount = 128 };
   1922 
   1923 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   1924     brkitr->setText( CharsToUnicodeString(escapedText) );
   1925     int32_t foundOffsets[kMaxOffsetCount];
   1926     int32_t offset, foundOffsetsCount = 0;
   1927     // do forwards iteration test
   1928     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   1929         foundOffsets[foundOffsetsCount++] = offset;
   1930     }
   1931     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   1932         // log error for forwards test
   1933         char formatExpect[512], formatFound[512];
   1934         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1935         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   1936         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   1937                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   1938     } else {
   1939         // do backwards iteration test
   1940         --foundOffsetsCount; // back off one from the end offset
   1941         while ( foundOffsetsCount > 0 ) {
   1942             offset = brkitr->previous();
   1943             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   1944                 // log error for backwards test
   1945                 char formatExpect[512];
   1946                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1947                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   1948                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   1949                 break;
   1950             }
   1951         }
   1952     }
   1953 }
   1954 
   1955 void RBBITest::TestTailoredBreaks() {
   1956     const TailoredBreakItem * tbItemPtr;
   1957     Locale rootLocale = Locale("root");
   1958     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   1959         Locale testLocale = Locale(tbItemPtr->locale);
   1960         BreakIterator * tailoredBrkiter;
   1961         BreakIterator * rootBrkiter;
   1962         UErrorCode status = U_ZERO_ERROR;
   1963         switch (tbItemPtr->type) {
   1964             case UBRK_CHARACTER:
   1965                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   1966                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   1967                 break;
   1968             case UBRK_WORD:
   1969                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   1970                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   1971                 break;
   1972             case UBRK_LINE:
   1973                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   1974                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   1975                 break;
   1976             case UBRK_SENTENCE:
   1977                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   1978                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   1979                 break;
   1980             default:
   1981                 status = U_UNSUPPORTED_ERROR;
   1982                 break;
   1983         }
   1984         if (U_FAILURE(status)) {
   1985             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   1986             continue;
   1987         }
   1988         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   1989         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   1990 
   1991         delete rootBrkiter;
   1992         delete tailoredBrkiter;
   1993     }
   1994 }
   1995 
   1996 
   1997 //-------------------------------------------------------------------------------
   1998 //
   1999 //  TestDictRules   create a break iterator from source rules that includes a
   2000 //                  dictionary range.   Regression for bug #7130.  Source rules
   2001 //                  do not declare a break iterator type (word, line, sentence, etc.
   2002 //                  but the dictionary code, without a type, would loop.
   2003 //
   2004 //-------------------------------------------------------------------------------
   2005 void RBBITest::TestDictRules() {
   2006     const char *rules =  "$dictionary = [a-z]; \n"
   2007                          "!!forward; \n"
   2008                          "$dictionary $dictionary; \n"
   2009                          "!!reverse; \n"
   2010                          "$dictionary $dictionary; \n";
   2011     const char *text = "aa";
   2012     UErrorCode status = U_ZERO_ERROR;
   2013     UParseError parseError;
   2014 
   2015     RuleBasedBreakIterator bi(rules, parseError, status);
   2016     if (U_SUCCESS(status)) {
   2017         UnicodeString utext = text;
   2018         bi.setText(utext);
   2019         int32_t position;
   2020         int32_t loops;
   2021         for (loops = 0; loops<10; loops++) {
   2022             position = bi.next();
   2023             if (position == RuleBasedBreakIterator::DONE) {
   2024                 break;
   2025             }
   2026         }
   2027         TEST_ASSERT(loops == 1);
   2028     } else {
   2029         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
   2030     }
   2031 }
   2032 
   2033 
   2034 
   2035 //-------------------------------------------------------------------------------
   2036 //
   2037 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   2038 //    return the data in one big UChar * buffer, which the caller must delete.
   2039 //
   2040 //    parameters:
   2041 //          fileName:   the path of the file to read.  It is passed to fopen() exactly as
   2042 //                      given, so the caller must supply any directory part.
   2043 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   2044 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   2045 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   2046 //                      Pass NULL for the system default encoding.
   2047 //          status
   2048 //    returns:
   2049 //                      The file data, converted to UChar.
   2050 //                      The caller must delete this when done with
   2051 //                           delete [] theBuffer;
   2052 //
   2053 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   2054 //           Move this function to some common place.
   2055 //
   2056 //--------------------------------------------------------------------------------
   2057 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2058     UChar       *retPtr  = NULL;
   2059     char        *fileBuf = NULL;
   2060     UConverter* conv     = NULL;
   2061     FILE        *f       = NULL;
   2062 
   2063     ulen = 0;
   2064     if (U_FAILURE(status)) {
   2065         return retPtr;
   2066     }
   2067 
   2068     //
   2069     //  Open the file.
   2070     //
   2071     f = fopen(fileName, "rb");
   2072     if (f == 0) {
   2073         dataerrln("Error opening test data file %s\n", fileName);
   2074         status = U_FILE_ACCESS_ERROR;
   2075         return NULL;
   2076     }
   2077     //
   2078     //  Read it in
   2079     //
   2080     int   fileSize;
   2081     int   amt_read;
   2082 
   2083     fseek( f, 0, SEEK_END);
   2084     fileSize = ftell(f);
   2085     fileBuf = new char[fileSize];
   2086     fseek(f, 0, SEEK_SET);
   2087     amt_read = fread(fileBuf, 1, fileSize, f);
   2088     if (amt_read != fileSize || fileSize <= 0) {
   2089         errln("Error reading test data file.");
   2090         goto cleanUpAndReturn;
   2091     }
   2092 
   2093     //
   2094     // Look for a Unicode Signature (BOM) on the data just read
   2095     //
   2096     int32_t        signatureLength;
   2097     const char *   fileBufC;
   2098     const char*    bomEncoding;
   2099 
   2100     fileBufC = fileBuf;
   2101     bomEncoding = ucnv_detectUnicodeSignature(
   2102         fileBuf, fileSize, &signatureLength, &status);
   2103     if(bomEncoding!=NULL ){
   2104         fileBufC  += signatureLength;
   2105         fileSize  -= signatureLength;
   2106         encoding = bomEncoding;
   2107     }
   2108 
   2109     //
   2110     // Open a converter to take the rule file to UTF-16
   2111     //
   2112     conv = ucnv_open(encoding, &status);
   2113     if (U_FAILURE(status)) {
   2114         goto cleanUpAndReturn;
   2115     }
   2116 
   2117     //
   2118     // Convert the rules to UChar.
   2119     //  Preflight first to determine required buffer size.
   2120     //
   2121     ulen = ucnv_toUChars(conv,
   2122         NULL,           //  dest,
   2123         0,              //  destCapacity,
   2124         fileBufC,
   2125         fileSize,
   2126         &status);
   2127     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2128         // Buffer Overflow is expected from the preflight operation.
   2129         status = U_ZERO_ERROR;
   2130 
   2131         retPtr = new UChar[ulen+1];
   2132         ucnv_toUChars(conv,
   2133             retPtr,       //  dest,
   2134             ulen+1,
   2135             fileBufC,
   2136             fileSize,
   2137             &status);
   2138     }
   2139 
   2140 cleanUpAndReturn:
   2141     fclose(f);
   2142     delete []fileBuf;
   2143     ucnv_close(conv);
   2144     if (U_FAILURE(status)) {
   2145         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2146         delete retPtr;
   2147         retPtr = 0;
   2148         ulen   = 0;
   2149     };
   2150     return retPtr;
   2151 }
   2152 
   2153 
   2154 
   2155 //--------------------------------------------------------------------------------------------
   2156 //
   2157 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2158 //
   2159 //-------------------------------------------------------------------------------------------
   2160 void RBBITest::TestUnicodeFiles() {
   2161     RuleBasedBreakIterator  *bi;
   2162     UErrorCode               status = U_ZERO_ERROR;
   2163 
   2164     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
   2165     TEST_ASSERT_SUCCESS(status);
   2166     if (U_SUCCESS(status)) {
   2167         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2168     }
   2169     delete bi;
   2170 
   2171     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
   2172     TEST_ASSERT_SUCCESS(status);
   2173     if (U_SUCCESS(status)) {
   2174         runUnicodeTestData("WordBreakTest.txt", bi);
   2175     }
   2176     delete bi;
   2177 
   2178     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   2179     TEST_ASSERT_SUCCESS(status);
   2180     if (U_SUCCESS(status)) {
   2181         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2182     }
   2183     delete bi;
   2184 
   2185     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   2186     TEST_ASSERT_SUCCESS(status);
   2187     if (U_SUCCESS(status)) {
   2188         runUnicodeTestData("LineBreakTest.txt", bi);
   2189     }
   2190     delete bi;
   2191 }
   2192 
   2193 
   2194 //--------------------------------------------------------------------------------------------
   2195 //
   2196 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2197 //
   2198 //-------------------------------------------------------------------------------------------
   2199 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2200 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2201     UErrorCode  status = U_ZERO_ERROR;
   2202 
   2203     //
   2204     //  Open and read the test data file, put it into a UnicodeString.
   2205     //
   2206     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2207     char testFileName[1000];
   2208     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2209         dataerrln("Can't open test data.  Path too long.");
   2210         return;
   2211     }
   2212     strcpy(testFileName, testDataDirectory);
   2213     strcat(testFileName, fileName);
   2214 
   2215     logln("Opening data file %s\n", fileName);
   2216 
   2217     int    len;
   2218     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2219     if (status != U_FILE_ACCESS_ERROR) {
   2220         TEST_ASSERT_SUCCESS(status);
   2221         TEST_ASSERT(testFile != NULL);
   2222     }
   2223     if (U_FAILURE(status) || testFile == NULL) {
   2224         return; /* something went wrong, error already output */
   2225     }
   2226     UnicodeString testFileAsString(TRUE, testFile, len);
   2227 
   2228     //
   2229     //  Parse the test data file using a regular expression.
   2230     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2231     //     is identified by which group had a match.
   2232     //
   2233     //    Caputure Group #                  1          2            3            4           5
   2234     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2235     //
   2236     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2237     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2238     UnicodeString   testString;
   2239     UVector32       breakPositions(status);
   2240     int             lineNumber = 1;
   2241     TEST_ASSERT_SUCCESS(status);
   2242     if (U_FAILURE(status)) {
   2243         return;
   2244     }
   2245 
   2246     //
   2247     //  Scan through each test case, building up the string to be broken in testString,
   2248     //   and the positions that should be boundaries in the breakPositions vector.
   2249     //
   2250     while (tokenMatcher.find()) {
   2251         if (tokenMatcher.start(1, status) >= 0) {
   2252             // Scanned a divide sign, indicating a break position in the test data.
   2253             if (testString.length()>0) {
   2254                 breakPositions.addElement(testString.length(), status);
   2255             }
   2256         }
   2257         else if (tokenMatcher.start(2, status) >= 0) {
   2258             // Scanned an 'x', meaning no break at this position in the test data
   2259             //   Nothing to be done here.
   2260             }
   2261         else if (tokenMatcher.start(3, status) >= 0) {
   2262             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2263             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2264             int length = hexNumber.length();
   2265             if (length<=8) {
   2266                 char buf[10];
   2267                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2268                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2269                 if (c<=0x10ffff) {
   2270                     testString.append(c);
   2271                 } else {
   2272                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2273                        fileName, lineNumber);
   2274                 }
   2275             } else {
   2276                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2277                        fileName, lineNumber);
   2278              }
   2279         }
   2280         else if (tokenMatcher.start(4, status) >= 0) {
   2281             // Scanned to end of a line, possibly skipping over a comment in the process.
   2282             //   If the line from the file contained test data, run the test now.
   2283             //
   2284             if (testString.length() > 0) {
   2285                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2286             }
   2287 
   2288             // Clear out this test case.
   2289             //    The string and breakPositions vector will be refilled as the next
   2290             //       test case is parsed.
   2291             testString.remove();
   2292             breakPositions.removeAllElements();
   2293             lineNumber++;
   2294         } else {
   2295             // Scanner catchall.  Something unrecognized appeared on the line.
   2296             char token[16];
   2297             UnicodeString uToken = tokenMatcher.group(0, status);
   2298             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2299             token[sizeof(token)-1] = 0;
   2300             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2301 
   2302             // Clean up, in preparation for continuing with the next line.
   2303             testString.remove();
   2304             breakPositions.removeAllElements();
   2305             lineNumber++;
   2306         }
   2307         TEST_ASSERT_SUCCESS(status);
   2308         if (U_FAILURE(status)) {
   2309             break;
   2310         }
   2311     }
   2312 
   2313     delete [] testFile;
   2314  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2315 }
   2316 
   2317 //--------------------------------------------------------------------------------------------
   2318 //
   2319 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2320 //                            test data files.  Do only a simple, forward-only check -
   2321 //                            this test is mostly to check that ICU and the Unicode
   2322 //                            data agree with each other.
   2323 //
   2324 //--------------------------------------------------------------------------------------------
   2325 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2326                          const UnicodeString &testString,   // Text data to be broken
   2327                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2328                          RuleBasedBreakIterator *bi) {
   2329     int32_t pos;                 // Break Position in the test string
   2330     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2331     int32_t expectedPos;         // Expected break position (index into test string)
   2332 
   2333     bi->setText(testString);
   2334     pos = bi->first();
   2335     pos = bi->next();
   2336 
   2337     while (pos != BreakIterator::DONE) {
   2338         if (expectedI >= breakPositions->size()) {
   2339             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2340                 testFileName, lineNumber, pos);
   2341             break;
   2342         }
   2343         expectedPos = breakPositions->elementAti(expectedI);
   2344         if (pos < expectedPos) {
   2345             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2346                 testFileName, lineNumber, pos);
   2347             break;
   2348         }
   2349         if (pos > expectedPos) {
   2350             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2351                 testFileName, lineNumber, expectedPos);
   2352             break;
   2353         }
   2354         pos = bi->next();
   2355         expectedI++;
   2356     }
   2357 
   2358     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2359         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2360             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2361     }
   2362 }
   2363 
   2364 
   2365 
   2366 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2367 //---------------------------------------------------------------------------------------
   2368 //
//   class RBBIMonkeyKind
   2370 //
   2371 //      Monkey Test for Break Iteration
   2372 //      Abstract interface class.   Concrete derived classes independently
   2373 //      implement the break rules for different iterator types.
   2374 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
   2377 //
   2378 //---------------------------------------------------------------------------------------
   2379 class RBBIMonkeyKind {
   2380 public:
   2381     // Return a UVector of UnicodeSets, representing the character classes used
   2382     //   for this type of iterator.
   2383     virtual  UVector  *charClasses() = 0;
   2384 
   2385     // Set the test text on which subsequent calls to next() will operate
   2386     virtual  void      setText(const UnicodeString &s) = 0;
   2387 
   2388     // Find the next break postion, starting from the prev break position, or from zero.
   2389     // Return -1 after reaching end of string.
   2390     virtual  int32_t   next(int32_t i) = 0;
   2391 
   2392     virtual ~RBBIMonkeyKind();
   2393     UErrorCode       deferredStatus;
   2394 
   2395 
   2396 protected:
   2397     RBBIMonkeyKind();
   2398 
   2399 private:
   2400 };
   2401 
   2402 RBBIMonkeyKind::RBBIMonkeyKind() {
   2403     deferredStatus = U_ZERO_ERROR;
   2404 }
   2405 
   2406 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2407 }
   2408 
   2409 
   2410 //----------------------------------------------------------------------------------------
   2411 //
   2412 //   Random Numbers.  Similar to standard lib rand() and srand()
   2413 //                    Not using library to
   2414 //                      1.  Get same results on all platforms.
   2415 //                      2.  Get access to current seed, to more easily reproduce failures.
   2416 //
   2417 //---------------------------------------------------------------------------------------
   2418 static uint32_t m_seed = 1;
   2419 
   2420 static uint32_t m_rand()
   2421 {
   2422     m_seed = m_seed * 1103515245 + 12345;
   2423     return (uint32_t)(m_seed/65536) % 32768;
   2424 }
   2425 
   2426 
   2427 //------------------------------------------------------------------------------------------
   2428 //
   2429 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2430 //                             of RBBIMonkeyKind.
   2431 //
   2432 //------------------------------------------------------------------------------------------
   2433 class RBBICharMonkey: public RBBIMonkeyKind {
   2434 public:
   2435     RBBICharMonkey();
   2436     virtual          ~RBBICharMonkey();
   2437     virtual  UVector *charClasses();
   2438     virtual  void     setText(const UnicodeString &s);
   2439     virtual  int32_t  next(int32_t i);
   2440 private:
   2441     UVector   *fSets;
   2442 
   2443     UnicodeSet  *fCRLFSet;
   2444     UnicodeSet  *fControlSet;
   2445     UnicodeSet  *fExtendSet;
   2446     UnicodeSet  *fPrependSet;
   2447     UnicodeSet  *fSpacingSet;
   2448     UnicodeSet  *fLSet;
   2449     UnicodeSet  *fVSet;
   2450     UnicodeSet  *fTSet;
   2451     UnicodeSet  *fLVSet;
   2452     UnicodeSet  *fLVTSet;
   2453     UnicodeSet  *fHangulSet;
   2454     UnicodeSet  *fAnySet;
   2455 
   2456     const UnicodeString *fText;
   2457 };
   2458 
   2459 
   2460 RBBICharMonkey::RBBICharMonkey() {
   2461     UErrorCode  status = U_ZERO_ERROR;
   2462 
   2463     fText = NULL;
   2464 
   2465     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2466     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2467     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2468     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2469     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2470     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2471     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2472     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2473     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2474     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2475     fHangulSet  = new UnicodeSet();
   2476     fHangulSet->addAll(*fLSet);
   2477     fHangulSet->addAll(*fVSet);
   2478     fHangulSet->addAll(*fTSet);
   2479     fHangulSet->addAll(*fLVSet);
   2480     fHangulSet->addAll(*fLVTSet);
   2481     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2482 
   2483     fSets       = new UVector(status);
   2484     fSets->addElement(fCRLFSet,    status);
   2485     fSets->addElement(fControlSet, status);
   2486     fSets->addElement(fExtendSet,  status);
   2487     fSets->addElement(fPrependSet, status);
   2488     fSets->addElement(fSpacingSet, status);
   2489     fSets->addElement(fHangulSet,  status);
   2490     fSets->addElement(fAnySet,     status);
   2491     if (U_FAILURE(status)) {
   2492         deferredStatus = status;
   2493     }
   2494 }
   2495 
   2496 
   2497 void RBBICharMonkey::setText(const UnicodeString &s) {
   2498     fText = &s;
   2499 }
   2500 
   2501 
   2502 
   2503 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2504     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2505                               //   break position being tested.  The candidate break
   2506                               //   location is before p2.
   2507 
   2508     int     breakPos = -1;
   2509 
   2510     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2511 
   2512     if (U_FAILURE(deferredStatus)) {
   2513         return -1;
   2514     }
   2515 
   2516     // Previous break at end of string.  return DONE.
   2517     if (prevPos >= fText->length()) {
   2518         return -1;
   2519     }
   2520     p0 = p1 = p2 = p3 = prevPos;
   2521     c3 =  fText->char32At(prevPos);
   2522     c0 = c1 = c2 = 0;
   2523 
   2524     // Loop runs once per "significant" character position in the input text.
   2525     for (;;) {
   2526         // Move all of the positions forward in the input string.
   2527         p0 = p1;  c0 = c1;
   2528         p1 = p2;  c1 = c2;
   2529         p2 = p3;  c2 = c3;
   2530 
   2531         // Advancd p3 by one codepoint
   2532         p3 = fText->moveIndex32(p3, 1);
   2533         c3 = fText->char32At(p3);
   2534 
   2535         if (p1 == p2) {
   2536             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2537             continue;
   2538         }
   2539         if (p2 == fText->length()) {
   2540             // Reached end of string.  Always a break position.
   2541             break;
   2542         }
   2543 
   2544         // Rule  GB3   CR x LF
   2545         //     No Extend or Format characters may appear between the CR and LF,
   2546         //     which requires the additional check for p2 immediately following p1.
   2547         //
   2548         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2549             continue;
   2550         }
   2551 
   2552         // Rule (GB4).   ( Control | CR | LF ) <break>
   2553         if (fControlSet->contains(c1) ||
   2554             c1 == 0x0D ||
   2555             c1 == 0x0A)  {
   2556             break;
   2557         }
   2558 
   2559         // Rule (GB5)    <break>  ( Control | CR | LF )
   2560         //
   2561         if (fControlSet->contains(c2) ||
   2562             c2 == 0x0D ||
   2563             c2 == 0x0A)  {
   2564             break;
   2565         }
   2566 
   2567 
   2568         // Rule (GB6)  L x ( L | V | LV | LVT )
   2569         if (fLSet->contains(c1) &&
   2570                (fLSet->contains(c2)  ||
   2571                 fVSet->contains(c2)  ||
   2572                 fLVSet->contains(c2) ||
   2573                 fLVTSet->contains(c2))) {
   2574             continue;
   2575         }
   2576 
   2577         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2578         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2579             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2580             continue;
   2581         }
   2582 
   2583         // Rule (GB8)    ( LVT | T)  x T
   2584         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2585             fTSet->contains(c2))  {
   2586             continue;
   2587         }
   2588 
   2589         // Rule (GB9)    Numeric x ALetter
   2590         if (fExtendSet->contains(c2))  {
   2591             continue;
   2592         }
   2593 
   2594         // Rule (GB9a)   x  SpacingMark
   2595         if (fSpacingSet->contains(c2)) {
   2596             continue;
   2597         }
   2598 
   2599         // Rule (GB9b)   Prepend x
   2600         if (fPrependSet->contains(c1)) {
   2601             continue;
   2602         }
   2603 
   2604         // Rule (GB10)  Any  <break>  Any
   2605         break;
   2606     }
   2607 
   2608     breakPos = p2;
   2609     return breakPos;
   2610 }
   2611 
   2612 
   2613 
   2614 UVector  *RBBICharMonkey::charClasses() {
   2615     return fSets;
   2616 }
   2617 
   2618 
   2619 RBBICharMonkey::~RBBICharMonkey() {
   2620     delete fSets;
   2621     delete fCRLFSet;
   2622     delete fControlSet;
   2623     delete fExtendSet;
   2624     delete fPrependSet;
   2625     delete fSpacingSet;
   2626     delete fLSet;
   2627     delete fVSet;
   2628     delete fTSet;
   2629     delete fLVSet;
   2630     delete fLVTSet;
   2631     delete fHangulSet;
   2632     delete fAnySet;
   2633 }
   2634 
   2635 //------------------------------------------------------------------------------------------
   2636 //
   2637 //   class RBBIWordMonkey      Word Break specific implementation
   2638 //                             of RBBIMonkeyKind.
   2639 //
   2640 //------------------------------------------------------------------------------------------
   2641 class RBBIWordMonkey: public RBBIMonkeyKind {
   2642 public:
   2643     RBBIWordMonkey();
   2644     virtual          ~RBBIWordMonkey();
   2645     virtual  UVector *charClasses();
   2646     virtual  void     setText(const UnicodeString &s);
   2647     virtual int32_t   next(int32_t i);
   2648 private:
   2649     UVector      *fSets;
   2650 
   2651     UnicodeSet  *fCRSet;
   2652     UnicodeSet  *fLFSet;
   2653     UnicodeSet  *fNewlineSet;
   2654     UnicodeSet  *fKatakanaSet;
   2655     UnicodeSet  *fALetterSet;
   2656     UnicodeSet  *fMidNumLetSet;
   2657     UnicodeSet  *fMidLetterSet;
   2658     UnicodeSet  *fMidNumSet;
   2659     UnicodeSet  *fNumericSet;
   2660     UnicodeSet  *fFormatSet;
   2661     UnicodeSet  *fOtherSet;
   2662     UnicodeSet  *fExtendSet;
   2663     UnicodeSet  *fExtendNumLetSet;
   2664 
   2665     RegexMatcher  *fMatcher;
   2666 
   2667     const UnicodeString  *fText;
   2668 };
   2669 
   2670 
   2671 RBBIWordMonkey::RBBIWordMonkey()
   2672 {
   2673     UErrorCode  status = U_ZERO_ERROR;
   2674 
   2675     fSets            = new UVector(status);
   2676 
   2677     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2678     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2679     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2680     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
   2681     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2682     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2683     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2684     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2685     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2686     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2687     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2688     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2689 
   2690     fOtherSet        = new UnicodeSet();
   2691     if(U_FAILURE(status)) {
   2692       deferredStatus = status;
   2693       return;
   2694     }
   2695 
   2696     fOtherSet->complement();
   2697     fOtherSet->removeAll(*fCRSet);
   2698     fOtherSet->removeAll(*fLFSet);
   2699     fOtherSet->removeAll(*fNewlineSet);
   2700     fOtherSet->removeAll(*fKatakanaSet);
   2701     fOtherSet->removeAll(*fALetterSet);
   2702     fOtherSet->removeAll(*fMidLetterSet);
   2703     fOtherSet->removeAll(*fMidNumSet);
   2704     fOtherSet->removeAll(*fNumericSet);
   2705     fOtherSet->removeAll(*fExtendNumLetSet);
   2706     fOtherSet->removeAll(*fFormatSet);
   2707     fOtherSet->removeAll(*fExtendSet);
   2708     // Inhibit dictionary characters from being tested at all.
   2709     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2710 
   2711     fSets->addElement(fCRSet,        status);
   2712     fSets->addElement(fLFSet,        status);
   2713     fSets->addElement(fNewlineSet,   status);
   2714     fSets->addElement(fALetterSet,   status);
   2715     fSets->addElement(fKatakanaSet,  status);
   2716     fSets->addElement(fMidLetterSet, status);
   2717     fSets->addElement(fMidNumLetSet, status);
   2718     fSets->addElement(fMidNumSet,    status);
   2719     fSets->addElement(fNumericSet,   status);
   2720     fSets->addElement(fFormatSet,    status);
   2721     fSets->addElement(fExtendSet,    status);
   2722     fSets->addElement(fOtherSet,     status);
   2723     fSets->addElement(fExtendNumLetSet, status);
   2724 
   2725     if (U_FAILURE(status)) {
   2726         deferredStatus = status;
   2727     }
   2728 }
   2729 
   2730 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2731     fText       = &s;
   2732 }
   2733 
   2734 
   2735 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2736     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2737                               //   break position being tested.  The candidate break
   2738                               //   location is before p2.
   2739 
   2740     int     breakPos = -1;
   2741 
   2742     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2743 
   2744     if (U_FAILURE(deferredStatus)) {
   2745         return -1;
   2746     }
   2747 
   2748     // Prev break at end of string.  return DONE.
   2749     if (prevPos >= fText->length()) {
   2750         return -1;
   2751     }
   2752     p0 = p1 = p2 = p3 = prevPos;
   2753     c3 =  fText->char32At(prevPos);
   2754     c0 = c1 = c2 = 0;
   2755 
   2756     // Loop runs once per "significant" character position in the input text.
   2757     for (;;) {
   2758         // Move all of the positions forward in the input string.
   2759         p0 = p1;  c0 = c1;
   2760         p1 = p2;  c1 = c2;
   2761         p2 = p3;  c2 = c3;
   2762 
   2763         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2764         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2765         do {
   2766             p3 = fText->moveIndex32(p3, 1);
   2767             c3 = fText->char32At(p3);
   2768             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2769                break;
   2770             };
   2771         }
   2772         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2773 
   2774 
   2775         if (p1 == p2) {
   2776             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2777             continue;
   2778         }
   2779         if (p2 == fText->length()) {
   2780             // Reached end of string.  Always a break position.
   2781             break;
   2782         }
   2783 
   2784         // Rule  (3)   CR x LF
   2785         //     No Extend or Format characters may appear between the CR and LF,
   2786         //     which requires the additional check for p2 immediately following p1.
   2787         //
   2788         if (c1==0x0D && c2==0x0A) {
   2789             continue;
   2790         }
   2791 
   2792         // Rule (3a)  Break before and after newlines (including CR and LF)
   2793         //
   2794         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2795             break;
   2796         };
   2797         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2798             break;
   2799         };
   2800 
   2801         // Rule (5).   ALetter x ALetter
   2802         if (fALetterSet->contains(c1) &&
   2803             fALetterSet->contains(c2))  {
   2804             continue;
   2805         }
   2806 
   2807         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2808         //
   2809         if ( fALetterSet->contains(c1)   &&
   2810              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2811              fALetterSet->contains(c3)) {
   2812             continue;
   2813         }
   2814 
   2815 
   2816         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2817         if (fALetterSet->contains(c0) &&
   2818             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2819             fALetterSet->contains(c2)) {
   2820             continue;
   2821         }
   2822 
   2823         // Rule (8)    Numeric x Numeric
   2824         if (fNumericSet->contains(c1) &&
   2825             fNumericSet->contains(c2))  {
   2826             continue;
   2827         }
   2828 
   2829         // Rule (9)    ALetter x Numeric
   2830         if (fALetterSet->contains(c1) &&
   2831             fNumericSet->contains(c2))  {
   2832             continue;
   2833         }
   2834 
   2835         // Rule (10)    Numeric x ALetter
   2836         if (fNumericSet->contains(c1) &&
   2837             fALetterSet->contains(c2))  {
   2838             continue;
   2839         }
   2840 
   2841         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2842         if (fNumericSet->contains(c0) &&
   2843             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2844             fNumericSet->contains(c2)) {
   2845             continue;
   2846         }
   2847 
   2848         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2849         if (fNumericSet->contains(c1) &&
   2850             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2851             fNumericSet->contains(c3)) {
   2852             continue;
   2853         }
   2854 
   2855         // Rule (13)  Katakana x Katakana
   2856         if (fKatakanaSet->contains(c1) &&
   2857             fKatakanaSet->contains(c2))  {
   2858             continue;
   2859         }
   2860 
   2861         // Rule 13a
   2862         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2863              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2864              fExtendNumLetSet->contains(c2)) {
   2865                 continue;
   2866              }
   2867 
   2868         // Rule 13b
   2869         if (fExtendNumLetSet->contains(c1) &&
   2870                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2871                 fKatakanaSet->contains(c2)))  {
   2872                 continue;
   2873              }
   2874 
   2875         // Rule 14.  Break found here.
   2876         break;
   2877     }
   2878 
   2879     breakPos = p2;
   2880     return breakPos;
   2881 }
   2882 
   2883 
   2884 UVector  *RBBIWordMonkey::charClasses() {
   2885     return fSets;
   2886 }
   2887 
   2888 
   2889 RBBIWordMonkey::~RBBIWordMonkey() {
   2890     delete fSets;
   2891     delete fCRSet;
   2892     delete fLFSet;
   2893     delete fNewlineSet;
   2894     delete fKatakanaSet;
   2895     delete fALetterSet;
   2896     delete fMidNumLetSet;
   2897     delete fMidLetterSet;
   2898     delete fMidNumSet;
   2899     delete fNumericSet;
   2900     delete fFormatSet;
   2901     delete fExtendSet;
   2902     delete fExtendNumLetSet;
   2903     delete fOtherSet;
   2904 }
   2905 
   2906 
   2907 
   2908 
   2909 //------------------------------------------------------------------------------------------
   2910 //
   2911 //   class RBBISentMonkey      Sentence Break specific implementation
   2912 //                             of RBBIMonkeyKind.
   2913 //
   2914 //------------------------------------------------------------------------------------------
   2915 class RBBISentMonkey: public RBBIMonkeyKind {
   2916 public:
   2917     RBBISentMonkey();
   2918     virtual          ~RBBISentMonkey();
   2919     virtual  UVector *charClasses();
   2920     virtual  void     setText(const UnicodeString &s);
   2921     virtual int32_t   next(int32_t i);
   2922 private:
   2923     int               moveBack(int posFrom);
   2924     int               moveForward(int posFrom);
   2925     UChar32           cAt(int pos);
   2926 
   2927     UVector      *fSets;
   2928 
   2929     UnicodeSet  *fSepSet;
   2930     UnicodeSet  *fFormatSet;
   2931     UnicodeSet  *fSpSet;
   2932     UnicodeSet  *fLowerSet;
   2933     UnicodeSet  *fUpperSet;
   2934     UnicodeSet  *fOLetterSet;
   2935     UnicodeSet  *fNumericSet;
   2936     UnicodeSet  *fATermSet;
   2937     UnicodeSet  *fSContinueSet;
   2938     UnicodeSet  *fSTermSet;
   2939     UnicodeSet  *fCloseSet;
   2940     UnicodeSet  *fOtherSet;
   2941     UnicodeSet  *fExtendSet;
   2942 
   2943     const UnicodeString  *fText;
   2944 
   2945 };
   2946 
   2947 RBBISentMonkey::RBBISentMonkey()
   2948 {
   2949     UErrorCode  status = U_ZERO_ERROR;
   2950 
   2951     fSets            = new UVector(status);
   2952 
   2953     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2954     //                       set and made into character classes of their own.  For the monkey impl,
   2955     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2956     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2957     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2958     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2959     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2960     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2961     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2962     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2963     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2964     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2965     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2966     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2967     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2968     fOtherSet        = new UnicodeSet();
   2969 
   2970     if(U_FAILURE(status)) {
   2971       deferredStatus = status;
   2972       return;
   2973     }
   2974 
   2975     fOtherSet->complement();
   2976     fOtherSet->removeAll(*fSepSet);
   2977     fOtherSet->removeAll(*fFormatSet);
   2978     fOtherSet->removeAll(*fSpSet);
   2979     fOtherSet->removeAll(*fLowerSet);
   2980     fOtherSet->removeAll(*fUpperSet);
   2981     fOtherSet->removeAll(*fOLetterSet);
   2982     fOtherSet->removeAll(*fNumericSet);
   2983     fOtherSet->removeAll(*fATermSet);
   2984     fOtherSet->removeAll(*fSContinueSet);
   2985     fOtherSet->removeAll(*fSTermSet);
   2986     fOtherSet->removeAll(*fCloseSet);
   2987     fOtherSet->removeAll(*fExtendSet);
   2988 
   2989     fSets->addElement(fSepSet,       status);
   2990     fSets->addElement(fFormatSet,    status);
   2991     fSets->addElement(fSpSet,        status);
   2992     fSets->addElement(fLowerSet,     status);
   2993     fSets->addElement(fUpperSet,     status);
   2994     fSets->addElement(fOLetterSet,   status);
   2995     fSets->addElement(fNumericSet,   status);
   2996     fSets->addElement(fATermSet,     status);
   2997     fSets->addElement(fSContinueSet, status);
   2998     fSets->addElement(fSTermSet,     status);
   2999     fSets->addElement(fCloseSet,     status);
   3000     fSets->addElement(fOtherSet,     status);
   3001     fSets->addElement(fExtendSet,    status);
   3002 
   3003     if (U_FAILURE(status)) {
   3004         deferredStatus = status;
   3005     }
   3006 }
   3007 
   3008 
   3009 
   3010 void RBBISentMonkey::setText(const UnicodeString &s) {
   3011     fText       = &s;
   3012 }
   3013 
   3014 UVector  *RBBISentMonkey::charClasses() {
   3015     return fSets;
   3016 }
   3017 
   3018 
   3019 //  moveBack()   Find the "significant" code point preceding the index i.
   3020 //               Skips over ($Extend | $Format)* .
   3021 //
   3022 int RBBISentMonkey::moveBack(int i) {
   3023     if (i <= 0) {
   3024         return -1;
   3025     }
   3026     UChar32   c;
   3027     int32_t   j = i;
   3028     do {
   3029         j = fText->moveIndex32(j, -1);
   3030         c = fText->char32At(j);
   3031     }
   3032     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   3033     return j;
   3034 
   3035  }
   3036 
   3037 
   3038 int RBBISentMonkey::moveForward(int i) {
   3039     if (i>=fText->length()) {
   3040         return fText->length();
   3041     }
   3042     UChar32   c;
   3043     int32_t   j = i;
   3044     do {
   3045         j = fText->moveIndex32(j, 1);
   3046         c = cAt(j);
   3047     }
   3048     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   3049     return j;
   3050 }
   3051 
   3052 UChar32 RBBISentMonkey::cAt(int pos) {
   3053     if (pos<0 || pos>=fText->length()) {
   3054         return -1;
   3055     } else {
   3056         return fText->char32At(pos);
   3057     }
   3058 }
   3059 
   3060 int32_t RBBISentMonkey::next(int32_t prevPos) {
   3061     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3062                               //   break position being tested.  The candidate break
   3063                               //   location is before p2.
   3064 
   3065     int     breakPos = -1;
   3066 
   3067     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3068     UChar32 c;
   3069 
   3070     if (U_FAILURE(deferredStatus)) {
   3071         return -1;
   3072     }
   3073 
   3074     // Prev break at end of string.  return DONE.
   3075     if (prevPos >= fText->length()) {
   3076         return -1;
   3077     }
   3078     p0 = p1 = p2 = p3 = prevPos;
   3079     c3 =  fText->char32At(prevPos);
   3080     c0 = c1 = c2 = 0;
   3081 
   3082     // Loop runs once per "significant" character position in the input text.
   3083     for (;;) {
   3084         // Move all of the positions forward in the input string.
   3085         p0 = p1;  c0 = c1;
   3086         p1 = p2;  c1 = c2;
   3087         p2 = p3;  c2 = c3;
   3088 
   3089         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3090         p3 = moveForward(p3);
   3091         c3 = cAt(p3);
   3092 
   3093         // Rule (3)  CR x LF
   3094         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   3095             continue;
   3096         }
   3097 
   3098         // Rule (4).   Sep  <break>
   3099         if (fSepSet->contains(c1)) {
   3100             p2 = p1+1;   // Separators don't combine with Extend or Format.
   3101             break;
   3102         }
   3103 
   3104         if (p2 >= fText->length()) {
   3105             // Reached end of string.  Always a break position.
   3106             break;
   3107         }
   3108 
   3109         if (p2 == prevPos) {
   3110             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3111             continue;
   3112         }
   3113 
   3114         // Rule (6).   ATerm x Numeric
   3115         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   3116             continue;
   3117         }
   3118 
   3119         // Rule (7).  Upper ATerm  x  Uppper
   3120         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   3121             continue;
   3122         }
   3123 
   3124         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   3125         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   3126         //                  note to the Unicode 5.0 documents.
   3127         int p8 = p1;
   3128         while (fSpSet->contains(cAt(p8))) {
   3129             p8 = moveBack(p8);
   3130         }
   3131         while (fCloseSet->contains(cAt(p8))) {
   3132             p8 = moveBack(p8);
   3133         }
   3134         if (fATermSet->contains(cAt(p8))) {
   3135             p8=p2;
   3136             for (;;) {
   3137                 c = cAt(p8);
   3138                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   3139                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   3140                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   3141                     break;
   3142                 }
   3143                 p8 = moveForward(p8);
   3144             }
   3145             if (fLowerSet->contains(cAt(p8))) {
   3146                 continue;
   3147             }
   3148         }
   3149 
   3150         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   3151         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   3152             p8 = p1;
   3153             while (fSpSet->contains(cAt(p8))) {
   3154                 p8 = moveBack(p8);
   3155             }
   3156             while (fCloseSet->contains(cAt(p8))) {
   3157                 p8 = moveBack(p8);
   3158             }
   3159             c = cAt(p8);
   3160             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   3161                 continue;
   3162             }
   3163         }
   3164 
   3165         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   3166         int p9 = p1;
   3167         while (fCloseSet->contains(cAt(p9))) {
   3168             p9 = moveBack(p9);
   3169         }
   3170         c = cAt(p9);
   3171         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   3172             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3173                 continue;
   3174             }
   3175         }
   3176 
   3177         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   3178         int p10 = p1;
   3179         while (fSpSet->contains(cAt(p10))) {
   3180             p10 = moveBack(p10);
   3181         }
   3182         while (fCloseSet->contains(cAt(p10))) {
   3183             p10 = moveBack(p10);
   3184         }
   3185         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   3186             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3187                 continue;
   3188             }
   3189         }
   3190 
   3191         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   3192         int p11 = p1;
   3193         if (fSepSet->contains(cAt(p11))) {
   3194             p11 = moveBack(p11);
   3195         }
   3196         while (fSpSet->contains(cAt(p11))) {
   3197             p11 = moveBack(p11);
   3198         }
   3199         while (fCloseSet->contains(cAt(p11))) {
   3200             p11 = moveBack(p11);
   3201         }
   3202         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   3203             break;
   3204         }
   3205 
   3206         //  Rule (12)  Any x Any
   3207         continue;
   3208     }
   3209     breakPos = p2;
   3210     return breakPos;
   3211 }
   3212 
   3213 RBBISentMonkey::~RBBISentMonkey() {
   3214     delete fSets;
   3215     delete fSepSet;
   3216     delete fFormatSet;
   3217     delete fSpSet;
   3218     delete fLowerSet;
   3219     delete fUpperSet;
   3220     delete fOLetterSet;
   3221     delete fNumericSet;
   3222     delete fATermSet;
   3223     delete fSContinueSet;
   3224     delete fSTermSet;
   3225     delete fCloseSet;
   3226     delete fOtherSet;
   3227     delete fExtendSet;
   3228 }
   3229 
   3230 
   3231 
   3232 //-------------------------------------------------------------------------------------------
   3233 //
   3234 //  RBBILineMonkey
   3235 //
   3236 //-------------------------------------------------------------------------------------------
   3237 
   3238 class RBBILineMonkey: public RBBIMonkeyKind {
   3239 public:
   3240     RBBILineMonkey();
   3241     virtual          ~RBBILineMonkey();
   3242     virtual  UVector *charClasses();
   3243     virtual  void     setText(const UnicodeString &s);
   3244     virtual  int32_t  next(int32_t i);
   3245     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   3246 private:
   3247     UVector      *fSets;
   3248 
   3249     UnicodeSet  *fBK;
   3250     UnicodeSet  *fCR;
   3251     UnicodeSet  *fLF;
   3252     UnicodeSet  *fCM;
   3253     UnicodeSet  *fNL;
   3254     UnicodeSet  *fSG;
   3255     UnicodeSet  *fWJ;
   3256     UnicodeSet  *fZW;
   3257     UnicodeSet  *fGL;
   3258     UnicodeSet  *fCB;
   3259     UnicodeSet  *fSP;
   3260     UnicodeSet  *fB2;
   3261     UnicodeSet  *fBA;
   3262     UnicodeSet  *fBB;
   3263     UnicodeSet  *fHY;
   3264     UnicodeSet  *fH2;
   3265     UnicodeSet  *fH3;
   3266     UnicodeSet  *fCL;
   3267     UnicodeSet  *fCP;
   3268     UnicodeSet  *fEX;
   3269     UnicodeSet  *fIN;
   3270     UnicodeSet  *fJL;
   3271     UnicodeSet  *fJV;
   3272     UnicodeSet  *fJT;
   3273     UnicodeSet  *fNS;
   3274     UnicodeSet  *fOP;
   3275     UnicodeSet  *fQU;
   3276     UnicodeSet  *fIS;
   3277     UnicodeSet  *fNU;
   3278     UnicodeSet  *fPO;
   3279     UnicodeSet  *fPR;
   3280     UnicodeSet  *fSY;
   3281     UnicodeSet  *fAI;
   3282     UnicodeSet  *fAL;
   3283     UnicodeSet  *fID;
   3284     UnicodeSet  *fSA;
   3285     UnicodeSet  *fXX;
   3286 
   3287     BreakIterator  *fCharBI;
   3288 
   3289     const UnicodeString  *fText;
   3290     int32_t              *fOrigPositions;
   3291 
   3292     RegexMatcher         *fNumberMatcher;
   3293     RegexMatcher         *fLB11Matcher;
   3294 };
   3295 
   3296 
   3297 RBBILineMonkey::RBBILineMonkey()
   3298 {
   3299     UErrorCode  status = U_ZERO_ERROR;
   3300 
   3301     fSets  = new UVector(status);
   3302 
   3303     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3304     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3305     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3306     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3307     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3308     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3309     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3310     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3311     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3312     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3313     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3314     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3315     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3316     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3317     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3318     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3319     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3320     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
   3321     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3322     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3323     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3324     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3325     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3326     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3327     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3328     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3329     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3330     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3331     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3332     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3333     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3334     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3335     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3336     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3337     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3338     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3339     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3340 
   3341     if (U_FAILURE(status)) {
   3342         deferredStatus = status;
   3343         fCharBI = NULL;
   3344         fNumberMatcher = NULL;
   3345         return;
   3346     }
   3347 
   3348     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3349     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3350     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3351     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3352 
   3353     fSets->addElement(fBK, status);
   3354     fSets->addElement(fCR, status);
   3355     fSets->addElement(fLF, status);
   3356     fSets->addElement(fCM, status);
   3357     fSets->addElement(fNL, status);
   3358     fSets->addElement(fWJ, status);
   3359     fSets->addElement(fZW, status);
   3360     fSets->addElement(fGL, status);
   3361     fSets->addElement(fCB, status);
   3362     fSets->addElement(fSP, status);
   3363     fSets->addElement(fB2, status);
   3364     fSets->addElement(fBA, status);
   3365     fSets->addElement(fBB, status);
   3366     fSets->addElement(fHY, status);
   3367     fSets->addElement(fH2, status);
   3368     fSets->addElement(fH3, status);
   3369     fSets->addElement(fCL, status);
   3370     fSets->addElement(fCP, status);
   3371     fSets->addElement(fEX, status);
   3372     fSets->addElement(fIN, status);
   3373     fSets->addElement(fJL, status);
   3374     fSets->addElement(fJT, status);
   3375     fSets->addElement(fJV, status);
   3376     fSets->addElement(fNS, status);
   3377     fSets->addElement(fOP, status);
   3378     fSets->addElement(fQU, status);
   3379     fSets->addElement(fIS, status);
   3380     fSets->addElement(fNU, status);
   3381     fSets->addElement(fPO, status);
   3382     fSets->addElement(fPR, status);
   3383     fSets->addElement(fSY, status);
   3384     fSets->addElement(fAI, status);
   3385     fSets->addElement(fAL, status);
   3386     fSets->addElement(fID, status);
   3387     fSets->addElement(fWJ, status);
   3388     fSets->addElement(fSA, status);
   3389     fSets->addElement(fSG, status);
   3390 
   3391     const char *rules =
   3392             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3393             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3394             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3395             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3396             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
   3397             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3398 
   3399     fNumberMatcher = new RegexMatcher(
   3400         UnicodeString(rules, -1, US_INV), 0, status);
   3401 
   3402     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3403 
   3404     if (U_FAILURE(status)) {
   3405         deferredStatus = status;
   3406     }
   3407 }
   3408 
   3409 
   3410 void RBBILineMonkey::setText(const UnicodeString &s) {
   3411     fText       = &s;
   3412     fCharBI->setText(s);
   3413     fNumberMatcher->reset(s);
   3414 }
   3415 
   3416 //
   3417 //  rule9Adjust
   3418 //     Line Break TR rules 9 and 10 implementation.
   3419 //     This deals with combining marks and other sequences that
   3420 //     that must be treated as if they were something other than what they actually are.
   3421 //
   3422 //     This is factored out into a separate function because it must be applied twice for
   3423 //     each potential break, once to the chars before the position being checked, then
   3424 //     again to the text following the possible break.
   3425 //
   3426 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3427     if (pos == -1) {
   3428         // Invalid initial position.  Happens during the warmup iteration of the
   3429         //   main loop in next().
   3430         return;
   3431     }
   3432 
   3433     int32_t  nPos = *nextPos;
   3434 
   3435     // LB 9  Keep combining sequences together.
   3436     //  advance over any CM class chars.  Note that Line Break CM is different
   3437     //  from the normal Grapheme Extend property.
   3438     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3439           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3440         for (;;) {
   3441             *nextChar = fText->char32At(nPos);
   3442             if (!fCM->contains(*nextChar)) {
   3443                 break;
   3444             }
   3445             nPos = fText->moveIndex32(nPos, 1);
   3446         }
   3447     }
   3448 
   3449 
   3450     // LB 9 Treat X CM* as if it were x.
   3451     //       No explicit action required.
   3452 
   3453     // LB 10  Treat any remaining combining mark as AL
   3454     if (fCM->contains(*posChar)) {
   3455         *posChar = 0x41;   // thisChar = 'A';
   3456     }
   3457 
   3458     // Push the updated nextPos and nextChar back to our caller.
   3459     // This only makes a difference if posChar got bigger by consuming a
   3460     // combining sequence.
   3461     *nextPos  = nPos;
   3462     *nextChar = fText->char32At(nPos);
   3463 }
   3464 
   3465 
   3466 
   3467 int32_t RBBILineMonkey::next(int32_t startPos) {
   3468     UErrorCode status = U_ZERO_ERROR;
   3469     int32_t    pos;       //  Index of the char following a potential break position
   3470     UChar32    thisChar;  //  Character at above position "pos"
   3471 
   3472     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3473     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3474                           //   and thisChar may not be adjacent because combining
   3475                           //   characters between them will be ignored.
   3476 
   3477     int32_t    nextPos;   //  Index of the next character following pos.
   3478                           //     Usually skips over combining marks.
   3479     int32_t    nextCPPos; //  Index of the code point following "pos."
   3480                           //     May point to a combining mark.
   3481     int32_t    tPos;      //  temp value.
   3482     UChar32    c;
   3483 
   3484     if (U_FAILURE(deferredStatus)) {
   3485         return -1;
   3486     }
   3487 
   3488     if (startPos >= fText->length()) {
   3489         return -1;
   3490     }
   3491 
   3492 
   3493     // Initial values for loop.  Loop will run the first time without finding breaks,
   3494     //                           while the invalid values shift out and the "this" and
   3495     //                           "prev" positions are filled in with good values.
   3496     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3497     thisChar = prevChar  = 0;
   3498     nextPos  = nextCPPos = startPos;
   3499 
   3500 
   3501     // Loop runs once per position in the test text, until a break position
   3502     //  is found.
   3503     for (;;) {
   3504         prevPos   = pos;
   3505         prevChar  = thisChar;
   3506 
   3507         pos       = nextPos;
   3508         thisChar  = fText->char32At(pos);
   3509 
   3510         nextCPPos = fText->moveIndex32(pos, 1);
   3511         nextPos   = nextCPPos;
   3512 
   3513         // Rule LB2 - Break at end of text.
   3514         if (pos >= fText->length()) {
   3515             break;
   3516         }
   3517 
   3518         // Rule LB 9 - adjust for combining sequences.
   3519         //             We do this one out-of-order because the adjustment does not change anything
   3520         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3521         //             be applied.
   3522         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3523         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3524         c = fText->char32At(nextPos);
   3525         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3526 
   3527         // If the loop is still warming up - if we haven't shifted the initial
   3528         //   -1 positions out of prevPos yet - loop back to advance the
   3529         //    position in the input without any further looking for breaks.
   3530         if (prevPos == -1) {
   3531             continue;
   3532         }
   3533 
   3534         // LB 4  Always break after hard line breaks,
   3535         if (fBK->contains(prevChar)) {
   3536             break;
   3537         }
   3538 
   3539         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3540         if (prevChar == 0x0d && thisChar == 0x0a) {
   3541             continue;
   3542         }
   3543         if (prevChar == 0x0d ||
   3544             prevChar == 0x0a ||
   3545             prevChar == 0x85)  {
   3546             break;
   3547         }
   3548 
   3549         // LB 6  Don't break before hard line breaks
   3550         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3551             fBK->contains(thisChar)) {
   3552                 continue;
   3553         }
   3554 
   3555 
   3556         // LB 7  Don't break before spaces or zero-width space.
   3557         if (fSP->contains(thisChar)) {
   3558             continue;
   3559         }
   3560 
   3561         if (fZW->contains(thisChar)) {
   3562             continue;
   3563         }
   3564 
   3565         // LB 8  Break after zero width space
   3566         if (fZW->contains(prevChar)) {
   3567             break;
   3568         }
   3569 
   3570         // LB 9, 10  Already done, at top of loop.
   3571         //
   3572 
   3573 
   3574         // LB 11  Do not break before or after WORD JOINER and related characters.
   3575         //    x  WJ
   3576         //    WJ  x
   3577         //
   3578         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3579             continue;
   3580         }
   3581 
   3582         // LB 12
   3583         //    GL  x
   3584         if (fGL->contains(prevChar)) {
   3585             continue;
   3586         }
   3587 
   3588         // LB 12a
   3589         //    [^SP BA HY] x GL
   3590         if (!(fSP->contains(prevChar) ||
   3591               fBA->contains(prevChar) ||
   3592               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3593             continue;
   3594         }
   3595 
   3596 
   3597 
   3598         // LB 13  Don't break before closings.
   3599         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
   3600         //        fall into LB 17 and the more general number regular expression.
   3601         //
   3602         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
   3603             !fNU->contains(prevChar) && fCP->contains(thisChar) ||
   3604                                         fEX->contains(thisChar) ||
   3605             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
   3606             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
   3607             continue;
   3608         }
   3609 
   3610         // LB 14 Don't break after OP SP*
   3611         //       Scan backwards, checking for this sequence.
   3612         //       The OP char could include combining marks, so we actually check for
   3613         //           OP CM* SP*
   3614         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3615         //       sequence into a ID char, so before scanning back through spaces,
   3616         //       verify that prevChar is indeed a space.  The prevChar variable
   3617         //       may differ from fText[prevPos]
   3618         tPos = prevPos;
   3619         if (fSP->contains(prevChar)) {
   3620             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3621                 tPos=fText->moveIndex32(tPos, -1);
   3622             }
   3623         }
   3624         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3625             tPos=fText->moveIndex32(tPos, -1);
   3626         }
   3627         if (fOP->contains(fText->char32At(tPos))) {
   3628             continue;
   3629         }
   3630 
   3631 
   3632         // LB 15    QU SP* x OP
   3633         if (fOP->contains(thisChar)) {
   3634             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3635             int tPos = prevPos;
   3636             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3637                 tPos = fText->moveIndex32(tPos, -1);
   3638             }
   3639             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3640                 tPos = fText->moveIndex32(tPos, -1);
   3641             }
   3642             if (fQU->contains(fText->char32At(tPos))) {
   3643                 continue;
   3644             }
   3645         }
   3646 
   3647 
   3648 
   3649         // LB 16   (CL | CP) SP* x NS
   3650         //    Scan backwards for SP* CM* (CL | CP)
   3651         if (fNS->contains(thisChar)) {
   3652             int tPos = prevPos;
   3653             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3654                 tPos = fText->moveIndex32(tPos, -1);
   3655             }
   3656             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3657                 tPos = fText->moveIndex32(tPos, -1);
   3658             }
   3659             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
   3660                 continue;
   3661             }
   3662         }
   3663 
   3664 
   3665         // LB 17        B2 SP* x B2
   3666         if (fB2->contains(thisChar)) {
   3667             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3668             tPos = prevPos;
   3669             if (fSP->contains(prevChar)) {
   3670                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3671                     tPos=fText->moveIndex32(tPos, -1);
   3672                 }
   3673             }
   3674             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3675                 tPos=fText->moveIndex32(tPos, -1);
   3676             }
   3677             if (fB2->contains(fText->char32At(tPos))) {
   3678                 continue;
   3679             }
   3680         }
   3681 
   3682 
   3683         // LB 18    break after space
   3684         if (fSP->contains(prevChar)) {
   3685             break;
   3686         }
   3687 
   3688         // LB 19
   3689         //    x   QU
   3690         //    QU  x
   3691         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3692             continue;
   3693         }
   3694 
   3695         // LB 20  Break around a CB
   3696         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3697             break;
   3698         }
   3699 
   3700         // LB 21
   3701         if (fBA->contains(thisChar) ||
   3702             fHY->contains(thisChar) ||
   3703             fNS->contains(thisChar) ||
   3704             fBB->contains(prevChar) )   {
   3705             continue;
   3706         }
   3707 
   3708         // LB 22
   3709         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
   3710             fID->contains(prevChar) && fIN->contains(thisChar) ||
   3711             fIN->contains(prevChar) && fIN->contains(thisChar) ||
   3712             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
   3713             continue;
   3714         }
   3715 
   3716 
   3717         // LB 23    ID x PO
   3718         //          AL x NU
   3719         //          NU x AL
   3720         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
   3721             fAL->contains(prevChar) && fNU->contains(thisChar) ||
   3722             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
   3723             continue;
   3724         }
   3725 
   3726         // LB 24  Do not break between prefix and letters or ideographs.
   3727         //        PR x ID
   3728         //        PR x AL
   3729         //        PO x AL
   3730         if (fPR->contains(prevChar) && fID->contains(thisChar) ||
   3731             fPR->contains(prevChar) && fAL->contains(thisChar) ||
   3732             fPO->contains(prevChar) && fAL->contains(thisChar) )   {
   3733             continue;
   3734         }
   3735 
   3736 
   3737 
   3738         // LB 25    Numbers
   3739         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3740             if (U_FAILURE(status)) {
   3741                 break;
   3742             }
   3743             // Matched a number.  But could have been just a single digit, which would
   3744             //    not represent a "no break here" between prevChar and thisChar
   3745             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3746             if (numEndIdx > pos) {
   3747                 // Number match includes at least our two chars being checked
   3748                 if (numEndIdx > nextPos) {
   3749                     // Number match includes additional chars.  Update pos and nextPos
   3750                     //   so that next loop iteration will continue at the end of the number,
   3751                     //   checking for breaks between last char in number & whatever follows.
   3752                     pos = nextPos = numEndIdx;
   3753                     do {
   3754                         pos = fText->moveIndex32(pos, -1);
   3755                         thisChar = fText->char32At(pos);
   3756                     } while (fCM->contains(thisChar));
   3757                 }
   3758                 continue;
   3759             }
   3760         }
   3761 
   3762 
   3763         // LB 26 Do not break a Korean syllable.
   3764         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3765                                         fJV->contains(thisChar) ||
   3766                                         fH2->contains(thisChar) ||
   3767                                         fH3->contains(thisChar))) {
   3768                                             continue;
   3769                                         }
   3770 
   3771         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3772             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3773                 continue;
   3774         }
   3775 
   3776         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3777             fJT->contains(thisChar)) {
   3778                 continue;
   3779         }
   3780 
   3781         // LB 27 Treat a Korean Syllable Block the same as ID.
   3782         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3783             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3784             fIN->contains(thisChar)) {
   3785                 continue;
   3786             }
   3787         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3788             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3789             fPO->contains(thisChar)) {
   3790                 continue;
   3791             }
   3792         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3793             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3794                 continue;
   3795             }
   3796 
   3797 
   3798 
   3799         // LB 28  Do not break between alphabetics ("at").
   3800         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   3801             continue;
   3802         }
   3803 
   3804         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3805         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   3806             continue;
   3807         }
   3808 
   3809         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
   3810         //          (AL | NU) x OP
   3811         //          CP x (AL | NU)
   3812         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
   3813             continue;
   3814         }
   3815         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
   3816             continue;
   3817         }
   3818 
   3819         // LB 31    Break everywhere else
   3820         break;
   3821 
   3822     }
   3823 
   3824     return pos;
   3825 }
   3826 
   3827 
   3828 UVector  *RBBILineMonkey::charClasses() {
   3829     return fSets;
   3830 }
   3831 
   3832 
   3833 RBBILineMonkey::~RBBILineMonkey() {
   3834     delete fSets;
   3835 
   3836     delete fBK;
   3837     delete fCR;
   3838     delete fLF;
   3839     delete fCM;
   3840     delete fNL;
   3841     delete fWJ;
   3842     delete fZW;
   3843     delete fGL;
   3844     delete fCB;
   3845     delete fSP;
   3846     delete fB2;
   3847     delete fBA;
   3848     delete fBB;
   3849     delete fHY;
   3850     delete fH2;
   3851     delete fH3;
   3852     delete fCL;
   3853     delete fCP;
   3854     delete fEX;
   3855     delete fIN;
   3856     delete fJL;
   3857     delete fJV;
   3858     delete fJT;
   3859     delete fNS;
   3860     delete fOP;
   3861     delete fQU;
   3862     delete fIS;
   3863     delete fNU;
   3864     delete fPO;
   3865     delete fPR;
   3866     delete fSY;
   3867     delete fAI;
   3868     delete fAL;
   3869     delete fID;
   3870     delete fSA;
   3871     delete fSG;
   3872     delete fXX;
   3873 
   3874     delete fCharBI;
   3875     delete fNumberMatcher;
   3876 }
   3877 
   3878 
   3879 //-------------------------------------------------------------------------------------------
   3880 //
   3881 //   TestMonkey
   3882 //
   3883 //     params
   3884 //       seed=nnnnn        Random number starting seed.
   3885 //                         Setting the seed allows errors to be reproduced.
   3886 //       loop=nnn          Looping count.  Controls running time.
   3887 //                         -1:  run forever.
   3888 //                          0 or greater:  run length.
   3889 //
   3890 //       type = char | word | line | sent | title
   3891 //
   3892 //-------------------------------------------------------------------------------------------
   3893 
   3894 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3895     int32_t val = defaultVal;
   3896     name.append(" *= *(-?\\d+)");
   3897     UErrorCode status = U_ZERO_ERROR;
   3898     RegexMatcher m(name, params, 0, status);
   3899     if (m.find()) {
   3900         // The param exists.  Convert the string to an int.
   3901         char valString[100];
   3902         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3903         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3904             paramLength = (int32_t)(sizeof(valString)-2);
   3905         }
   3906         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3907         val = strtol(valString,  NULL, 10);
   3908 
   3909         // Delete this parameter from the params string.
   3910         m.reset();
   3911         params = m.replaceFirst("", status);
   3912     }
   3913     U_ASSERT(U_SUCCESS(status));
   3914     return val;
   3915 }
   3916 #endif
   3917 
   3918 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3919                                     BreakIterator *bi,
   3920                                     int expected[],
   3921                                     int expectedcount)
   3922 {
   3923     int count = 0;
   3924     int i = 0;
   3925     int forward[50];
   3926     bi->setText(ustr);
   3927     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3928         forward[count] = i;
   3929         if (count < expectedcount && expected[count] != i) {
   3930             test->errln("break forward test failed: expected %d but got %d",
   3931                         expected[count], i);
   3932             break;
   3933         }
   3934         count ++;
   3935     }
   3936     if (count != expectedcount) {
   3937         printStringBreaks(ustr, expected, expectedcount);
   3938         test->errln("break forward test failed: missed %d match",
   3939                     expectedcount - count);
   3940         return;
   3941     }
   3942     // testing boundaries
   3943     for (i = 1; i < expectedcount; i ++) {
   3944         int j = expected[i - 1];
   3945         if (!bi->isBoundary(j)) {
   3946             printStringBreaks(ustr, expected, expectedcount);
   3947             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3948             return;
   3949         }
   3950         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3951             if (bi->isBoundary(j)) {
   3952                 printStringBreaks(ustr, expected, expectedcount);
   3953                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3954                 return;
   3955             }
   3956         }
   3957     }
   3958 
   3959     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3960         count --;
   3961         if (forward[count] != i) {
   3962             test->errln("happy break test previous() failed: expected %d but got %d",
   3963                         forward[count], i);
   3964             break;
   3965         }
   3966     }
   3967     if (count != 0) {
   3968         printStringBreaks(ustr, expected, expectedcount);
   3969         test->errln("break test previous() failed: missed a match");
   3970         return;
   3971     }
   3972 
   3973     // testing preceding
   3974     for (i = 0; i < expectedcount - 1; i ++) {
   3975         // int j = expected[i] + 1;
   3976         int j = ustr.moveIndex32(expected[i], 1);
   3977         for (; j <= expected[i + 1]; j ++) {
   3978             if (bi->preceding(j) != expected[i]) {
   3979                 printStringBreaks(ustr, expected, expectedcount);
   3980                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3981                 return;
   3982             }
   3983         }
   3984     }
   3985 }
   3986 
   3987 void RBBITest::TestWordBreaks(void)
   3988 {
   3989 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3990 
   3991     Locale        locale("en");
   3992     UErrorCode    status = U_ZERO_ERROR;
   3993     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3994     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3995     static const char *strlist[] =
   3996     {
   3997     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3998     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
   3999     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   4000     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   4001     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
   4002     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4003     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   4004     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
   4005     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4006     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4007     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4008     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4009     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4010     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4011     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4012     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4013     "\\u0027\\u11af\\U000e0057\\u0602",
   4014     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4015     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4016     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4017     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4018     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4019     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4020     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4021     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4022     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4023     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4024     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4025     "\\ua183\\u102d\\u0bec\\u003a",
   4026     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4027     "\\u003a\\u0e57\\u0fad\\u002e",
   4028     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4029     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4030     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   4031     "\\u003a\\u0664\\u00b7\\u1fba",
   4032     "\\u003b\\u0027\\u00b7\\u47a3",
   4033     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
   4034     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   4035     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   4036     };
   4037     int loop;
   4038     if (U_FAILURE(status)) {
   4039         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4040         return;
   4041     }
   4042     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4043         // printf("looping %d\n", loop);
   4044         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   4045         // RBBICharMonkey monkey;
   4046         RBBIWordMonkey monkey;
   4047 
   4048         int expected[50];
   4049         int expectedcount = 0;
   4050 
   4051         monkey.setText(ustr);
   4052         int i;
   4053         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4054             expected[expectedcount ++] = i;
   4055         }
   4056 
   4057         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4058     }
   4059     delete bi;
   4060 #endif
   4061 }
   4062 
   4063 void RBBITest::TestWordBoundary(void)
   4064 {
   4065     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   4066     Locale        locale("en");
   4067     UErrorCode    status = U_ZERO_ERROR;
   4068     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4069     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   4070     UChar         str[50];
   4071     static const char *strlist[] =
   4072     {
   4073     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4074     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4075     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4076     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4077     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4078     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4079     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4080     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4081     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4082     "\\u0027\\u11af\\U000e0057\\u0602",
   4083     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4084     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4085     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4086     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4087     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4088     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4089     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4090     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4091     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4092     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4093     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4094     "\\ua183\\u102d\\u0bec\\u003a",
   4095     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4096     "\\u003a\\u0e57\\u0fad\\u002e",
   4097     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4098     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4099     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4100     "\\u003a\\u0664\\u00b7\\u1fba",
   4101     "\\u003b\\u0027\\u00b7\\u47a3",
   4102     };
   4103     int loop;
   4104     if (U_FAILURE(status)) {
   4105         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4106         return;
   4107     }
   4108     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4109         // printf("looping %d\n", loop);
   4110         u_unescape(strlist[loop], str, 20);
   4111         UnicodeString ustr(str);
   4112         int forward[50];
   4113         int count = 0;
   4114 
   4115         bi->setText(ustr);
   4116         int prev = 0;
   4117         int i;
   4118         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4119             forward[count ++] = i;
   4120             if (i > prev) {
   4121                 int j;
   4122                 for (j = prev + 1; j < i; j ++) {
   4123                     if (bi->isBoundary(j)) {
   4124                         printStringBreaks(ustr, forward, count);
   4125                         errln("happy boundary test failed: expected %d not a boundary",
   4126                                j);
   4127                         return;
   4128                     }
   4129                 }
   4130             }
   4131             if (!bi->isBoundary(i)) {
   4132                 printStringBreaks(ustr, forward, count);
   4133                 errln("happy boundary test failed: expected %d a boundary",
   4134                        i);
   4135                 return;
   4136             }
   4137             prev = i;
   4138         }
   4139     }
   4140     delete bi;
   4141 }
   4142 
   4143 void RBBITest::TestLineBreaks(void)
   4144 {
   4145 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4146     Locale        locale("en");
   4147     UErrorCode    status = U_ZERO_ERROR;
   4148     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4149     const int32_t  STRSIZE = 50;
   4150     UChar         str[STRSIZE];
   4151     static const char *strlist[] =
   4152     {
   4153      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4154      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4155              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4156      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4157              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4158      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4159      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4160      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4161      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4162      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4163      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4164      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4165      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4166      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4167      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4168      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4169      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4170      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4171      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4172      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4173      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4174      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4175      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4176      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4177      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4178      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4179      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4180      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4181      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4182      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4183      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4184      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4185      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4186      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4187      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4188      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4189      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4190      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4191      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4192      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4193      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4194      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4195          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4196          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4197          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4198      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4199          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4200     };
   4201     int loop;
   4202     TEST_ASSERT_SUCCESS(status);
   4203     if (U_FAILURE(status)) {
   4204         return;
   4205     }
   4206     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4207         // printf("looping %d\n", loop);
   4208         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4209         if (t >= STRSIZE) {
   4210             TEST_ASSERT(FALSE);
   4211             continue;
   4212         }
   4213 
   4214 
   4215         UnicodeString ustr(str);
   4216         RBBILineMonkey monkey;
   4217         if (U_FAILURE(monkey.deferredStatus)) {
   4218             continue;
   4219         }
   4220 
   4221         const int EXPECTEDSIZE = 50;
   4222         int expected[EXPECTEDSIZE];
   4223         int expectedcount = 0;
   4224 
   4225         monkey.setText(ustr);
   4226         int i;
   4227         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4228             if (expectedcount >= EXPECTEDSIZE) {
   4229                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4230                 return;
   4231             }
   4232             expected[expectedcount ++] = i;
   4233         }
   4234 
   4235         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4236     }
   4237     delete bi;
   4238 #endif
   4239 }
   4240 
   4241 void RBBITest::TestSentBreaks(void)
   4242 {
   4243 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4244     Locale        locale("en");
   4245     UErrorCode    status = U_ZERO_ERROR;
   4246     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4247     UChar         str[200];
   4248     static const char *strlist[] =
   4249     {
   4250      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4251      "This\n",
   4252      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4253      "\"Sentence ending with a quote.\" Bye.",
   4254      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4255      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4256      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4257      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4258      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4259      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4260      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4261              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4262              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4263              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4264      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4265              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4266              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4267              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4268              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4269              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4270     };
   4271     int loop;
   4272     if (U_FAILURE(status)) {
   4273         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4274         return;
   4275     }
   4276     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4277         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4278         UnicodeString ustr(str);
   4279 
   4280         RBBISentMonkey monkey;
   4281         if (U_FAILURE(monkey.deferredStatus)) {
   4282             continue;
   4283         }
   4284 
   4285         const int EXPECTEDSIZE = 50;
   4286         int expected[EXPECTEDSIZE];
   4287         int expectedcount = 0;
   4288 
   4289         monkey.setText(ustr);
   4290         int i;
   4291         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4292             if (expectedcount >= EXPECTEDSIZE) {
   4293                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4294                 return;
   4295             }
   4296             expected[expectedcount ++] = i;
   4297         }
   4298 
   4299         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4300     }
   4301     delete bi;
   4302 #endif
   4303 }
   4304 
   4305 void RBBITest::TestMonkey(char *params) {
   4306 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4307 
   4308     UErrorCode     status    = U_ZERO_ERROR;
   4309     int32_t        loopCount = 500;
   4310     int32_t        seed      = 1;
   4311     UnicodeString  breakType = "all";
   4312     Locale         locale("en");
   4313     UBool          useUText  = FALSE;
   4314 
   4315     if (quick == FALSE) {
   4316         loopCount = 10000;
   4317     }
   4318 
   4319     if (params) {
   4320         UnicodeString p(params);
   4321         loopCount = getIntParam("loop", p, loopCount);
   4322         seed      = getIntParam("seed", p, seed);
   4323 
   4324         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4325         if (m.find()) {
   4326             breakType = m.group(1, status);
   4327             m.reset();
   4328             p = m.replaceFirst("", status);
   4329         }
   4330 
   4331         RegexMatcher u(" *utext", p, 0, status);
   4332         if (u.find()) {
   4333             useUText = TRUE;
   4334             u.reset();
   4335             p = u.replaceFirst("", status);
   4336         }
   4337 
   4338 
   4339         // m.reset(p);
   4340         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4341             // Each option is stripped out of the option string as it is processed.
   4342             // All options have been checked.  The option string should have been completely emptied..
   4343             char buf[100];
   4344             p.extract(buf, sizeof(buf), NULL, status);
   4345             buf[sizeof(buf)-1] = 0;
   4346             errln("Unrecognized or extra parameter:  %s\n", buf);
   4347             return;
   4348         }
   4349 
   4350     }
   4351 
   4352     if (breakType == "char" || breakType == "all") {
   4353         RBBICharMonkey  m;
   4354         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4355         if (U_SUCCESS(status)) {
   4356             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4357             if (breakType == "all" && useUText==FALSE) {
   4358                 // Also run a quick test with UText when "all" is specified
   4359                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4360             }
   4361         }
   4362         else {
   4363             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4364         }
   4365         delete bi;
   4366     }
   4367 
   4368     if (breakType == "word" || breakType == "all") {
   4369         logln("Word Break Monkey Test");
   4370         RBBIWordMonkey  m;
   4371         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4372         if (U_SUCCESS(status)) {
   4373             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4374         }
   4375         else {
   4376             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4377         }
   4378         delete bi;
   4379     }
   4380 
   4381     if (breakType == "line" || breakType == "all") {
   4382         logln("Line Break Monkey Test");
   4383         RBBILineMonkey  m;
   4384         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4385         if (loopCount >= 10) {
   4386             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4387         }
   4388         if (U_SUCCESS(status)) {
   4389             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4390         }
   4391         else {
   4392             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4393         }
   4394         delete bi;
   4395     }
   4396 
   4397     if (breakType == "sent" || breakType == "all"  ) {
   4398         logln("Sentence Break Monkey Test");
   4399         RBBISentMonkey  m;
   4400         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4401         if (loopCount >= 10) {
   4402             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4403         }
   4404         if (U_SUCCESS(status)) {
   4405             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4406         }
   4407         else {
   4408             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4409         }
   4410         delete bi;
   4411     }
   4412 
   4413 #endif
   4414 }
   4415 
   4416 //
   4417 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4418 //    Parameters:
   4419 //       bi      - the break iterator to use
   4420 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4421 //       name    - Name of test (char, word, etc.) for use in error messages
   4422 //       seed    - Seed for starting random number generator (parameter from user)
   4423 //       numIterations
   4424 //
   4425 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4426                          int32_t numIterations, UBool useUText) {
   4427 
   4428 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4429 
   4430     const int32_t    TESTSTRINGLEN = 500;
   4431     UnicodeString    testText;
   4432     int32_t          numCharClasses;
   4433     UVector          *chClasses;
   4434     int              expected[TESTSTRINGLEN*2 + 1];
   4435     int              expectedCount = 0;
   4436     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4437     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4438     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4439     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4440     char             followingBreaks[TESTSTRINGLEN*2+1];
   4441     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4442     int              i;
   4443     int              loopCount = 0;
   4444 
   4445     m_seed = seed;
   4446 
   4447     numCharClasses = mk.charClasses()->size();
   4448     chClasses      = mk.charClasses();
   4449 
   4450     // Check for errors that occured during the construction of the MonkeyKind object.
   4451     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4452     //  and is not visible outside of RBBITest :-(
   4453     if (U_FAILURE(mk.deferredStatus)) {
   4454         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4455         return;
   4456     }
   4457 
   4458     // Verify that the character classes all have at least one member.
   4459     for (i=0; i<numCharClasses; i++) {
   4460         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4461         if (s == NULL || s->size() == 0) {
   4462             errln("Character Class #%d is null or of zero size.", i);
   4463             return;
   4464         }
   4465     }
   4466 
   4467     while (loopCount < numIterations || numIterations == -1) {
   4468         if (numIterations == -1 && loopCount % 10 == 0) {
   4469             // If test is running in an infinite loop, display a periodic tic so
   4470             //   we can tell that it is making progress.
   4471             fprintf(stderr, ".");
   4472         }
   4473         // Save current random number seed, so that we can recreate the random numbers
   4474         //   for this loop iteration in event of an error.
   4475         seed = m_seed;
   4476 
   4477         // Populate a test string with data.
   4478         testText.truncate(0);
   4479         for (i=0; i<TESTSTRINGLEN; i++) {
   4480             int32_t  aClassNum = m_rand() % numCharClasses;
   4481             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4482             int32_t   charIdx = m_rand() % classSet->size();
   4483             UChar32   c = classSet->charAt(charIdx);
   4484             if (c < 0) {   // TODO:  deal with sets containing strings.
   4485                 errln("c < 0");
   4486                 break;
   4487             }
   4488             testText.append(c);
   4489         }
   4490 
   4491         // Calculate the expected results for this test string.
   4492         mk.setText(testText);
   4493         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4494         expectedBreaks[0] = 1;
   4495         int32_t breakPos = 0;
   4496         expectedCount = 0;
   4497         for (;;) {
   4498             breakPos = mk.next(breakPos);
   4499             if (breakPos == -1) {
   4500                 break;
   4501             }
   4502             if (breakPos > testText.length()) {
   4503                 errln("breakPos > testText.length()");
   4504             }
   4505             expectedBreaks[breakPos] = 1;
   4506             U_ASSERT(expectedCount<testText.length());
   4507             expected[expectedCount ++] = breakPos;
   4508         }
   4509 
   4510         // Find the break positions using forward iteration
   4511         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4512         if (useUText) {
   4513             UErrorCode status = U_ZERO_ERROR;
   4514             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4515             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4516             bi->setText(testUText, status);
   4517             TEST_ASSERT_SUCCESS(status);
   4518             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4519                                       //  This UText can be closed immediately, so long as the
   4520                                       //  testText string continues to exist.
   4521         } else {
   4522             bi->setText(testText);
   4523         }
   4524 
   4525         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4526             if (i < 0 || i > testText.length()) {
   4527                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4528                 break;
   4529             }
   4530             forwardBreaks[i] = 1;
   4531         }
   4532 
   4533         // Find the break positions using reverse iteration
   4534         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4535         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4536             if (i < 0 || i > testText.length()) {
   4537                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4538                 break;
   4539             }
   4540             reverseBreaks[i] = 1;
   4541         }
   4542 
   4543         // Find the break positions using isBoundary() tests.
   4544         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4545         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4546         for (i=0; i<=testText.length(); i++) {
   4547             isBoundaryBreaks[i] = bi->isBoundary(i);
   4548         }
   4549 
   4550 
   4551         // Find the break positions using the following() function.
   4552         // printf(".");
   4553         memset(followingBreaks, 0, sizeof(followingBreaks));
   4554         int32_t   lastBreakPos = 0;
   4555         followingBreaks[0] = 1;
   4556         for (i=0; i<testText.length(); i++) {
   4557             breakPos = bi->following(i);
   4558             if (breakPos <= i ||
   4559                 breakPos < lastBreakPos ||
   4560                 breakPos > testText.length() ||
   4561                 breakPos > lastBreakPos && lastBreakPos > i ) {
   4562                 errln("%s break monkey test: "
   4563                     "Out of range value returned by BreakIterator::following().\n"
   4564                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4565                          name, seed, i, breakPos, lastBreakPos);
   4566                 break;
   4567             }
   4568             followingBreaks[breakPos] = 1;
   4569             lastBreakPos = breakPos;
   4570         }
   4571 
   4572         // Find the break positions using the preceding() function.
   4573         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4574         lastBreakPos = testText.length();
   4575         precedingBreaks[testText.length()] = 1;
   4576         for (i=testText.length(); i>0; i--) {
   4577             breakPos = bi->preceding(i);
   4578             if (breakPos >= i ||
   4579                 breakPos > lastBreakPos ||
   4580                 breakPos < 0 && testText.getChar32Start(i)>0 ||
   4581                 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
   4582                 errln("%s break monkey test: "
   4583                     "Out of range value returned by BreakIterator::preceding().\n"
   4584                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4585                     name,  i, breakPos, lastBreakPos);
   4586                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4587                     precedingBreaks[i] = 2;   // Forces an error.
   4588                 }
   4589             } else {
   4590                 if (breakPos >= 0) {
   4591                     precedingBreaks[breakPos] = 1;
   4592                 }
   4593                 lastBreakPos = breakPos;
   4594             }
   4595         }
   4596 
   4597         // Compare the expected and actual results.
   4598         for (i=0; i<=testText.length(); i++) {
   4599             const char *errorType = NULL;
   4600             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4601                 errorType = "next()";
   4602             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4603                 errorType = "previous()";
   4604             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4605                 errorType = "isBoundary()";
   4606             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4607                 errorType = "following()";
   4608             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4609                 errorType = "preceding()";
   4610             }
   4611 
   4612 
   4613             if (errorType != NULL) {
   4614                 // Format a range of the test text that includes the failure as
   4615                 //  a data item that can be included in the rbbi test data file.
   4616 
   4617                 // Start of the range is the last point where expected and actual results
   4618                 //   both agreed that there was a break position.
   4619                 int startContext = i;
   4620                 int32_t count = 0;
   4621                 for (;;) {
   4622                     if (startContext==0) { break; }
   4623                     startContext --;
   4624                     if (expectedBreaks[startContext] != 0) {
   4625                         if (count == 2) break;
   4626                         count ++;
   4627                     }
   4628                 }
   4629 
   4630                 // End of range is two expected breaks past the start position.
   4631                 int endContext = i + 1;
   4632                 int ci;
   4633                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4634                     for (;;) {
   4635                         if (endContext >= testText.length()) {break;}
   4636                         if (expectedBreaks[endContext-1] != 0) {
   4637                             if (count == 0) break;
   4638                             count --;
   4639                         }
   4640                         endContext ++;
   4641                     }
   4642                 }
   4643 
   4644                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4645                 UnicodeString errorText = "<data>";
   4646                 /***if (strcmp(errorType, "next()") == 0) {
   4647                     startContext = 0;
   4648                     endContext = testText.length();
   4649 
   4650                     printStringBreaks(testText, expected, expectedCount);
   4651                 }***/
   4652 
   4653                 for (ci=startContext; ci<endContext;) {
   4654                     UnicodeString hexChars("0123456789abcdef");
   4655                     UChar32  c;
   4656                     int      bn;
   4657                     c = testText.char32At(ci);
   4658                     if (ci == i) {
   4659                         // This is the location of the error.
   4660                         errorText.append("<?>");
   4661                     } else if (expectedBreaks[ci] != 0) {
   4662                         // This a non-error expected break position.
   4663                         errorText.append("\\");
   4664                     }
   4665                     if (c < 0x10000) {
   4666                         errorText.append("\\u");
   4667                         for (bn=12; bn>=0; bn-=4) {
   4668                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4669                         }
   4670                     } else {
   4671                         errorText.append("\\U");
   4672                         for (bn=28; bn>=0; bn-=4) {
   4673                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4674                         }
   4675                     }
   4676                     ci = testText.moveIndex32(ci, 1);
   4677                 }
   4678                 errorText.append("\\");
   4679                 errorText.append("</data>\n");
   4680 
   4681                 // Output the error
   4682                 char  charErrorTxt[500];
   4683                 UErrorCode status = U_ZERO_ERROR;
   4684                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4685                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4686                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4687                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4688                     errorType, seed, i, charErrorTxt);
   4689                 break;
   4690             }
   4691         }
   4692 
   4693         loopCount++;
   4694     }
   4695 #endif
   4696 }
   4697 
   4698 //
   4699 //  TestDebug    -  A place-holder test for debugging purposes.
   4700 //                  For putting in fragments of other tests that can be invoked
   4701 //                  for tracing  without a lot of unwanted extra stuff happening.
   4702 //
   4703 void RBBITest::TestDebug(void) {
   4704 #if 0
   4705     UErrorCode   status = U_ZERO_ERROR;
   4706     int pos = 0;
   4707     int ruleStatus = 0;
   4708 
   4709     RuleBasedBreakIterator* bi =
   4710        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4711        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4712        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4713     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4714     // UnicodeString s("Aaa.  Bcd");
   4715     s = s.unescape();
   4716     bi->setText(s);
   4717     UBool r = bi->isBoundary(8);
   4718     printf("%s", r?"true":"false");
   4719     return;
   4720     pos = bi->last();
   4721     do {
   4722         // ruleStatus = bi->getRuleStatus();
   4723         printf("%d\t%d\n", pos, ruleStatus);
   4724         pos = bi->previous();
   4725     } while (pos != BreakIterator::DONE);
   4726 #endif
   4727 }
   4728 
   4729 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4730