Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1999-2009, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /************************************************************************
      7 *   Date        Name        Description
      8 *   12/15/99    Madhu        Creation.
      9 *   01/12/2000  Madhu        Updated for changed API and added new tests
     10 ************************************************************************/
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/utypes.h"
     17 #include "unicode/brkiter.h"
     18 #include "unicode/rbbi.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/utf16.h"
     21 #include "unicode/ucnv.h"
     22 #include "unicode/schriter.h"
     23 #include "unicode/uniset.h"
     24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     25 #include "unicode/ustring.h"
     26 #include "unicode/utext.h"
     27 #include "intltest.h"
     28 #include "rbbitst.h"
     29 #include <string.h>
     30 #include "uvector.h"
     31 #include "uvectr32.h"
     32 #include "triedict.h"
     33 #include <string.h>
     34 #include <stdio.h>
     35 #include <stdlib.h>
     36 
     37 #define TEST_ASSERT(x) {if (!(x)) { \
     38     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     39 
     40 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     41     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
     42 
     43 
     44 //---------------------------------------------
     45 // runIndexedTest
     46 //---------------------------------------------
     47 
     48 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
     49 {
     50     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
     51 
     52     switch (index) {
     53         case 0: name = "TestBug4153072";
     54             if(exec) TestBug4153072();                         break;
     55         case 1: name = "TestJapaneseLineBreak";
     56             if(exec) TestJapaneseLineBreak();                  break;
     57         case 2: name = "TestStatusReturn";
     58             if(exec) TestStatusReturn();                       break;
     59         case 3: name = "TestUnicodeFiles";
     60             if(exec) TestUnicodeFiles();                       break;
     61         case 4: name = "TestEmptyString";
     62             if(exec) TestEmptyString();                        break;
     63 
     64         case 5: name = "TestGetAvailableLocales";
     65             if(exec) TestGetAvailableLocales();                break;
     66 
     67         case 6: name = "TestGetDisplayName";
     68             if(exec) TestGetDisplayName();                     break;
     69 
     70         case 7: name = "TestEndBehaviour";
     71             if(exec) TestEndBehaviour();                       break;
     72         case 8: name = "TestMixedThaiLineBreak";
     73              // BEGIN android-removed
     74              // Disable all Thai breakiterator tests.
     75              /* if(exec) TestMixedThaiLineBreak();    */       break;
     76              // END android-removed
     77         case 9: name = "TestThaiLineBreak";
     78              // BEGIN android-removed
     79              // Disable all Thai breakiterator tests.
     80              /* if(exec) TestThaiLineBreak();         */       break;
     81              // END android-removed
     82         case 10: name = "TestMaiyamok";
     83              // BEGIN android-removed
     84              // Disable all Thai breakiterator tests.
     85              /* if(exec) TestMaiyamok();              */       break;
     86              // END android-removed
     87         case 11: name = "TestWordBreaks";
     88              if(exec) TestWordBreaks();                        break;
     89         case 12: name = "TestWordBoundary";
     90              if(exec) TestWordBoundary();                      break;
     91         case 13: name = "TestLineBreaks";
     92              if(exec) TestLineBreaks();                        break;
     93         case 14: name = "TestSentBreaks";
     94              if(exec) TestSentBreaks();                        break;
     95         case 15: name = "TestExtended";
     96              if(exec) TestExtended();                          break;
     97         case 16: name = "TestMonkey";
     98              if(exec) {
     99  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    100                TestMonkey(params);
    101  #else
    102                logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
    103  #endif
    104              }
    105                                                                break;
    106         case 17: name = "TestBug3818";
    107              // BEGIN android-removed
    108              // Disable all Thai breakiterator tests.
    109              /* if(exec) TestBug3818();                 */     break;
    110              // END android-removed
    111         case 18: name = "TestJapaneseWordBreak";
    112             if(exec) TestJapaneseWordBreak();                  break;
    113         case 19: name = "TestDebug";
    114             if(exec) TestDebug();                              break;
    115         case 20: name = "TestTrieDict";
    116             if(exec) TestTrieDict();                           break;
    117         case 21: name = "TestBug5775";
    118             if (exec) TestBug5775();                           break;
    119         case 22: name = "TestThaiBreaks";
    120              // BEGIN android-removed
    121              // Disable all Thai breakiterator tests.
    122              /* if (exec) TestThaiBreaks();             */     break;
    123              // END android-removed
    124         case 23: name = "TestTailoredBreaks";
    125             if (exec) TestTailoredBreaks();                    break;
    126 
    127         default: name = ""; break; //needed to end loop
    128     }
    129 }
    130 
    131 
    132 //---------------------------------------------------------------------------
    133 //
    134 //   class BITestData   Holds a set of Break iterator test data and results
    135 //                      Includes
    136 //                         - the string data to be broken
    137 //                         - a vector of the expected break positions.
    138 //                         - a vector of source line numbers for the data,
    139 //                               (to help see where errors occurred.)
    140 //                         - The expected break tag values.
    141 //                         - Vectors of actual break positions and tag values.
    142 //                         - Functions for comparing actual with expected and
    143 //                            reporting errors.
    144 //
    145 //----------------------------------------------------------------------------
    146 class BITestData {
    147 public:
            // Text under test: addDataChunk() appends every non-NULL chunk to it.
    148     UnicodeString    fDataToBreak;
            // Length of fDataToBreak recorded after each addDataChunk() call.
    149     UVector          fExpectedBreakPositions;
            // Tag passed to addDataChunk(), one entry per expected break position.
    150     UVector          fExpectedTags;
            // Source line number passed to addDataChunk(), one entry per expected break position.
    151     UVector          fLineNum;
    152     UVector          fActualBreakPositions;   // Test Results.
            // Actual tag values; checkResults() reads them at the same index as fActualBreakPositions.
    153     UVector          fActualTags;
    154 
            // Constructs all five vectors with the caller's status.
    155     BITestData(UErrorCode &status);
            // Appends one chunk of test data and records its end position, tag and line number.
            // status is passed by value, so an error set while adding is not visible to the caller.
    156     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
            // Compares actual against expected positions and tags, reporting differences through test.
    157     void             checkResults(const char *heading, RBBITest *test);
            // Reports a single break-position mismatch through test.
    158     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
            // Empties the two "actual" vectors; the expected data is left untouched.
    159     void             clearResults();
    160 };
    161 
    162 //
    163 // Constructor.
    164 //
    165 BITestData::BITestData(UErrorCode &status)
    166 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
    167   fActualTags(status)
    168 {
    169 }
    170 
    171 //
    172 // addDataChunk.   Add a section (non-breaking) piece of data to the test data.
    173 //                 The macro form collects the line number, which is helpful
    174 //                 when tracking down failures.
    175 //
    176 //                 A null data item is inserted at the start of each test's data
    177 //                  to put the starting zero into the data list.  The position saved for
    178 //                  each non-null item is its ending position.
    179 //
    180 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
    181 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
    182     if (U_FAILURE(status)) {return;}
    183     if (data != NULL) {
    184         fDataToBreak.append(CharsToUnicodeString(data));
    185     }
    186     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
    187     fExpectedTags.addElement(tag, status);
    188     fLineNum.addElement(lineNum, status);
    189 }
    190 
    191 
    192 //
    193 //  checkResults.   Compare the actual and expected break positions, report any differences.
    194 //
    195 void BITestData::checkResults(const char *heading, RBBITest *test) {
    196     int32_t   expectedIndex = 0;
    197     int32_t   actualIndex = 0;
    198 
    199     for (;;) {
    200         // If we've run through both the expected and actual results vectors, we're done.
    201         //   break out of the loop.
    202         if (expectedIndex >= fExpectedBreakPositions.size() &&
    203             actualIndex   >= fActualBreakPositions.size()) {
    204             break;
    205         }
    206 
    207 
    208         if (expectedIndex >= fExpectedBreakPositions.size()) {
    209             err(heading, test, expectedIndex-1, actualIndex);
    210             actualIndex++;
    211             continue;
    212         }
    213 
    214         if (actualIndex >= fActualBreakPositions.size()) {
    215             err(heading, test, expectedIndex, actualIndex-1);
    216             expectedIndex++;
    217             continue;
    218         }
    219 
    220         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
    221             err(heading, test, expectedIndex, actualIndex);
    222             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
    223             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
    224                 actualIndex++;
    225             } else {
    226                 expectedIndex++;
    227             }
    228             continue;
    229         }
    230 
    231         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
    232             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
    233                 heading, fLineNum.elementAt(expectedIndex),
    234                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
    235         }
    236 
    237         actualIndex++;
    238         expectedIndex++;
    239     }
    240 }
    241 
    242 //
    243 //  err   -  An error was found.  Report it, along with information about where the
    244 //                                incorrectly broken test data appeared in the source file.
    245 //
    246 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
    247 {
    248     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    249     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    250     int32_t   o        = 0;
    251     int32_t   line     = fLineNum.elementAti(expectedIdx);
    252     if (expectedIdx > 0) {
    253         // The line numbers are off by one because a premature break occurs somewhere
    254         //    within the previous item, rather than at the start of the current (expected) item.
    255         //    We want to report the offset of the unexpected break from the start of
    256         //      this previous item.
    257         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    258     }
    259     if (actual < expected) {
    260         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
    261     } else {
    262         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
    263     }
    264 }
    265 
    266 
    267 void BITestData::clearResults() {
    268     fActualBreakPositions.removeAllElements();
    269     fActualTags.removeAllElements();
    270 }
    271 
    272 
    273 //-----------------------------------------------------------------------------------
    274 //
    275 //    Canned Test Characters
    276 //
    277 //-----------------------------------------------------------------------------------
    278 
    279 static const UChar cannedTestArray[] = {
            // UTF-16 code units in ascending groups; the RBBITest constructor
            // builds a UnicodeString from this array.
    280     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    281     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    282     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    283     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    284     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    285     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    286     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    287     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
            // The final 0x0000 is the only zero in the array.
    288 };
    289 
        // File-level string: allocated by the RBBITest constructor as U+0000
        // followed by the canned characters, and deleted by the RBBITest destructor.
    290 static UnicodeString* cannedTestChars = 0;
    291 
    292 #define  halfNA     "\\u0928\\u094d\\u200d"   /* U+0928 + U+094D (virama) + U+200D (zero width joiner) */
    293 #define  halfSA     "\\u0938\\u094d\\u200d"   /* U+0938 + U+094D (virama) + U+200D (zero width joiner) */
    294 #define  halfCHA    "\\u091a\\u094d\\u200d"   /* U+091A + U+094D (virama) + U+200D (zero width joiner) */
    295 #define  halfKA     "\\u0915\\u094d\\u200d"   /* U+0915 + U+094D (virama) + U+200D (zero width joiner) */
    296 #define  deadTA     "\\u0924\\u094d"          /* U+0924 + U+094D (virama), no joiner */
    297 
    298 //--------------------------------------------------------------------------------------
    299 //
    300 //    RBBITest    constructor and destructor
    301 //
    302 //--------------------------------------------------------------------------------------
    303 
    304 RBBITest::RBBITest() {
    305     UnicodeString temp(cannedTestArray);
    306     cannedTestChars = new UnicodeString();
    307     *cannedTestChars += (UChar)0x0000;
    308     *cannedTestChars += temp;
    309 }
    310 
    311 
    312 RBBITest::~RBBITest() {
    313     delete cannedTestChars;
    314 }
    315 
    316 
    317 static const int T_NUMBER = 100;   // NOTE(review): these look like word-break rule status tag values (cf. UBRK_WORD_* in ubrk.h) - confirm
    318 static const int T_LETTER = 200;
    319 static const int T_H_OR_K = 300;   // TestJapaneseWordBreak expects the literal tag 300 for its hiragana chunks
    320 static const int T_IDEO   = 400;   // TestJapaneseWordBreak expects the literal tag 400 for its ideograph chunks
    321 
    322 
    323 
    324 
    325 
    326 
    327 //--------------------------------------------------------------------
    328 //Testing the BreakIterator for devanagari script
    329 //--------------------------------------------------------------------
    330 
    331 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
    332 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
    333 #define deadTTHA "\\u0920\\u094d"         /*U+0920 followed by the same virama (U+094D)*/
    334 #define deadPA   "\\u092a\\u094d"         /*U+092A followed by the same virama (U+094D)*/
    335 #define deadSA   "\\u0938\\u094d"         /*U+0938 followed by the same virama (U+094D)*/
    336 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
    337 
    338 
    339 
    340 
    341 
    342 
    343 //-----------------------------------------------------------------------------------
    344 //
    345 //   Test for status {tag} return value from break rules.
    346 //        TODO:  a more thorough test.
    347 //
    348 //-----------------------------------------------------------------------------------
    349 void RBBITest::TestStatusReturn() {
    350      UnicodeString rulesString1("$Letters = [:L:];\n"
    351                                   "$Numbers = [:N:];\n"
    352                                   "$Letters+{1};\n"
    353                                   "$Numbers+{2};\n"
    354                                   "Help\\ {4}/me\\!;\n"
    355                                   "[^$Letters $Numbers];\n"
    356                                   "!.*;\n", -1, US_INV);
    357      UnicodeString testString1  = "abc123..abc Help me Help me!";
    358                                 // 01234567890123456789012345678
    359      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
    360      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
    361 
    362      UErrorCode status=U_ZERO_ERROR;
    363      UParseError    parseError;
    364 
    365      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
    366      if(U_FAILURE(status)) {
    367          dataerrln("FAIL : in construction - %s", u_errorName(status));
    368      } else {
    369          int32_t  pos;
    370          int32_t  i = 0;
    371          bi->setText(testString1);
    372          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
    373              if (pos != bounds1[i]) {
    374                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
    375                  break;
    376              }
    377 
    378              int tag = bi->getRuleStatus();
    379              if (tag != brkStatus[i]) {
    380                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
    381                  break;
    382              }
    383              i++;
    384          }
    385      }
    386      delete bi;
    387 }
    388 
    389 
    390 static void printStringBreaks(UnicodeString ustr, int expected[],
    391                               int expectedcount)
    392 {
    393     UErrorCode status = U_ZERO_ERROR;
    394     char name[100];
    395     printf("code    alpha extend alphanum type word sent line name\n");
    396     int j;
    397     for (j = 0; j < ustr.length(); j ++) {
    398         if (expectedcount > 0) {
    399             int k;
    400             for (k = 0; k < expectedcount; k ++) {
    401                 if (j == expected[k]) {
    402                     printf("------------------------------------------------ %d\n",
    403                            j);
    404                 }
    405             }
    406         }
    407         UChar32 c = ustr.char32At(j);
    408         if (c > 0xffff) {
    409             j ++;
    410         }
    411         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
    412         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
    413                            u_isUAlphabetic(c),
    414                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
    415                            u_isalnum(c),
    416                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
    417                                                   u_charType(c),
    418                                                   U_SHORT_PROPERTY_NAME),
    419                            u_getPropertyValueName(UCHAR_WORD_BREAK,
    420                                                   u_getIntPropertyValue(c,
    421                                                           UCHAR_WORD_BREAK),
    422                                                   U_SHORT_PROPERTY_NAME),
    423                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
    424                                    u_getIntPropertyValue(c,
    425                                            UCHAR_SENTENCE_BREAK),
    426                                    U_SHORT_PROPERTY_NAME),
    427                            u_getPropertyValueName(UCHAR_LINE_BREAK,
    428                                    u_getIntPropertyValue(c,
    429                                            UCHAR_LINE_BREAK),
    430                                    U_SHORT_PROPERTY_NAME),
    431                            name);
    432     }
    433 }
    434 
    435 void RBBITest::TestThaiLineBreak() {
    436     UErrorCode status = U_ZERO_ERROR;
    437     BITestData thaiLineSelection(status);
    438 
    439     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
    440     // represents elided letters at the end of a long word.  It should be bound to
    441     // the end of the word and not treated as an independent punctuation mark.
    442 
    443 
    444     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    445     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
    446     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
    447     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
    448     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
    449 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
    450 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    451     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
    452     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
    453     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
    454     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
    455     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
    456     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
    457     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
    458     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
    459 
    460     // the one time where the paiyannoi occurs somewhere other than at the end
    461     // of a word is in the Thai abbrevation for "etc.", which both begins and
    462     // ends with a paiyannoi
    463     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
    464     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
    465     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
    466 
    467     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    468         Locale("th"), status);
    469     if (U_FAILURE(status))
    470     {
    471         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
    472         return;
    473     }
    474 
    475     generalIteratorTest(*e, thaiLineSelection);
    476     delete e;
    477 }
    478 
    479 
    480 
    481 void RBBITest::TestMixedThaiLineBreak()
    482 {
    483     UErrorCode   status = U_ZERO_ERROR;
    484     BITestData   thaiLineSelection(status);
    485 
    486     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    487 
    488 
    489     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
    490     // start
    491 
    492     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    493     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    494     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    495     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
    496     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    497     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
    498     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
    499     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
    500     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
    501     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
    502     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
    503     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    504     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    505     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
    506     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    507     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
    508 
    509     // @suwit - end of changes
    510 
    511 
    512     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    513     if (U_FAILURE(status))
    514     {
    515         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
    516         return;
    517     }
    518 
    519 
    520     generalIteratorTest(*e, thaiLineSelection);
    521     delete e;
    522 }
    523 
    524 
    525 void RBBITest::TestMaiyamok()
    526 {
    527     UErrorCode status = U_ZERO_ERROR;
    528     BITestData   thaiLineSelection(status);
    529     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
    530     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
    531     // word".  Instead of appearing as a word unto itself, however, it's kept together
    532     // with the word before it
    533     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
    534     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
    535     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
    536     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
    537     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
    538     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
    539     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
    540     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
    541     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
    542 
    543     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
    544         Locale("th"), status);
    545 
    546     if (U_FAILURE(status))
    547     {
    548         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
    549         return;
    550     }
    551     generalIteratorTest(*e, thaiLineSelection);
    552     delete e;
    553 }
    554 
    555 
    556 
    557 void RBBITest::TestBug3818() {
    558     UErrorCode  status = U_ZERO_ERROR;
    559 
    560     // Four Thai words...
    561     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
    562                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    563     UnicodeString  thaiStr(thaiWordData);
    564 
    565     RuleBasedBreakIterator* bi =
    566         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    567     if (U_FAILURE(status) || bi == NULL) {
    568         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
    569         return;
    570     }
    571     bi->setText(thaiStr);
    572 
    573     int32_t  startOfSecondWord = bi->following(1);
    574     if (startOfSecondWord != 4) {
    575         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    576             __FILE__, __LINE__, startOfSecondWord);
    577     }
    578     startOfSecondWord = bi->following(0);
    579     if (startOfSecondWord != 4) {
    580         errln("Fail at file %s, line %d expected start of word at 4, got %d",
    581             __FILE__, __LINE__, startOfSecondWord);
    582     }
    583     delete bi;
    584 }
    585 
    586 
    587 void RBBITest::TestJapaneseWordBreak() {
    588     UErrorCode status = U_ZERO_ERROR;
    589     BITestData   japaneseWordSelection(status);
    590 
    591     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
    592     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
    593     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
    594     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
    595     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
    596     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
    597     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
    598 
    599     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
    600         Locale("ja"), status);
    601     if (U_FAILURE(status))
    602     {
    603         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
    604         return;
    605     }
    606 
    607     generalIteratorTest(*e, japaneseWordSelection);
    608     delete e;
    609 }
    610 
    611 void RBBITest::TestTrieDict() {
    612     UErrorCode      status  = U_ZERO_ERROR;
    613 
    614     //
    615     //  Open and read the test data file.
    616     //
    617     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    618     char testFileName[1000];
    619     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
    620         errln("Can't open test data.  Path too long.");
    621         return;
    622     }
    623     strcpy(testFileName, testDataDirectory);
    624     strcat(testFileName, "riwords.txt");
    625 
    626     // Items needing deleting at the end
    627     MutableTrieDictionary *mutableDict = NULL;
    628     CompactTrieDictionary *compactDict = NULL;
    629     UnicodeSet            *breaks      = NULL;
    630     UChar                 *testFile    = NULL;
    631     StringEnumeration     *enumer1     = NULL;
    632     StringEnumeration     *enumer2     = NULL;
    633     MutableTrieDictionary *mutable2    = NULL;
    634     StringEnumeration     *cloneEnum   = NULL;
    635     CompactTrieDictionary *compact2    = NULL;
    636 
    637 
    638     const UnicodeString *originalWord = NULL;
    639     const UnicodeString *cloneWord    = NULL;
    640     UChar *current;
    641     UChar *word;
    642     UChar uc;
    643     int32_t wordLen;
    644     int32_t wordCount;
    645     int32_t testCount;
    646 
    647     int    len;
    648     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
    649     if (U_FAILURE(status)) {
    650         goto cleanup; /* something went wrong, error already output */
    651     }
    652 
    653     mutableDict = new MutableTrieDictionary(0x0E1C, status);
    654     if (U_FAILURE(status)) {
    655         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
    656         goto cleanup;
    657     }
    658 
    659     breaks = new UnicodeSet;
    660     breaks->add(0x000A);     // Line Feed
    661     breaks->add(0x000D);     // Carriage Return
    662     breaks->add(0x2028);     // Line Separator
    663     breaks->add(0x2029);     // Paragraph Separator
    664 
    665     // Now add each non-comment line of the file as a word.
    666     current = testFile;
    667     word = current;
    668     uc = *current++;
    669     wordLen = 0;
    670     wordCount = 0;
    671 
    672     while (uc) {
    673         if (uc == 0x0023) {     // #comment line, skip
    674             while (uc && !breaks->contains(uc)) {
    675                 uc = *current++;
    676             }
    677         }
    678         else while (uc && !breaks->contains(uc)) {
    679             ++wordLen;
    680             uc = *current++;
    681         }
    682         if (wordLen > 0) {
    683             mutableDict->addWord(word, wordLen, status);
    684             if (U_FAILURE(status)) {
    685                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
    686                 goto cleanup;
    687             }
    688             wordCount += 1;
    689         }
    690 
    691         // Find beginning of next line
    692         while (uc && breaks->contains(uc)) {
    693             uc = *current++;
    694         }
    695         word = current-1;
    696         wordLen = 0;
    697     }
    698 
    699     if (wordCount < 50) {
    700         errln("Word count (%d) unreasonably small\n", wordCount);
    701         goto cleanup;
    702     }
    703 
    704     enumer1 = mutableDict->openWords(status);
    705     if (U_FAILURE(status)) {
    706         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
    707         goto cleanup;
    708     }
    709 
    710     testCount = 0;
    711     if (wordCount != (testCount = enumer1->count(status))) {
    712         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    713             testCount, wordCount, u_errorName(status));
    714         goto cleanup;
    715     }
    716 
    717     // Now compact it
    718     compactDict = new CompactTrieDictionary(*mutableDict, status);
    719     if (U_FAILURE(status)) {
    720         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
    721         goto cleanup;
    722     }
    723 
    724     enumer2 = compactDict->openWords(status);
    725     if (U_FAILURE(status)) {
    726         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
    727         goto cleanup;
    728     }
    729 
    730     if (wordCount != (testCount = enumer2->count(status))) {
    731         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    732             testCount, wordCount, u_errorName(status));
    733         goto cleanup;
    734     }
    735 
    736     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
    737         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
    738     }
    739     delete enumer1;
    740     enumer1 = NULL;
    741     delete enumer2;
    742     enumer2 = NULL;
    743 
    744     // Now un-compact it
    745     mutable2 = compactDict->cloneMutable(status);
    746     if (U_FAILURE(status)) {
    747         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
    748         goto cleanup;
    749     }
    750 
    751     cloneEnum = mutable2->openWords(status);
    752     if (U_FAILURE(status)) {
    753         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
    754         goto cleanup;
    755     }
    756 
    757     if (wordCount != (testCount = cloneEnum->count(status))) {
    758         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
    759             testCount, wordCount, u_errorName(status));
    760         goto cleanup;
    761     }
    762 
    763     // Compact original dictionary to clone. Note that we can only compare the same kind of
    764     // dictionary as the order of the enumerators is not guaranteed to be the same between
    765     // different kinds
    766     enumer1 = mutableDict->openWords(status);
    767     if (U_FAILURE(status)) {
    768         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
    769         goto cleanup;
    770      }
    771 
    772     originalWord = enumer1->snext(status);
    773     cloneWord = cloneEnum->snext(status);
    774     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
    775         if (*originalWord != *cloneWord) {
    776             errln("Original and cloned MutableTrieDictionary word mismatch\n");
    777             goto cleanup;
    778         }
    779         originalWord = enumer1->snext(status);
    780         cloneWord = cloneEnum->snext(status);
    781     }
    782 
    783     if (U_FAILURE(status)) {
    784         errln("Enumeration failed: %s\n", u_errorName(status));
    785         goto cleanup;
    786     }
    787 
    788     if (originalWord != cloneWord) {
    789         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
    790         goto cleanup;
    791     }
    792 
    793     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
    794     compact2 = new CompactTrieDictionary(compactDict->data(), status);
    795     if (U_FAILURE(status)) {
    796         errln("CompactTrieDictionary(const void *,...) failed\n");
    797         goto cleanup;
    798     }
    799 
    800     if (compact2->dataSize() == 0) {
    801         errln("CompactTrieDictionary->dataSize() == 0\n");
    802         goto cleanup;
    803     }
    804 
    805     // Now count the words via the second dictionary
    806     delete enumer1;
    807     enumer1 = compact2->openWords(status);
    808     if (U_FAILURE(status)) {
    809         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
    810         goto cleanup;
    811     }
    812 
    813     if (wordCount != (testCount = enumer1->count(status))) {
    814         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
    815             testCount, wordCount, u_errorName(status));
    816         goto cleanup;
    817     }
    818 
    819 cleanup:
    820     delete compactDict;
    821     delete mutableDict;
    822     delete breaks;
    823     delete[] testFile;
    824     delete enumer1;
    825     delete mutable2;
    826     delete cloneEnum;
    827     delete compact2;
    828 }
    829 
    830 
    831 //----------------------------------------------------------------------------
    832 //
    833 // generalIteratorTest      Given a break iterator and a set of test data,
    834 //                          Run the tests and report the results.
    835 //
    836 //----------------------------------------------------------------------------
    837 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
    838 {
    839 
    840     bi.setText(td.fDataToBreak);
    841 
    842     testFirstAndNext(bi, td);
    843 
    844     testLastAndPrevious(bi, td);
    845 
    846     testFollowing(bi, td);
    847     testPreceding(bi, td);
    848     testIsBoundary(bi, td);
    849     doMultipleSelectionTest(bi, td);
    850 }
    851 
    852 
    853 //
    854 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
    855 //                       kind of loop.
    856 //
    857 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
    858 {
    859     UErrorCode  status = U_ZERO_ERROR;
    860     int32_t     p;
    861     int32_t     lastP = -1;
    862     int32_t     tag;
    863 
    864     logln("Test first and next");
    865     bi.setText(td.fDataToBreak);
    866     td.clearResults();
    867 
    868     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
    869         td.fActualBreakPositions.addElement(p, status);  // Save result.
    870         tag = bi.getRuleStatus();
    871         td.fActualTags.addElement(tag, status);
    872         if (p <= lastP) {
    873             // If the iterator is not making forward progress, stop.
    874             //  No need to raise an error here, it'll be detected in the normal check of results.
    875             break;
    876         }
    877         lastP = p;
    878     }
    879     td.checkResults("testFirstAndNext", this);
    880 }
    881 
    882 
    883 //
    884 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
    885 //
    886 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
    887 {
    888     UErrorCode  status = U_ZERO_ERROR;
    889     int32_t     p;
    890     int32_t     lastP  = 0x7ffffffe;
    891     int32_t     tag;
    892 
    893     logln("Test last and previous");
    894     bi.setText(td.fDataToBreak);
    895     td.clearResults();
    896 
    897     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
    898         // Save break position.  Insert it at start of vector of results, shoving
    899         //    already-saved results further towards the end.
    900         td.fActualBreakPositions.insertElementAt(p, 0, status);
    901         // bi.previous();   // TODO:  Why does this fix things up????
    902         // bi.next();
    903         tag = bi.getRuleStatus();
    904         td.fActualTags.insertElementAt(tag, 0, status);
    905         if (p >= lastP) {
    906             // If the iterator is not making progress, stop.
    907             //  No need to raise an error here, it'll be detected in the normal check of results.
    908             break;
    909         }
    910         lastP = p;
    911     }
    912     td.checkResults("testLastAndPrevious", this);
    913 }
    914 
    915 
    916 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
    917 {
    918     UErrorCode  status = U_ZERO_ERROR;
    919     int32_t     p;
    920     int32_t     tag;
    921     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
    922                                  //   cannot be -1; that is returned for DONE.
    923     int         i;
    924 
    925     logln("testFollowing():");
    926     bi.setText(td.fDataToBreak);
    927     td.clearResults();
    928 
    929     // Save the starting point, since we won't get that out of following.
    930     p = bi.first();
    931     td.fActualBreakPositions.addElement(p, status);  // Save result.
    932     tag = bi.getRuleStatus();
    933     td.fActualTags.addElement(tag, status);
    934 
    935     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
    936         p = bi.following(i);
    937         if (p != lastP) {
    938             if (p == RuleBasedBreakIterator::DONE) {
    939                 break;
    940             }
    941             // We've reached a new break position.  Save it.
    942             td.fActualBreakPositions.addElement(p, status);  // Save result.
    943             tag = bi.getRuleStatus();
    944             td.fActualTags.addElement(tag, status);
    945             lastP = p;
    946         }
    947     }
    948     // The loop normally exits by means of the break in the middle.
    949     // Make sure that the index was at the correct position for the break iterator to have
    950     //   returned DONE.
    951     if (i != td.fDataToBreak.length()) {
    952         errln("testFollowing():  iterator returned DONE prematurely.");
    953     }
    954 
    955     // Full check of all results.
    956     td.checkResults("testFollowing", this);
    957 }
    958 
    959 
    960 
    961 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
    962     UErrorCode  status = U_ZERO_ERROR;
    963     int32_t     p;
    964     int32_t     tag;
    965     int32_t     lastP  = 0x7ffffffe;
    966     int         i;
    967 
    968     logln("testPreceding():");
    969     bi.setText(td.fDataToBreak);
    970     td.clearResults();
    971 
    972     p = bi.last();
    973     td.fActualBreakPositions.addElement(p, status);
    974     tag = bi.getRuleStatus();
    975     td.fActualTags.addElement(tag, status);
    976 
    977     for (i = td.fDataToBreak.length(); i>=-1; i--) {
    978         p = bi.preceding(i);
    979         if (p != lastP) {
    980             if (p == RuleBasedBreakIterator::DONE) {
    981                 break;
    982             }
    983             // We've reached a new break position.  Save it.
    984             td.fActualBreakPositions.insertElementAt(p, 0, status);
    985             lastP = p;
    986             tag = bi.getRuleStatus();
    987             td.fActualTags.insertElementAt(tag, 0, status);
    988         }
    989     }
    990     // The loop normally exits by means of the break in the middle.
    991     // Make sure that the index was at the correct position for the break iterator to have
    992     //   returned DONE.
    993     if (i != 0) {
    994         errln("testPreceding():  iterator returned DONE prematurely.");
    995     }
    996 
    997     // Full check of all results.
    998     td.checkResults("testPreceding", this);
    999 }
   1000 
   1001 
   1002 
   1003 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
   1004     UErrorCode  status = U_ZERO_ERROR;
   1005     int         i;
   1006     int32_t     tag;
   1007 
   1008     logln("testIsBoundary():");
   1009     bi.setText(td.fDataToBreak);
   1010     td.clearResults();
   1011 
   1012     for (i = 0; i <= td.fDataToBreak.length(); i++) {
   1013         if (bi.isBoundary(i)) {
   1014             td.fActualBreakPositions.addElement(i, status);  // Save result.
   1015             tag = bi.getRuleStatus();
   1016             td.fActualTags.addElement(tag, status);
   1017         }
   1018     }
   1019     td.checkResults("testIsBoundary: ", this);
   1020 }
   1021 
   1022 
   1023 
   1024 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
   1025 {
   1026     iterator.setText(td.fDataToBreak);
   1027 
   1028     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
   1029     int32_t offset = iterator.first();
   1030     int32_t testOffset;
   1031     int32_t count = 0;
   1032 
   1033     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
   1034 
   1035     if (*testIterator != iterator)
   1036         errln("clone() or operator!= failed: two clones compared unequal");
   1037 
   1038     do {
   1039         testOffset = testIterator->first();
   1040         testOffset = testIterator->next(count);
   1041         if (offset != testOffset)
   1042             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1043 
   1044         if (offset != RuleBasedBreakIterator::DONE) {
   1045             count++;
   1046             offset = iterator.next();
   1047 
   1048             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
   1049                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
   1050                 if (count > 10000 || offset == -1) {
   1051                     errln("operator== failed too many times. Stopping test.");
   1052                     if (offset == -1) {
   1053                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
   1054                     }
   1055                     return;
   1056                 }
   1057             }
   1058         }
   1059     } while (offset != RuleBasedBreakIterator::DONE);
   1060 
   1061     // now do it backwards...
   1062     offset = iterator.last();
   1063     count = 0;
   1064 
   1065     do {
   1066         testOffset = testIterator->last();
   1067         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
   1068         if (offset != testOffset)
   1069             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
   1070 
   1071         if (offset != RuleBasedBreakIterator::DONE) {
   1072             count--;
   1073             offset = iterator.previous();
   1074         }
   1075     } while (offset != RuleBasedBreakIterator::DONE);
   1076 
   1077     delete testIterator;
   1078 }
   1079 
   1080 
   1081 //---------------------------------------------
   1082 //
   1083 //     other tests
   1084 //
   1085 //---------------------------------------------
   1086 void RBBITest::TestEmptyString()
   1087 {
   1088     UnicodeString text = "";
   1089     UErrorCode status = U_ZERO_ERROR;
   1090 
   1091     BITestData x(status);
   1092     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
   1093     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   1094     if (U_FAILURE(status))
   1095     {
   1096         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
   1097         return;
   1098     }
   1099     generalIteratorTest(*bi, x);
   1100     delete bi;
   1101 }
   1102 
   1103 void RBBITest::TestGetAvailableLocales()
   1104 {
   1105     int32_t locCount = 0;
   1106     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
   1107 
   1108     if (locCount == 0)
   1109         dataerrln("getAvailableLocales() returned an empty list!");
   1110     // Just make sure that it's returning good memory.
   1111     int32_t i;
   1112     for (i = 0; i < locCount; ++i) {
   1113         logln(locList[i].getName());
   1114     }
   1115 }
   1116 
   1117 //Testing the BreakIterator::getDisplayName() function
   1118 void RBBITest::TestGetDisplayName()
   1119 {
   1120     UnicodeString   result;
   1121 
   1122     BreakIterator::getDisplayName(Locale::getUS(), result);
   1123     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
   1124         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
   1125                 + result);
   1126 
   1127     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
   1128     if (result != "French (France)")
   1129         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
   1130                 + result);
   1131 }
   1132 /**
   1133  * Test End Behaviour
   1134  * @bug 4068137
   1135  */
   1136 void RBBITest::TestEndBehaviour()
   1137 {
   1138     UErrorCode status = U_ZERO_ERROR;
   1139     UnicodeString testString("boo.");
   1140     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1141     if (U_FAILURE(status))
   1142     {
   1143         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
   1144         return;
   1145     }
   1146     wb->setText(testString);
   1147 
   1148     if (wb->first() != 0)
   1149         errln("Didn't get break at beginning of string.");
   1150     if (wb->next() != 3)
   1151         errln("Didn't get break before period in \"boo.\"");
   1152     if (wb->current() != 4 && wb->next() != 4)
   1153         errln("Didn't get break at end of string.");
   1154     delete wb;
   1155 }
   1156 /*
   1157  * @bug 4153072
   1158  */
   1159 void RBBITest::TestBug4153072() {
   1160     UErrorCode status = U_ZERO_ERROR;
   1161     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
   1162     if (U_FAILURE(status))
   1163     {
   1164         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
   1165         return;
   1166     }
   1167     UnicodeString str("...Hello, World!...");
   1168     int32_t begin = 3;
   1169     int32_t end = str.length() - 3;
   1170     UBool onBoundary;
   1171 
   1172     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
   1173     iter->adoptText(textIterator);
   1174     int index;
   1175     // Note: with the switch to UText, there is no way to restrict the
   1176     //       iteration range to begin at an index other than zero.
   1177     //       String character iterators created with a non-zero bound are
   1178     //         treated by RBBI as being empty.
   1179     for (index = -1; index < begin + 1; ++index) {
   1180         onBoundary = iter->isBoundary(index);
   1181         if (index == 0?  !onBoundary : onBoundary) {
   1182             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
   1183                             " and begin index = " + begin);
   1184         }
   1185     }
   1186     delete iter;
   1187 }
   1188 
   1189 
   1190 //
   1191 // Test for problem reported by Ashok Matoria on 9 July 2007
   1192 //    One.<kSoftHyphen><kSpace>Two.
   1193 //
   1194 //    Sentence break at start (0) and then on calling next() it breaks at
   1195 //   'T' of "Two". Now, at this point if I do next() and
   1196 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
   1197 //
   1198 void RBBITest::TestBug5775() {
   1199     UErrorCode status = U_ZERO_ERROR;
   1200     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
   1201     TEST_ASSERT_SUCCESS(status);
   1202     if (U_FAILURE(status)) {
   1203         return;
   1204     }
   1205 // Check for status first for better handling of no data errors.
   1206     TEST_ASSERT(bi != NULL);
   1207     if (bi == NULL) {
   1208         return;
   1209     }
   1210 
   1211     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
   1212     //               01234      56789
   1213     s = s.unescape();
   1214     bi->setText(s);
   1215     int pos = bi->next();
   1216     TEST_ASSERT(pos == 6);
   1217     pos = bi->next();
   1218     TEST_ASSERT(pos == 10);
   1219     pos = bi->previous();
   1220     TEST_ASSERT(pos == 6);
   1221     delete bi;
   1222 }
   1223 
   1224 
   1225 
   1226 /**
   1227  * Test Japanese Line Break
   1228  * @bug 4095322
   1229  */
   1230 void RBBITest::TestJapaneseLineBreak()
   1231 {
   1232 #if 0
   1233     // Test needs updating some more...   Dump it for now.
   1234 
   1235 
   1236     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
   1237     //        as opening and closing punctuation for line breaking.
   1238     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
   1239     //        from these tests.    6-13-2002
   1240     //
   1241     UErrorCode status = U_ZERO_ERROR;
   1242     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
   1243     UnicodeString precedingChars = CharsToUnicodeString(
   1244         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
   1245         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
   1246     UnicodeString followingChars = CharsToUnicodeString(
   1247         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
   1248         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
   1249         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
   1250         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
   1251         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
   1252     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
   1253 
   1254     int32_t i;
   1255     if (U_FAILURE(status))
   1256     {
   1257         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
   1258         return;
   1259     }
   1260 
   1261     for (i = 0; i < precedingChars.length(); i++) {
   1262         testString.setCharAt(1, precedingChars[i]);
   1263         iter->setText(testString);
   1264         int32_t j = iter->first();
   1265         if (j != 0)
   1266             errln("ja line break failure: failed to start at 0");
   1267         j = iter->next();
   1268         if (j != 1)
   1269             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
   1270                         + "' (" + ((int)(precedingChars[i])) + ")");
   1271         j = iter->next();
   1272         if (j != 3)
   1273             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
   1274                         + "' (" + ((int)(precedingChars[i])) + ")");
   1275     }
   1276 
   1277     for (i = 0; i < followingChars.length(); i++) {
   1278         testString.setCharAt(1, followingChars[i]);
   1279         iter->setText(testString);
   1280         int j = iter->first();
   1281         if (j != 0)
   1282             errln("ja line break failure: failed to start at 0");
   1283         j = iter->next();
   1284         if (j != 2)
   1285             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
   1286                         + "' (" + ((int)(followingChars[i])) + ")");
   1287         j = iter->next();
   1288         if (j != 3)
   1289             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
   1290                         + "' (" + ((int)(followingChars[i])) + ")");
   1291     }
   1292     delete iter;
   1293 #endif
   1294 }
   1295 
   1296 
   1297 //------------------------------------------------------------------------------
   1298 //
   1299 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
   1300 //
   1301 //------------------------------------------------------------------------------
   1302 
   1303 struct TestParams {
   1304     BreakIterator   *bi;
   1305     UnicodeString    dataToBreak;
   1306     UVector32       *expectedBreaks;
   1307     UVector32       *srcLine;
   1308     UVector32       *srcCol;
   1309 };
   1310 
   1311 void RBBITest::executeTest(TestParams *t) {
   1312     int32_t    bp;
   1313     int32_t    prevBP;
   1314     int32_t    i;
   1315 
   1316     if (t->bi == NULL) {
   1317         return;
   1318     }
   1319 
   1320     t->bi->setText(t->dataToBreak);
   1321     //
   1322     //  Run the iterator forward
   1323     //
   1324     prevBP = -1;
   1325     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
   1326         if (prevBP ==  bp) {
   1327             // Fail for lack of forward progress.
   1328             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1329                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1330             break;
   1331         }
   1332 
   1333         // Check that there were we didn't miss an expected break between the last one
   1334         //  and this one.
   1335         for (i=prevBP+1; i<bp; i++) {
   1336             if (t->expectedBreaks->elementAti(i) != 0) {
   1337                 int expected[] = {0, i};
   1338                 printStringBreaks(t->dataToBreak, expected, 2);
   1339                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1340                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1341             }
   1342         }
   1343 
   1344         // Check that the break we did find was expected
   1345         if (t->expectedBreaks->elementAti(bp) == 0) {
   1346             int expected[] = {0, bp};
   1347             printStringBreaks(t->dataToBreak, expected, 2);
   1348             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1349                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1350         } else {
   1351             // The break was expected.
   1352             //   Check that the {nnn} tag value is correct.
   1353             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1354             if (expectedTagVal == -1) {
   1355                 expectedTagVal = 0;
   1356             }
   1357             int32_t line = t->srcLine->elementAti(bp);
   1358             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1359             if (rs != expectedTagVal) {
   1360                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1361                       "          Actual, Expected status = %4d, %4d",
   1362                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1363             }
   1364         }
   1365 
   1366 
   1367         prevBP = bp;
   1368     }
   1369 
   1370     // Verify that there were no missed expected breaks after the last one found
   1371     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
   1372         if (t->expectedBreaks->elementAti(i) != 0) {
   1373             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1374                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1375         }
   1376     }
   1377 
   1378     //
   1379     //  Run the iterator backwards, verify that the same breaks are found.
   1380     //
   1381     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
   1382     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
   1383         if (prevBP ==  bp) {
   1384             // Fail for lack of progress.
   1385             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
   1386                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1387             break;
   1388         }
   1389 
   1390         // Check that there were we didn't miss an expected break between the last one
   1391         //  and this one.  (UVector returns zeros for index out of bounds.)
   1392         for (i=prevBP-1; i>bp; i--) {
   1393             if (t->expectedBreaks->elementAti(i) != 0) {
   1394                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1395                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1396             }
   1397         }
   1398 
   1399         // Check that the break we did find was expected
   1400         if (t->expectedBreaks->elementAti(bp) == 0) {
   1401             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
   1402                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
   1403         } else {
   1404             // The break was expected.
   1405             //   Check that the {nnn} tag value is correct.
   1406             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
   1407             if (expectedTagVal == -1) {
   1408                 expectedTagVal = 0;
   1409             }
   1410             int line = t->srcLine->elementAti(bp);
   1411             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
   1412             if (rs != expectedTagVal) {
   1413                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
   1414                       "          Actual, Expected status = %4d, %4d",
   1415                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
   1416             }
   1417         }
   1418 
   1419         prevBP = bp;
   1420     }
   1421 
   1422     // Verify that there were no missed breaks prior to the last one found
   1423     for (i=prevBP-1; i>=0; i--) {
   1424         if (t->expectedBreaks->elementAti(i) != 0) {
   1425             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
   1426                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
   1427         }
   1428     }
   1429 }
   1430 
   1431 
   1432 void RBBITest::TestExtended() {
   1433 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   1434     UErrorCode      status  = U_ZERO_ERROR;
   1435     Locale          locale("");
   1436 
   1437     UnicodeString       rules;
   1438     TestParams          tp;
   1439     tp.bi             = NULL;
   1440     tp.expectedBreaks = new UVector32(status);
   1441     tp.srcLine        = new UVector32(status);
   1442     tp.srcCol         = new UVector32(status);
   1443 
   1444     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
   1445     if (U_FAILURE(status)) {
   1446         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
   1447     }
   1448 
   1449 
   1450     //
   1451     //  Open and read the test data file.
   1452     //
   1453     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   1454     char testFileName[1000];
   1455     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   1456         errln("Can't open test data.  Path too long.");
   1457         return;
   1458     }
   1459     strcpy(testFileName, testDataDirectory);
   1460     strcat(testFileName, "rbbitst.txt");
   1461 
   1462     int    len;
   1463     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   1464     if (U_FAILURE(status)) {
   1465         return; /* something went wrong, error already output */
   1466     }
   1467 
   1468 
   1469 
   1470 
   1471     //
   1472     //  Put the test data into a UnicodeString
   1473     //
   1474     UnicodeString testString(FALSE, testFile, len);
   1475 
   1476     enum EParseState{
   1477         PARSE_COMMENT,
   1478         PARSE_TAG,
   1479         PARSE_DATA,
   1480         PARSE_NUM
   1481     }
   1482     parseState = PARSE_TAG;
   1483 
   1484     EParseState savedState = PARSE_TAG;
   1485 
   1486     static const UChar CH_LF        = 0x0a;
   1487     static const UChar CH_CR        = 0x0d;
   1488     static const UChar CH_HASH      = 0x23;
   1489     /*static const UChar CH_PERIOD    = 0x2e;*/
   1490     static const UChar CH_LT        = 0x3c;
   1491     static const UChar CH_GT        = 0x3e;
   1492     static const UChar CH_BACKSLASH = 0x5c;
   1493     static const UChar CH_BULLET    = 0x2022;
   1494 
   1495     int32_t    lineNum  = 1;
   1496     int32_t    colStart = 0;
   1497     int32_t    column   = 0;
   1498     int32_t    charIdx  = 0;
   1499 
   1500     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
   1501 
   1502     for (charIdx = 0; charIdx < len; ) {
   1503         status = U_ZERO_ERROR;
   1504         UChar  c = testString.charAt(charIdx);
   1505         charIdx++;
   1506         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
   1507             // treat CRLF as a unit
   1508             c = CH_LF;
   1509             charIdx++;
   1510         }
   1511         if (c == CH_LF || c == CH_CR) {
   1512             lineNum++;
   1513             colStart = charIdx;
   1514         }
   1515         column = charIdx - colStart + 1;
   1516 
   1517         switch (parseState) {
   1518         case PARSE_COMMENT:
   1519             if (c == 0x0a || c == 0x0d) {
   1520                 parseState = savedState;
   1521             }
   1522             break;
   1523 
   1524         case PARSE_TAG:
   1525             {
   1526             if (c == CH_HASH) {
   1527                 parseState = PARSE_COMMENT;
   1528                 savedState = PARSE_TAG;
   1529                 break;
   1530             }
   1531             if (u_isUWhiteSpace(c)) {
   1532                 break;
   1533             }
   1534             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
   1535                 delete tp.bi;
   1536                 tp.bi = BreakIterator::createWordInstance(locale,  status);
   1537                 charIdx += 5;
   1538                 break;
   1539             }
   1540             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
   1541                 delete tp.bi;
   1542                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
   1543                 charIdx += 5;
   1544                 break;
   1545             }
   1546             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
   1547                 delete tp.bi;
   1548                 tp.bi = BreakIterator::createLineInstance(locale,  status);
   1549                 charIdx += 5;
   1550                 break;
   1551             }
   1552             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
   1553                 delete tp.bi;
   1554                 tp.bi = NULL;
   1555                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
   1556                 charIdx += 5;
   1557                 break;
   1558             }
   1559             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
   1560                 delete tp.bi;
   1561                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
   1562                 charIdx += 6;
   1563                 break;
   1564             }
   1565 
   1566             // <locale  loc_name>
   1567             localeMatcher.reset(testString);
   1568             if (localeMatcher.lookingAt(charIdx-1, status)) {
   1569                 UnicodeString localeName = localeMatcher.group(1, status);
   1570                 char localeName8[100];
   1571                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
   1572                 locale = Locale::createFromName(localeName8);
   1573                 charIdx += localeMatcher.group(0, status).length();
   1574                 TEST_ASSERT_SUCCESS(status);
   1575                 break;
   1576             }
   1577             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
   1578                 parseState = PARSE_DATA;
   1579                 charIdx += 5;
   1580                 tp.dataToBreak = "";
   1581                 tp.expectedBreaks->removeAllElements();
   1582                 tp.srcCol ->removeAllElements();
   1583                 tp.srcLine->removeAllElements();
   1584                 break;
   1585             }
   1586 
   1587             errln("line %d: Tag expected in test file.", lineNum);
   1588             parseState = PARSE_COMMENT;
   1589             savedState = PARSE_DATA;
   1590             goto end_test; // Stop the test.
   1591             }
   1592             break;
   1593 
   1594         case PARSE_DATA:
   1595             if (c == CH_BULLET) {
   1596                 int32_t  breakIdx = tp.dataToBreak.length();
   1597                 tp.expectedBreaks->setSize(breakIdx+1);
   1598                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1599                 tp.srcLine->setSize(breakIdx+1);
   1600                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1601                 tp.srcCol ->setSize(breakIdx+1);
   1602                 tp.srcCol ->setElementAt(column, breakIdx);
   1603                 break;
   1604             }
   1605 
   1606             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
   1607                 // Add final entry to mappings from break location to source file position.
   1608                 //  Need one extra because last break position returned is after the
   1609                 //    last char in the data, not at the last char.
   1610                 tp.srcLine->addElement(lineNum, status);
   1611                 tp.srcCol ->addElement(column, status);
   1612 
   1613                 parseState = PARSE_TAG;
   1614                 charIdx += 6;
   1615 
   1616                 // RUN THE TEST!
   1617                 executeTest(&tp);
   1618                 break;
   1619             }
   1620 
   1621             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
   1622                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
   1623                 // Get the code point from the name and insert it into the test data.
   1624                 //   (Damn, no API takes names in Unicode  !!!
   1625                 //    we've got to take it back to char *)
   1626                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
   1627                 int32_t nameLength = nameEndIdx - (charIdx+2);
   1628                 char charNameBuf[200];
   1629                 UChar32 theChar = -1;
   1630                 if (nameEndIdx != -1) {
   1631                     UErrorCode status = U_ZERO_ERROR;
   1632                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
   1633                     charNameBuf[sizeof(charNameBuf)-1] = 0;
   1634                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
   1635                     if (U_FAILURE(status)) {
   1636                         theChar = -1;
   1637                     }
   1638                 }
   1639                 if (theChar == -1) {
   1640                     errln("Error in named character in test file at line %d, col %d",
   1641                         lineNum, column);
   1642                 } else {
   1643                     // Named code point was recognized.  Insert it
   1644                     //   into the test data.
   1645                     tp.dataToBreak.append(theChar);
   1646                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1647                         tp.srcLine->addElement(lineNum, status);
   1648                         tp.srcCol ->addElement(column, status);
   1649                     }
   1650                 }
   1651                 if (nameEndIdx > charIdx) {
   1652                     charIdx = nameEndIdx+1;
   1653 
   1654                 }
   1655                 break;
   1656             }
   1657 
   1658 
   1659 
   1660 
   1661             if (testString.compare(charIdx-1, 2, "<>") == 0) {
   1662                 charIdx++;
   1663                 int32_t  breakIdx = tp.dataToBreak.length();
   1664                 tp.expectedBreaks->setSize(breakIdx+1);
   1665                 tp.expectedBreaks->setElementAt(-1, breakIdx);
   1666                 tp.srcLine->setSize(breakIdx+1);
   1667                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1668                 tp.srcCol ->setSize(breakIdx+1);
   1669                 tp.srcCol ->setElementAt(column, breakIdx);
   1670                 break;
   1671             }
   1672 
   1673             if (c == CH_LT) {
   1674                 tagValue   = 0;
   1675                 parseState = PARSE_NUM;
   1676                 break;
   1677             }
   1678 
   1679             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
   1680                 parseState = PARSE_COMMENT;
   1681                 savedState = PARSE_DATA;
   1682                 break;
   1683             }
   1684 
   1685             if (c == CH_BACKSLASH) {
   1686                 // Check for \ at end of line, a line continuation.
   1687                 //     Advance over (discard) the newline
   1688                 UChar32 cp = testString.char32At(charIdx);
   1689                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
   1690                     // We have a CR LF
   1691                     //  Need an extra increment of the input ptr to move over both of them
   1692                     charIdx++;
   1693                 }
   1694                 if (cp == CH_LF || cp == CH_CR) {
   1695                     lineNum++;
   1696                     colStart = charIdx;
   1697                     charIdx++;
   1698                     break;
   1699                 }
   1700 
   1701                 // Let unescape handle the back slash.
   1702                 cp = testString.unescapeAt(charIdx);
   1703                 if (cp != -1) {
   1704                     // Escape sequence was recognized.  Insert the char
   1705                     //   into the test data.
   1706                     tp.dataToBreak.append(cp);
   1707                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
   1708                         tp.srcLine->addElement(lineNum, status);
   1709                         tp.srcCol ->addElement(column, status);
   1710                     }
   1711                     break;
   1712                 }
   1713 
   1714 
   1715                 // Not a recognized backslash escape sequence.
   1716                 // Take the next char as a literal.
   1717                 //  TODO:  Should this be an error?
   1718                 c = testString.charAt(charIdx);
   1719                 charIdx = testString.moveIndex32(charIdx, 1);
   1720             }
   1721 
   1722             // Normal, non-escaped data char.
   1723             tp.dataToBreak.append(c);
   1724 
   1725             // Save the mapping from offset in the data to line/column numbers in
   1726             //   the original input file.  Will be used for better error messages only.
   1727             //   If there's an expected break before this char, the slot in the mapping
   1728             //     vector will already be set for this char; don't overwrite it.
   1729             if (tp.dataToBreak.length() > tp.srcLine->size()) {
   1730                 tp.srcLine->addElement(lineNum, status);
   1731                 tp.srcCol ->addElement(column, status);
   1732             }
   1733             break;
   1734 
   1735 
   1736         case PARSE_NUM:
   1737             // We are parsing an expected numeric tag value, like <1234>,
   1738             //   within a chunk of data.
   1739             if (u_isUWhiteSpace(c)) {
   1740                 break;
   1741             }
   1742 
   1743             if (c == CH_GT) {
   1744                 // Finished the number.  Add the info to the expected break data,
   1745                 //   and switch parse state back to doing plain data.
   1746                 parseState = PARSE_DATA;
   1747                 if (tagValue == 0) {
   1748                     tagValue = -1;
   1749                 }
   1750                 int32_t  breakIdx = tp.dataToBreak.length();
   1751                 tp.expectedBreaks->setSize(breakIdx+1);
   1752                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
   1753                 tp.srcLine->setSize(breakIdx+1);
   1754                 tp.srcLine->setElementAt(lineNum, breakIdx);
   1755                 tp.srcCol ->setSize(breakIdx+1);
   1756                 tp.srcCol ->setElementAt(column, breakIdx);
   1757                 break;
   1758             }
   1759 
   1760             if (u_isdigit(c)) {
   1761                 tagValue = tagValue*10 + u_charDigitValue(c);
   1762                 break;
   1763             }
   1764 
   1765             errln("Syntax Error in test file at line %d, col %d",
   1766                 lineNum, column);
   1767             parseState = PARSE_COMMENT;
   1768             goto end_test; // Stop the test
   1769             break;
   1770         }
   1771 
   1772 
   1773         if (U_FAILURE(status)) {
   1774             errln("ICU Error %s while parsing test file at line %d.",
   1775                 u_errorName(status), lineNum);
   1776             status = U_ZERO_ERROR;
   1777             goto end_test; // Stop the test
   1778         }
   1779 
   1780     }
   1781 
   1782 end_test:
   1783     delete tp.bi;
   1784     delete tp.expectedBreaks;
   1785     delete tp.srcLine;
   1786     delete tp.srcCol;
   1787     delete [] testFile;
   1788 #endif
   1789 }
   1790 
   1791 void RBBITest::TestThaiBreaks() {
   1792     UErrorCode status=U_ZERO_ERROR;
   1793     BreakIterator* b;
   1794     Locale locale = Locale("th");
   1795     int32_t p, index;
   1796     UChar c[]= {
   1797             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
   1798             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
   1799             0x0E16, 0x0E49, 0x0E33
   1800     };
   1801     int32_t expectedWordResult[] = {
   1802             2, 3, 6, 10, 11, 15, 17, 20, 22
   1803     };
   1804     int32_t expectedLineResult[] = {
   1805             3, 6, 11, 15, 17, 20, 22
   1806     };
   1807     int32_t size = sizeof(c)/sizeof(UChar);
   1808     UnicodeString text=UnicodeString(c);
   1809 
   1810     b = BreakIterator::createWordInstance(locale, status);
   1811     if (U_FAILURE(status)) {
   1812         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
   1813         return;
   1814     }
   1815     b->setText(text);
   1816     p = index = 0;
   1817     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1818         if (p != expectedWordResult[index++]) {
   1819             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
   1820         }
   1821     }
   1822     delete b;
   1823 
   1824     b = BreakIterator::createLineInstance(locale, status);
   1825     if (U_FAILURE(status)) {
   1826         printf("Unable to create thai line break iterator.\n");
   1827         return;
   1828     }
   1829     b->setText(text);
   1830     p = index = 0;
   1831     while ((p=b->next())!=BreakIterator::DONE && p < size) {
   1832         if (p != expectedLineResult[index++]) {
   1833             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
   1834         }
   1835     }
   1836 
   1837     delete b;
   1838 }
   1839 
   1840 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
   1841 // Words don't include colon or period (cldrbug #1969).
   1842 static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
   1843 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
   1844 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
   1845 
   1846 // UBreakIteratorType UBRK_WORD, Locale "ja"
   1847 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   1848 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   1849                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
   1850 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   1851 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   1852 
   1853 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   1854 // Add break after Greek question mark (cldrbug #2069).
   1855 static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
   1856                                         "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
   1857 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
   1858 static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
   1859 
   1860 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
   1861 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
   1862 static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
   1863                                         "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
   1864                                         "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
   1865 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
   1866                                           12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
   1867                                           29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
   1868 static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
   1869                                           12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
   1870                                           29,     32, 33, 35, 37, 38,     40, 41 };
   1871 
   1872 typedef struct {
   1873     UBreakIteratorType  type;
   1874     const char *        locale;
   1875     const char *        escapedText;
   1876     const int32_t *     tailoredOffsets;
   1877     int32_t             tailoredOffsetsCount;
   1878     const int32_t *     rootOffsets;
   1879     int32_t             rootOffsetsCount;
   1880 } TailoredBreakItem;
   1881 
   1882 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
   1883 
   1884 static const TailoredBreakItem tbItems[] = {
   1885     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
   1886     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
   1887     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
   1888     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
   1889     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
   1890 };
   1891 
   1892 static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
   1893     while (count-- > 0) {
   1894         int writeCount;
   1895         sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */
   1896         buffer += writeCount;
   1897         buflen -= writeCount;
   1898     }
   1899 }
   1900 
   1901 enum { kMaxOffsetCount = 128 };
   1902 
   1903 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
   1904     brkitr->setText( CharsToUnicodeString(escapedText) );
   1905     int32_t foundOffsets[kMaxOffsetCount];
   1906     int32_t offset, foundOffsetsCount = 0;
   1907     // do forwards iteration test
   1908     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
   1909         foundOffsets[foundOffsetsCount++] = offset;
   1910     }
   1911     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
   1912         // log error for forwards test
   1913         char formatExpect[512], formatFound[512];
   1914         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1915         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
   1916         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
   1917                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
   1918     } else {
   1919         // do backwards iteration test
   1920         --foundOffsetsCount; // back off one from the end offset
   1921         while ( foundOffsetsCount > 0 ) {
   1922             offset = brkitr->previous();
   1923             if ( offset != foundOffsets[--foundOffsetsCount] ) {
   1924                 // log error for backwards test
   1925                 char formatExpect[512];
   1926                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
   1927                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
   1928                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
   1929                 break;
   1930             }
   1931         }
   1932     }
   1933 }
   1934 
   1935 void RBBITest::TestTailoredBreaks() {
   1936     const TailoredBreakItem * tbItemPtr;
   1937     Locale rootLocale = Locale("root");
   1938     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
   1939         Locale testLocale = Locale(tbItemPtr->locale);
   1940         BreakIterator * tailoredBrkiter;
   1941         BreakIterator * rootBrkiter;
   1942         UErrorCode status = U_ZERO_ERROR;
   1943         switch (tbItemPtr->type) {
   1944             case UBRK_CHARACTER:
   1945                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
   1946                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
   1947                 break;
   1948             case UBRK_WORD:
   1949                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
   1950                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
   1951                 break;
   1952             case UBRK_LINE:
   1953                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
   1954                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
   1955                 break;
   1956             case UBRK_SENTENCE:
   1957                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
   1958                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
   1959                 break;
   1960             default:
   1961                 status = U_UNSUPPORTED_ERROR;
   1962                 break;
   1963         }
   1964         if (U_FAILURE(status)) {
   1965             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
   1966             continue;
   1967         }
   1968         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
   1969         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
   1970 
   1971         delete rootBrkiter;
   1972         delete tailoredBrkiter;
   1973     }
   1974 }
   1975 
   1976 
   1977 //-------------------------------------------------------------------------------
   1978 //
   1979 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
   1980 //    return the datain one big UChar * buffer, which the caller must delete.
   1981 //
   1982 //    parameters:
   1983 //          fileName:   the name of the file, with no directory part.  The test data directory
   1984 //                      is assumed.
   1985 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
   1986 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
   1987 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
   1988 //                      Pass NULL for the system default encoding.
   1989 //          status
   1990 //    returns:
   1991 //                      The file data, converted to UChar.
   1992 //                      The caller must delete this when done with
   1993 //                           delete [] theBuffer;
   1994 //
   1995 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
   1996 //           Move this function to some common place.
   1997 //
   1998 //--------------------------------------------------------------------------------
   1999 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
   2000     UChar       *retPtr  = NULL;
   2001     char        *fileBuf = NULL;
   2002     UConverter* conv     = NULL;
   2003     FILE        *f       = NULL;
   2004 
   2005     ulen = 0;
   2006     if (U_FAILURE(status)) {
   2007         return retPtr;
   2008     }
   2009 
   2010     //
   2011     //  Open the file.
   2012     //
   2013     f = fopen(fileName, "rb");
   2014     if (f == 0) {
   2015         dataerrln("Error opening test data file %s\n", fileName);
   2016         status = U_FILE_ACCESS_ERROR;
   2017         return NULL;
   2018     }
   2019     //
   2020     //  Read it in
   2021     //
   2022     int   fileSize;
   2023     int   amt_read;
   2024 
   2025     fseek( f, 0, SEEK_END);
   2026     fileSize = ftell(f);
   2027     fileBuf = new char[fileSize];
   2028     fseek(f, 0, SEEK_SET);
   2029     amt_read = fread(fileBuf, 1, fileSize, f);
   2030     if (amt_read != fileSize || fileSize <= 0) {
   2031         errln("Error reading test data file.");
   2032         goto cleanUpAndReturn;
   2033     }
   2034 
   2035     //
   2036     // Look for a Unicode Signature (BOM) on the data just read
   2037     //
   2038     int32_t        signatureLength;
   2039     const char *   fileBufC;
   2040     const char*    bomEncoding;
   2041 
   2042     fileBufC = fileBuf;
   2043     bomEncoding = ucnv_detectUnicodeSignature(
   2044         fileBuf, fileSize, &signatureLength, &status);
   2045     if(bomEncoding!=NULL ){
   2046         fileBufC  += signatureLength;
   2047         fileSize  -= signatureLength;
   2048         encoding = bomEncoding;
   2049     }
   2050 
   2051     //
   2052     // Open a converter to take the rule file to UTF-16
   2053     //
   2054     conv = ucnv_open(encoding, &status);
   2055     if (U_FAILURE(status)) {
   2056         goto cleanUpAndReturn;
   2057     }
   2058 
   2059     //
   2060     // Convert the rules to UChar.
   2061     //  Preflight first to determine required buffer size.
   2062     //
   2063     ulen = ucnv_toUChars(conv,
   2064         NULL,           //  dest,
   2065         0,              //  destCapacity,
   2066         fileBufC,
   2067         fileSize,
   2068         &status);
   2069     if (status == U_BUFFER_OVERFLOW_ERROR) {
   2070         // Buffer Overflow is expected from the preflight operation.
   2071         status = U_ZERO_ERROR;
   2072 
   2073         retPtr = new UChar[ulen+1];
   2074         ucnv_toUChars(conv,
   2075             retPtr,       //  dest,
   2076             ulen+1,
   2077             fileBufC,
   2078             fileSize,
   2079             &status);
   2080     }
   2081 
   2082 cleanUpAndReturn:
   2083     fclose(f);
   2084     delete []fileBuf;
   2085     ucnv_close(conv);
   2086     if (U_FAILURE(status)) {
   2087         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   2088         delete retPtr;
   2089         retPtr = 0;
   2090         ulen   = 0;
   2091     };
   2092     return retPtr;
   2093 }
   2094 
   2095 
   2096 
   2097 //--------------------------------------------------------------------------------------------
   2098 //
   2099 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
   2100 //
   2101 //-------------------------------------------------------------------------------------------
   2102 void RBBITest::TestUnicodeFiles() {
   2103     RuleBasedBreakIterator  *bi;
   2104     UErrorCode               status = U_ZERO_ERROR;
   2105 
   2106     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
   2107     TEST_ASSERT_SUCCESS(status);
   2108     if (U_SUCCESS(status)) {
   2109         runUnicodeTestData("GraphemeBreakTest.txt", bi);
   2110     }
   2111     delete bi;
   2112 
   2113     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
   2114     TEST_ASSERT_SUCCESS(status);
   2115     if (U_SUCCESS(status)) {
   2116         runUnicodeTestData("WordBreakTest.txt", bi);
   2117     }
   2118     delete bi;
   2119 
   2120     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   2121     TEST_ASSERT_SUCCESS(status);
   2122     if (U_SUCCESS(status)) {
   2123         runUnicodeTestData("SentenceBreakTest.txt", bi);
   2124     }
   2125     delete bi;
   2126 
   2127     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   2128     TEST_ASSERT_SUCCESS(status);
   2129     if (U_SUCCESS(status)) {
   2130         runUnicodeTestData("LineBreakTest.txt", bi);
   2131     }
   2132     delete bi;
   2133 }
   2134 
   2135 
   2136 //--------------------------------------------------------------------------------------------
   2137 //
   2138 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
   2139 //
   2140 //-------------------------------------------------------------------------------------------
   2141 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
   2142 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2143     UErrorCode  status = U_ZERO_ERROR;
   2144 
   2145     //
   2146     //  Open and read the test data file, put it into a UnicodeString.
   2147     //
   2148     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2149     char testFileName[1000];
   2150     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
   2151         dataerrln("Can't open test data.  Path too long.");
   2152         return;
   2153     }
   2154     strcpy(testFileName, testDataDirectory);
   2155     strcat(testFileName, fileName);
   2156 
   2157     logln("Opening data file %s\n", fileName);
   2158 
   2159     int    len;
   2160     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
   2161     if (status != U_FILE_ACCESS_ERROR) {
   2162         TEST_ASSERT_SUCCESS(status);
   2163         TEST_ASSERT(testFile != NULL);
   2164     }
   2165     if (U_FAILURE(status) || testFile == NULL) {
   2166         return; /* something went wrong, error already output */
   2167     }
   2168     UnicodeString testFileAsString(TRUE, testFile, len);
   2169 
   2170     //
   2171     //  Parse the test data file using a regular expression.
   2172     //  Each kind of token is recognized in its own capture group; what type of item was scanned
   2173     //     is identified by which group had a match.
   2174     //
   2175     //    Caputure Group #                  1          2            3            4           5
   2176     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
   2177     //
   2178     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
   2179     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
   2180     UnicodeString   testString;
   2181     UVector32       breakPositions(status);
   2182     int             lineNumber = 1;
   2183     TEST_ASSERT_SUCCESS(status);
   2184     if (U_FAILURE(status)) {
   2185         return;
   2186     }
   2187 
   2188     //
   2189     //  Scan through each test case, building up the string to be broken in testString,
   2190     //   and the positions that should be boundaries in the breakPositions vector.
   2191     //
   2192     while (tokenMatcher.find()) {
   2193         if (tokenMatcher.start(1, status) >= 0) {
   2194             // Scanned a divide sign, indicating a break position in the test data.
   2195             if (testString.length()>0) {
   2196                 breakPositions.addElement(testString.length(), status);
   2197             }
   2198         }
   2199         else if (tokenMatcher.start(2, status) >= 0) {
   2200             // Scanned an 'x', meaning no break at this position in the test data
   2201             //   Nothing to be done here.
   2202             }
   2203         else if (tokenMatcher.start(3, status) >= 0) {
   2204             // Scanned Hex digits.  Convert them to binary, append to the character data string.
   2205             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
   2206             int length = hexNumber.length();
   2207             if (length<=8) {
   2208                 char buf[10];
   2209                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
   2210                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
   2211                 if (c<=0x10ffff) {
   2212                     testString.append(c);
   2213                 } else {
   2214                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
   2215                        fileName, lineNumber);
   2216                 }
   2217             } else {
   2218                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
   2219                        fileName, lineNumber);
   2220              }
   2221         }
   2222         else if (tokenMatcher.start(4, status) >= 0) {
   2223             // Scanned to end of a line, possibly skipping over a comment in the process.
   2224             //   If the line from the file contained test data, run the test now.
   2225             //
   2226             if (testString.length() > 0) {
   2227                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
   2228             }
   2229 
   2230             // Clear out this test case.
   2231             //    The string and breakPositions vector will be refilled as the next
   2232             //       test case is parsed.
   2233             testString.remove();
   2234             breakPositions.removeAllElements();
   2235             lineNumber++;
   2236         } else {
   2237             // Scanner catchall.  Something unrecognized appeared on the line.
   2238             char token[16];
   2239             UnicodeString uToken = tokenMatcher.group(0, status);
   2240             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
   2241             token[sizeof(token)-1] = 0;
   2242             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
   2243 
   2244             // Clean up, in preparation for continuing with the next line.
   2245             testString.remove();
   2246             breakPositions.removeAllElements();
   2247             lineNumber++;
   2248         }
   2249         TEST_ASSERT_SUCCESS(status);
   2250         if (U_FAILURE(status)) {
   2251             break;
   2252         }
   2253     }
   2254 
   2255     delete [] testFile;
   2256  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2257 }
   2258 
   2259 //--------------------------------------------------------------------------------------------
   2260 //
   2261 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
   2262 //                            test data files.  Do only a simple, forward-only check -
   2263 //                            this test is mostly to check that ICU and the Unicode
   2264 //                            data agree with each other.
   2265 //
   2266 //--------------------------------------------------------------------------------------------
   2267 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
   2268                          const UnicodeString &testString,   // Text data to be broken
   2269                          UVector32 *breakPositions,         // Positions where breaks should be found.
   2270                          RuleBasedBreakIterator *bi) {
   2271     int32_t pos;                 // Break Position in the test string
   2272     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
   2273     int32_t expectedPos;         // Expected break position (index into test string)
   2274 
   2275     bi->setText(testString);
   2276     pos = bi->first();
   2277     pos = bi->next();
   2278 
   2279     while (pos != BreakIterator::DONE) {
   2280         if (expectedI >= breakPositions->size()) {
   2281             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2282                 testFileName, lineNumber, pos);
   2283             break;
   2284         }
   2285         expectedPos = breakPositions->elementAti(expectedI);
   2286         if (pos < expectedPos) {
   2287             errln("Test file \"%s\", line %d, unexpected break found at position %d",
   2288                 testFileName, lineNumber, pos);
   2289             break;
   2290         }
   2291         if (pos > expectedPos) {
   2292             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2293                 testFileName, lineNumber, expectedPos);
   2294             break;
   2295         }
   2296         pos = bi->next();
   2297         expectedI++;
   2298     }
   2299 
   2300     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
   2301         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
   2302             testFileName, lineNumber, breakPositions->elementAti(expectedI));
   2303     }
   2304 }
   2305 
   2306 
   2307 
   2308 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   2309 //---------------------------------------------------------------------------------------
   2310 //
//   class RBBIMonkeyKind
   2312 //
   2313 //      Monkey Test for Break Iteration
   2314 //      Abstract interface class.   Concrete derived classes independently
   2315 //      implement the break rules for different iterator types.
   2316 //
//      The Monkey Test itself doesn't know which type of break iterator it is
   2318 //      testing, but works purely in terms of the interface defined here.
   2319 //
   2320 //---------------------------------------------------------------------------------------
   2321 class RBBIMonkeyKind {
   2322 public:
   2323     // Return a UVector of UnicodeSets, representing the character classes used
   2324     //   for this type of iterator.
   2325     virtual  UVector  *charClasses() = 0;
   2326 
   2327     // Set the test text on which subsequent calls to next() will operate
   2328     virtual  void      setText(const UnicodeString &s) = 0;
   2329 
   2330     // Find the next break postion, starting from the prev break position, or from zero.
   2331     // Return -1 after reaching end of string.
   2332     virtual  int32_t   next(int32_t i) = 0;
   2333 
   2334     virtual ~RBBIMonkeyKind();
   2335     UErrorCode       deferredStatus;
   2336 
   2337 
   2338 protected:
   2339     RBBIMonkeyKind();
   2340 
   2341 private:
   2342 };
   2343 
   2344 RBBIMonkeyKind::RBBIMonkeyKind() {
   2345     deferredStatus = U_ZERO_ERROR;
   2346 }
   2347 
   2348 RBBIMonkeyKind::~RBBIMonkeyKind() {
   2349 }
   2350 
   2351 
   2352 //----------------------------------------------------------------------------------------
   2353 //
   2354 //   Random Numbers.  Similar to standard lib rand() and srand()
   2355 //                    Not using library to
   2356 //                      1.  Get same results on all platforms.
   2357 //                      2.  Get access to current seed, to more easily reproduce failures.
   2358 //
   2359 //---------------------------------------------------------------------------------------
   2360 static uint32_t m_seed = 1;
   2361 
   2362 static uint32_t m_rand()
   2363 {
   2364     m_seed = m_seed * 1103515245 + 12345;
   2365     return (uint32_t)(m_seed/65536) % 32768;
   2366 }
   2367 
   2368 
   2369 //------------------------------------------------------------------------------------------
   2370 //
   2371 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
   2372 //                             of RBBIMonkeyKind.
   2373 //
   2374 //------------------------------------------------------------------------------------------
   2375 class RBBICharMonkey: public RBBIMonkeyKind {
   2376 public:
   2377     RBBICharMonkey();
   2378     virtual          ~RBBICharMonkey();
   2379     virtual  UVector *charClasses();
   2380     virtual  void     setText(const UnicodeString &s);
   2381     virtual  int32_t  next(int32_t i);
   2382 private:
   2383     UVector   *fSets;
   2384 
   2385     UnicodeSet  *fCRLFSet;
   2386     UnicodeSet  *fControlSet;
   2387     UnicodeSet  *fExtendSet;
   2388     UnicodeSet  *fPrependSet;
   2389     UnicodeSet  *fSpacingSet;
   2390     UnicodeSet  *fLSet;
   2391     UnicodeSet  *fVSet;
   2392     UnicodeSet  *fTSet;
   2393     UnicodeSet  *fLVSet;
   2394     UnicodeSet  *fLVTSet;
   2395     UnicodeSet  *fHangulSet;
   2396     UnicodeSet  *fAnySet;
   2397 
   2398     const UnicodeString *fText;
   2399 };
   2400 
   2401 
   2402 RBBICharMonkey::RBBICharMonkey() {
   2403     UErrorCode  status = U_ZERO_ERROR;
   2404 
   2405     fText = NULL;
   2406 
   2407     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
   2408     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
   2409     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
   2410     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
   2411     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
   2412     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
   2413     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
   2414     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
   2415     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
   2416     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
   2417     fHangulSet  = new UnicodeSet();
   2418     fHangulSet->addAll(*fLSet);
   2419     fHangulSet->addAll(*fVSet);
   2420     fHangulSet->addAll(*fTSet);
   2421     fHangulSet->addAll(*fLVSet);
   2422     fHangulSet->addAll(*fLVTSet);
   2423     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
   2424 
   2425     fSets       = new UVector(status);
   2426     fSets->addElement(fCRLFSet,    status);
   2427     fSets->addElement(fControlSet, status);
   2428     fSets->addElement(fExtendSet,  status);
   2429     fSets->addElement(fPrependSet, status);
   2430     fSets->addElement(fSpacingSet, status);
   2431     fSets->addElement(fHangulSet,  status);
   2432     fSets->addElement(fAnySet,     status);
   2433     if (U_FAILURE(status)) {
   2434         deferredStatus = status;
   2435     }
   2436 }
   2437 
   2438 
   2439 void RBBICharMonkey::setText(const UnicodeString &s) {
   2440     fText = &s;
   2441 }
   2442 
   2443 
   2444 
   2445 int32_t RBBICharMonkey::next(int32_t prevPos) {
   2446     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2447                               //   break position being tested.  The candidate break
   2448                               //   location is before p2.
   2449 
   2450     int     breakPos = -1;
   2451 
   2452     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2453 
   2454     if (U_FAILURE(deferredStatus)) {
   2455         return -1;
   2456     }
   2457 
   2458     // Previous break at end of string.  return DONE.
   2459     if (prevPos >= fText->length()) {
   2460         return -1;
   2461     }
   2462     p0 = p1 = p2 = p3 = prevPos;
   2463     c3 =  fText->char32At(prevPos);
   2464     c0 = c1 = c2 = 0;
   2465 
   2466     // Loop runs once per "significant" character position in the input text.
   2467     for (;;) {
   2468         // Move all of the positions forward in the input string.
   2469         p0 = p1;  c0 = c1;
   2470         p1 = p2;  c1 = c2;
   2471         p2 = p3;  c2 = c3;
   2472 
   2473         // Advancd p3 by one codepoint
   2474         p3 = fText->moveIndex32(p3, 1);
   2475         c3 = fText->char32At(p3);
   2476 
   2477         if (p1 == p2) {
   2478             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2479             continue;
   2480         }
   2481         if (p2 == fText->length()) {
   2482             // Reached end of string.  Always a break position.
   2483             break;
   2484         }
   2485 
   2486         // Rule  GB3   CR x LF
   2487         //     No Extend or Format characters may appear between the CR and LF,
   2488         //     which requires the additional check for p2 immediately following p1.
   2489         //
   2490         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
   2491             continue;
   2492         }
   2493 
   2494         // Rule (GB4).   ( Control | CR | LF ) <break>
   2495         if (fControlSet->contains(c1) ||
   2496             c1 == 0x0D ||
   2497             c1 == 0x0A)  {
   2498             break;
   2499         }
   2500 
   2501         // Rule (GB5)    <break>  ( Control | CR | LF )
   2502         //
   2503         if (fControlSet->contains(c2) ||
   2504             c2 == 0x0D ||
   2505             c2 == 0x0A)  {
   2506             break;
   2507         }
   2508 
   2509 
   2510         // Rule (GB6)  L x ( L | V | LV | LVT )
   2511         if (fLSet->contains(c1) &&
   2512                (fLSet->contains(c2)  ||
   2513                 fVSet->contains(c2)  ||
   2514                 fLVSet->contains(c2) ||
   2515                 fLVTSet->contains(c2))) {
   2516             continue;
   2517         }
   2518 
   2519         // Rule (GB7)    ( LV | V )  x  ( V | T )
   2520         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
   2521             (fVSet->contains(c2) || fTSet->contains(c2)))  {
   2522             continue;
   2523         }
   2524 
   2525         // Rule (GB8)    ( LVT | T)  x T
   2526         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
   2527             fTSet->contains(c2))  {
   2528             continue;
   2529         }
   2530 
   2531         // Rule (GB9)    Numeric x ALetter
   2532         if (fExtendSet->contains(c2))  {
   2533             continue;
   2534         }
   2535 
   2536         // Rule (GB9a)   x  SpacingMark
   2537         if (fSpacingSet->contains(c2)) {
   2538             continue;
   2539         }
   2540 
   2541         // Rule (GB9b)   Prepend x
   2542         if (fPrependSet->contains(c1)) {
   2543             continue;
   2544         }
   2545 
   2546         // Rule (GB10)  Any  <break>  Any
   2547         break;
   2548     }
   2549 
   2550     breakPos = p2;
   2551     return breakPos;
   2552 }
   2553 
   2554 
   2555 
   2556 UVector  *RBBICharMonkey::charClasses() {
   2557     return fSets;
   2558 }
   2559 
   2560 
   2561 RBBICharMonkey::~RBBICharMonkey() {
   2562     delete fSets;
   2563     delete fCRLFSet;
   2564     delete fControlSet;
   2565     delete fExtendSet;
   2566     delete fPrependSet;
   2567     delete fSpacingSet;
   2568     delete fLSet;
   2569     delete fVSet;
   2570     delete fTSet;
   2571     delete fLVSet;
   2572     delete fLVTSet;
   2573     delete fHangulSet;
   2574     delete fAnySet;
   2575 }
   2576 
   2577 //------------------------------------------------------------------------------------------
   2578 //
   2579 //   class RBBIWordMonkey      Word Break specific implementation
   2580 //                             of RBBIMonkeyKind.
   2581 //
   2582 //------------------------------------------------------------------------------------------
   2583 class RBBIWordMonkey: public RBBIMonkeyKind {
   2584 public:
   2585     RBBIWordMonkey();
   2586     virtual          ~RBBIWordMonkey();
   2587     virtual  UVector *charClasses();
   2588     virtual  void     setText(const UnicodeString &s);
   2589     virtual int32_t   next(int32_t i);
   2590 private:
   2591     UVector      *fSets;
   2592 
   2593     UnicodeSet  *fCRSet;
   2594     UnicodeSet  *fLFSet;
   2595     UnicodeSet  *fNewlineSet;
   2596     UnicodeSet  *fKatakanaSet;
   2597     UnicodeSet  *fALetterSet;
   2598     UnicodeSet  *fMidNumLetSet;
   2599     UnicodeSet  *fMidLetterSet;
   2600     UnicodeSet  *fMidNumSet;
   2601     UnicodeSet  *fNumericSet;
   2602     UnicodeSet  *fFormatSet;
   2603     UnicodeSet  *fOtherSet;
   2604     UnicodeSet  *fExtendSet;
   2605     UnicodeSet  *fExtendNumLetSet;
   2606 
   2607     RegexMatcher  *fMatcher;
   2608 
   2609     const UnicodeString  *fText;
   2610 };
   2611 
   2612 
   2613 RBBIWordMonkey::RBBIWordMonkey()
   2614 {
   2615     UErrorCode  status = U_ZERO_ERROR;
   2616 
   2617     fSets            = new UVector(status);
   2618 
   2619     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   2620     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   2621     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   2622     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
   2623     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   2624     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   2625     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   2626     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   2627     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   2628     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   2629     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   2630     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   2631 
   2632     fOtherSet        = new UnicodeSet();
   2633     if(U_FAILURE(status)) {
   2634       deferredStatus = status;
   2635       return;
   2636     }
   2637 
   2638     fOtherSet->complement();
   2639     fOtherSet->removeAll(*fCRSet);
   2640     fOtherSet->removeAll(*fLFSet);
   2641     fOtherSet->removeAll(*fNewlineSet);
   2642     fOtherSet->removeAll(*fKatakanaSet);
   2643     fOtherSet->removeAll(*fALetterSet);
   2644     fOtherSet->removeAll(*fMidLetterSet);
   2645     fOtherSet->removeAll(*fMidNumSet);
   2646     fOtherSet->removeAll(*fNumericSet);
   2647     fOtherSet->removeAll(*fExtendNumLetSet);
   2648     fOtherSet->removeAll(*fFormatSet);
   2649     fOtherSet->removeAll(*fExtendSet);
   2650     // Inhibit dictionary characters from being tested at all.
   2651     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   2652 
   2653     fSets->addElement(fCRSet,        status);
   2654     fSets->addElement(fLFSet,        status);
   2655     fSets->addElement(fNewlineSet,   status);
   2656     fSets->addElement(fALetterSet,   status);
   2657     fSets->addElement(fKatakanaSet,  status);
   2658     fSets->addElement(fMidLetterSet, status);
   2659     fSets->addElement(fMidNumLetSet, status);
   2660     fSets->addElement(fMidNumSet,    status);
   2661     fSets->addElement(fNumericSet,   status);
   2662     fSets->addElement(fFormatSet,    status);
   2663     fSets->addElement(fExtendSet,    status);
   2664     fSets->addElement(fOtherSet,     status);
   2665     fSets->addElement(fExtendNumLetSet, status);
   2666 
   2667     if (U_FAILURE(status)) {
   2668         deferredStatus = status;
   2669     }
   2670 }
   2671 
   2672 void RBBIWordMonkey::setText(const UnicodeString &s) {
   2673     fText       = &s;
   2674 }
   2675 
   2676 
   2677 int32_t RBBIWordMonkey::next(int32_t prevPos) {
   2678     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   2679                               //   break position being tested.  The candidate break
   2680                               //   location is before p2.
   2681 
   2682     int     breakPos = -1;
   2683 
   2684     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   2685 
   2686     if (U_FAILURE(deferredStatus)) {
   2687         return -1;
   2688     }
   2689 
   2690     // Prev break at end of string.  return DONE.
   2691     if (prevPos >= fText->length()) {
   2692         return -1;
   2693     }
   2694     p0 = p1 = p2 = p3 = prevPos;
   2695     c3 =  fText->char32At(prevPos);
   2696     c0 = c1 = c2 = 0;
   2697 
   2698     // Loop runs once per "significant" character position in the input text.
   2699     for (;;) {
   2700         // Move all of the positions forward in the input string.
   2701         p0 = p1;  c0 = c1;
   2702         p1 = p2;  c1 = c2;
   2703         p2 = p3;  c2 = c3;
   2704 
   2705         // Advancd p3 by    X(Extend | Format)*   Rule 4
   2706         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
   2707         do {
   2708             p3 = fText->moveIndex32(p3, 1);
   2709             c3 = fText->char32At(p3);
   2710             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2711                break;
   2712             };
   2713         }
   2714         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
   2715 
   2716 
   2717         if (p1 == p2) {
   2718             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   2719             continue;
   2720         }
   2721         if (p2 == fText->length()) {
   2722             // Reached end of string.  Always a break position.
   2723             break;
   2724         }
   2725 
   2726         // Rule  (3)   CR x LF
   2727         //     No Extend or Format characters may appear between the CR and LF,
   2728         //     which requires the additional check for p2 immediately following p1.
   2729         //
   2730         if (c1==0x0D && c2==0x0A) {
   2731             continue;
   2732         }
   2733 
   2734         // Rule (3a)  Break before and after newlines (including CR and LF)
   2735         //
   2736         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
   2737             break;
   2738         };
   2739         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
   2740             break;
   2741         };
   2742 
   2743         // Rule (5).   ALetter x ALetter
   2744         if (fALetterSet->contains(c1) &&
   2745             fALetterSet->contains(c2))  {
   2746             continue;
   2747         }
   2748 
   2749         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
   2750         //
   2751         if ( fALetterSet->contains(c1)   &&
   2752              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
   2753              fALetterSet->contains(c3)) {
   2754             continue;
   2755         }
   2756 
   2757 
   2758         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
   2759         if (fALetterSet->contains(c0) &&
   2760             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
   2761             fALetterSet->contains(c2)) {
   2762             continue;
   2763         }
   2764 
   2765         // Rule (8)    Numeric x Numeric
   2766         if (fNumericSet->contains(c1) &&
   2767             fNumericSet->contains(c2))  {
   2768             continue;
   2769         }
   2770 
   2771         // Rule (9)    ALetter x Numeric
   2772         if (fALetterSet->contains(c1) &&
   2773             fNumericSet->contains(c2))  {
   2774             continue;
   2775         }
   2776 
   2777         // Rule (10)    Numeric x ALetter
   2778         if (fNumericSet->contains(c1) &&
   2779             fALetterSet->contains(c2))  {
   2780             continue;
   2781         }
   2782 
   2783         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
   2784         if (fNumericSet->contains(c0) &&
   2785             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
   2786             fNumericSet->contains(c2)) {
   2787             continue;
   2788         }
   2789 
   2790         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
   2791         if (fNumericSet->contains(c1) &&
   2792             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
   2793             fNumericSet->contains(c3)) {
   2794             continue;
   2795         }
   2796 
   2797         // Rule (13)  Katakana x Katakana
   2798         if (fKatakanaSet->contains(c1) &&
   2799             fKatakanaSet->contains(c2))  {
   2800             continue;
   2801         }
   2802 
   2803         // Rule 13a
   2804         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
   2805              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
   2806              fExtendNumLetSet->contains(c2)) {
   2807                 continue;
   2808              }
   2809 
   2810         // Rule 13b
   2811         if (fExtendNumLetSet->contains(c1) &&
   2812                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
   2813                 fKatakanaSet->contains(c2)))  {
   2814                 continue;
   2815              }
   2816 
   2817         // Rule 14.  Break found here.
   2818         break;
   2819     }
   2820 
   2821     breakPos = p2;
   2822     return breakPos;
   2823 }
   2824 
   2825 
   2826 UVector  *RBBIWordMonkey::charClasses() {
   2827     return fSets;
   2828 }
   2829 
   2830 
   2831 RBBIWordMonkey::~RBBIWordMonkey() {
   2832     delete fSets;
   2833     delete fCRSet;
   2834     delete fLFSet;
   2835     delete fNewlineSet;
   2836     delete fKatakanaSet;
   2837     delete fALetterSet;
   2838     delete fMidNumLetSet;
   2839     delete fMidLetterSet;
   2840     delete fMidNumSet;
   2841     delete fNumericSet;
   2842     delete fFormatSet;
   2843     delete fExtendSet;
   2844     delete fExtendNumLetSet;
   2845     delete fOtherSet;
   2846 }
   2847 
   2848 
   2849 
   2850 
   2851 //------------------------------------------------------------------------------------------
   2852 //
   2853 //   class RBBISentMonkey      Sentence Break specific implementation
   2854 //                             of RBBIMonkeyKind.
   2855 //
   2856 //------------------------------------------------------------------------------------------
   2857 class RBBISentMonkey: public RBBIMonkeyKind {
   2858 public:
   2859     RBBISentMonkey();
   2860     virtual          ~RBBISentMonkey();
   2861     virtual  UVector *charClasses();
   2862     virtual  void     setText(const UnicodeString &s);
   2863     virtual int32_t   next(int32_t i);
   2864 private:
   2865     int               moveBack(int posFrom);
   2866     int               moveForward(int posFrom);
   2867     UChar32           cAt(int pos);
   2868 
   2869     UVector      *fSets;
   2870 
   2871     UnicodeSet  *fSepSet;
   2872     UnicodeSet  *fFormatSet;
   2873     UnicodeSet  *fSpSet;
   2874     UnicodeSet  *fLowerSet;
   2875     UnicodeSet  *fUpperSet;
   2876     UnicodeSet  *fOLetterSet;
   2877     UnicodeSet  *fNumericSet;
   2878     UnicodeSet  *fATermSet;
   2879     UnicodeSet  *fSContinueSet;
   2880     UnicodeSet  *fSTermSet;
   2881     UnicodeSet  *fCloseSet;
   2882     UnicodeSet  *fOtherSet;
   2883     UnicodeSet  *fExtendSet;
   2884 
   2885     const UnicodeString  *fText;
   2886 
   2887 };
   2888 
   2889 RBBISentMonkey::RBBISentMonkey()
   2890 {
   2891     UErrorCode  status = U_ZERO_ERROR;
   2892 
   2893     fSets            = new UVector(status);
   2894 
   2895     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
   2896     //                       set and made into character classes of their own.  For the monkey impl,
   2897     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
   2898     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
   2899     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
   2900     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
   2901     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
   2902     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
   2903     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
   2904     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
   2905     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
   2906     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
   2907     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
   2908     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
   2909     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
   2910     fOtherSet        = new UnicodeSet();
   2911 
   2912     if(U_FAILURE(status)) {
   2913       deferredStatus = status;
   2914       return;
   2915     }
   2916 
   2917     fOtherSet->complement();
   2918     fOtherSet->removeAll(*fSepSet);
   2919     fOtherSet->removeAll(*fFormatSet);
   2920     fOtherSet->removeAll(*fSpSet);
   2921     fOtherSet->removeAll(*fLowerSet);
   2922     fOtherSet->removeAll(*fUpperSet);
   2923     fOtherSet->removeAll(*fOLetterSet);
   2924     fOtherSet->removeAll(*fNumericSet);
   2925     fOtherSet->removeAll(*fATermSet);
   2926     fOtherSet->removeAll(*fSContinueSet);
   2927     fOtherSet->removeAll(*fSTermSet);
   2928     fOtherSet->removeAll(*fCloseSet);
   2929     fOtherSet->removeAll(*fExtendSet);
   2930 
   2931     fSets->addElement(fSepSet,       status);
   2932     fSets->addElement(fFormatSet,    status);
   2933     fSets->addElement(fSpSet,        status);
   2934     fSets->addElement(fLowerSet,     status);
   2935     fSets->addElement(fUpperSet,     status);
   2936     fSets->addElement(fOLetterSet,   status);
   2937     fSets->addElement(fNumericSet,   status);
   2938     fSets->addElement(fATermSet,     status);
   2939     fSets->addElement(fSContinueSet, status);
   2940     fSets->addElement(fSTermSet,     status);
   2941     fSets->addElement(fCloseSet,     status);
   2942     fSets->addElement(fOtherSet,     status);
   2943     fSets->addElement(fExtendSet,    status);
   2944 
   2945     if (U_FAILURE(status)) {
   2946         deferredStatus = status;
   2947     }
   2948 }
   2949 
   2950 
   2951 
   2952 void RBBISentMonkey::setText(const UnicodeString &s) {
   2953     fText       = &s;
   2954 }
   2955 
   2956 UVector  *RBBISentMonkey::charClasses() {
   2957     return fSets;
   2958 }
   2959 
   2960 
   2961 //  moveBack()   Find the "significant" code point preceding the index i.
   2962 //               Skips over ($Extend | $Format)* .
   2963 //
   2964 int RBBISentMonkey::moveBack(int i) {
   2965     if (i <= 0) {
   2966         return -1;
   2967     }
   2968     UChar32   c;
   2969     int32_t   j = i;
   2970     do {
   2971         j = fText->moveIndex32(j, -1);
   2972         c = fText->char32At(j);
   2973     }
   2974     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
   2975     return j;
   2976 
   2977  }
   2978 
   2979 
   2980 int RBBISentMonkey::moveForward(int i) {
   2981     if (i>=fText->length()) {
   2982         return fText->length();
   2983     }
   2984     UChar32   c;
   2985     int32_t   j = i;
   2986     do {
   2987         j = fText->moveIndex32(j, 1);
   2988         c = cAt(j);
   2989     }
   2990     while (fFormatSet->contains(c) || fExtendSet->contains(c));
   2991     return j;
   2992 }
   2993 
   2994 UChar32 RBBISentMonkey::cAt(int pos) {
   2995     if (pos<0 || pos>=fText->length()) {
   2996         return -1;
   2997     } else {
   2998         return fText->char32At(pos);
   2999     }
   3000 }
   3001 
   3002 int32_t RBBISentMonkey::next(int32_t prevPos) {
   3003     int    p0, p1, p2, p3;    // Indices of the significant code points around the
   3004                               //   break position being tested.  The candidate break
   3005                               //   location is before p2.
   3006 
   3007     int     breakPos = -1;
   3008 
   3009     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
   3010     UChar32 c;
   3011 
   3012     if (U_FAILURE(deferredStatus)) {
   3013         return -1;
   3014     }
   3015 
   3016     // Prev break at end of string.  return DONE.
   3017     if (prevPos >= fText->length()) {
   3018         return -1;
   3019     }
   3020     p0 = p1 = p2 = p3 = prevPos;
   3021     c3 =  fText->char32At(prevPos);
   3022     c0 = c1 = c2 = 0;
   3023 
   3024     // Loop runs once per "significant" character position in the input text.
   3025     for (;;) {
   3026         // Move all of the positions forward in the input string.
   3027         p0 = p1;  c0 = c1;
   3028         p1 = p2;  c1 = c2;
   3029         p2 = p3;  c2 = c3;
   3030 
   3031         // Advancd p3 by    X(Extend | Format)*   Rule 4
   3032         p3 = moveForward(p3);
   3033         c3 = cAt(p3);
   3034 
   3035         // Rule (3)  CR x LF
   3036         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
   3037             continue;
   3038         }
   3039 
   3040         // Rule (4).   Sep  <break>
   3041         if (fSepSet->contains(c1)) {
   3042             p2 = p1+1;   // Separators don't combine with Extend or Format.
   3043             break;
   3044         }
   3045 
   3046         if (p2 >= fText->length()) {
   3047             // Reached end of string.  Always a break position.
   3048             break;
   3049         }
   3050 
   3051         if (p2 == prevPos) {
   3052             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
   3053             continue;
   3054         }
   3055 
   3056         // Rule (6).   ATerm x Numeric
   3057         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
   3058             continue;
   3059         }
   3060 
   3061         // Rule (7).  Upper ATerm  x  Uppper
   3062         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
   3063             continue;
   3064         }
   3065 
   3066         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
   3067         //           Note:  STerm | ATerm are added to the negated part of the expression by a
   3068         //                  note to the Unicode 5.0 documents.
   3069         int p8 = p1;
   3070         while (fSpSet->contains(cAt(p8))) {
   3071             p8 = moveBack(p8);
   3072         }
   3073         while (fCloseSet->contains(cAt(p8))) {
   3074             p8 = moveBack(p8);
   3075         }
   3076         if (fATermSet->contains(cAt(p8))) {
   3077             p8=p2;
   3078             for (;;) {
   3079                 c = cAt(p8);
   3080                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
   3081                     fLowerSet->contains(c) || fSepSet->contains(c) ||
   3082                     fATermSet->contains(c) || fSTermSet->contains(c))  {
   3083                     break;
   3084                 }
   3085                 p8 = moveForward(p8);
   3086             }
   3087             if (fLowerSet->contains(cAt(p8))) {
   3088                 continue;
   3089             }
   3090         }
   3091 
   3092         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
   3093         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
   3094             p8 = p1;
   3095             while (fSpSet->contains(cAt(p8))) {
   3096                 p8 = moveBack(p8);
   3097             }
   3098             while (fCloseSet->contains(cAt(p8))) {
   3099                 p8 = moveBack(p8);
   3100             }
   3101             c = cAt(p8);
   3102             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
   3103                 continue;
   3104             }
   3105         }
   3106 
   3107         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
   3108         int p9 = p1;
   3109         while (fCloseSet->contains(cAt(p9))) {
   3110             p9 = moveBack(p9);
   3111         }
   3112         c = cAt(p9);
   3113         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
   3114             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3115                 continue;
   3116             }
   3117         }
   3118 
   3119         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
   3120         int p10 = p1;
   3121         while (fSpSet->contains(cAt(p10))) {
   3122             p10 = moveBack(p10);
   3123         }
   3124         while (fCloseSet->contains(cAt(p10))) {
   3125             p10 = moveBack(p10);
   3126         }
   3127         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
   3128             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
   3129                 continue;
   3130             }
   3131         }
   3132 
   3133         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
   3134         int p11 = p1;
   3135         if (fSepSet->contains(cAt(p11))) {
   3136             p11 = moveBack(p11);
   3137         }
   3138         while (fSpSet->contains(cAt(p11))) {
   3139             p11 = moveBack(p11);
   3140         }
   3141         while (fCloseSet->contains(cAt(p11))) {
   3142             p11 = moveBack(p11);
   3143         }
   3144         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
   3145             break;
   3146         }
   3147 
   3148         //  Rule (12)  Any x Any
   3149         continue;
   3150     }
   3151     breakPos = p2;
   3152     return breakPos;
   3153 }
   3154 
   3155 RBBISentMonkey::~RBBISentMonkey() {
   3156     delete fSets;
   3157     delete fSepSet;
   3158     delete fFormatSet;
   3159     delete fSpSet;
   3160     delete fLowerSet;
   3161     delete fUpperSet;
   3162     delete fOLetterSet;
   3163     delete fNumericSet;
   3164     delete fATermSet;
   3165     delete fSContinueSet;
   3166     delete fSTermSet;
   3167     delete fCloseSet;
   3168     delete fOtherSet;
   3169     delete fExtendSet;
   3170 }
   3171 
   3172 
   3173 
   3174 //-------------------------------------------------------------------------------------------
   3175 //
   3176 //  RBBILineMonkey
   3177 //
   3178 //-------------------------------------------------------------------------------------------
   3179 
   3180 class RBBILineMonkey: public RBBIMonkeyKind {
   3181 public:
   3182     RBBILineMonkey();
   3183     virtual          ~RBBILineMonkey();
   3184     virtual  UVector *charClasses();
   3185     virtual  void     setText(const UnicodeString &s);
   3186     virtual  int32_t  next(int32_t i);
   3187     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
   3188 private:
   3189     UVector      *fSets;
   3190 
   3191     UnicodeSet  *fBK;
   3192     UnicodeSet  *fCR;
   3193     UnicodeSet  *fLF;
   3194     UnicodeSet  *fCM;
   3195     UnicodeSet  *fNL;
   3196     UnicodeSet  *fSG;
   3197     UnicodeSet  *fWJ;
   3198     UnicodeSet  *fZW;
   3199     UnicodeSet  *fGL;
   3200     UnicodeSet  *fCB;
   3201     UnicodeSet  *fSP;
   3202     UnicodeSet  *fB2;
   3203     UnicodeSet  *fBA;
   3204     UnicodeSet  *fBB;
   3205     UnicodeSet  *fHY;
   3206     UnicodeSet  *fH2;
   3207     UnicodeSet  *fH3;
   3208     UnicodeSet  *fCL;
   3209     UnicodeSet  *fEX;
   3210     UnicodeSet  *fIN;
   3211     UnicodeSet  *fJL;
   3212     UnicodeSet  *fJV;
   3213     UnicodeSet  *fJT;
   3214     UnicodeSet  *fNS;
   3215     UnicodeSet  *fOP;
   3216     UnicodeSet  *fQU;
   3217     UnicodeSet  *fIS;
   3218     UnicodeSet  *fNU;
   3219     UnicodeSet  *fPO;
   3220     UnicodeSet  *fPR;
   3221     UnicodeSet  *fSY;
   3222     UnicodeSet  *fAI;
   3223     UnicodeSet  *fAL;
   3224     UnicodeSet  *fID;
   3225     UnicodeSet  *fSA;
   3226     UnicodeSet  *fXX;
   3227 
   3228     BreakIterator  *fCharBI;
   3229 
   3230     const UnicodeString  *fText;
   3231     int32_t              *fOrigPositions;
   3232 
   3233     RegexMatcher         *fNumberMatcher;
   3234     RegexMatcher         *fLB11Matcher;
   3235 };
   3236 
   3237 
   3238 RBBILineMonkey::RBBILineMonkey()
   3239 {
   3240     UErrorCode  status = U_ZERO_ERROR;
   3241 
   3242     fSets  = new UVector(status);
   3243 
   3244     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
   3245     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
   3246     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
   3247     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
   3248     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
   3249     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
   3250     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
   3251     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
   3252     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
   3253     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
   3254     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
   3255     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
   3256     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
   3257     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
   3258     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
   3259     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
   3260     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
   3261     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
   3262     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
   3263     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
   3264     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
   3265     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
   3266     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
   3267     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
   3268     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
   3269     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
   3270     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
   3271     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
   3272     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
   3273     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
   3274     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
   3275     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
   3276     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
   3277     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
   3278     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
   3279     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
   3280 
   3281     if (U_FAILURE(status)) {
   3282         deferredStatus = status;
   3283         fCharBI = NULL;
   3284         fNumberMatcher = NULL;
   3285         return;
   3286     }
   3287 
   3288     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
   3289     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
   3290     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
   3291     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
   3292 
   3293     fSets->addElement(fBK, status);
   3294     fSets->addElement(fCR, status);
   3295     fSets->addElement(fLF, status);
   3296     fSets->addElement(fCM, status);
   3297     fSets->addElement(fNL, status);
   3298     fSets->addElement(fWJ, status);
   3299     fSets->addElement(fZW, status);
   3300     fSets->addElement(fGL, status);
   3301     fSets->addElement(fCB, status);
   3302     fSets->addElement(fSP, status);
   3303     fSets->addElement(fB2, status);
   3304     fSets->addElement(fBA, status);
   3305     fSets->addElement(fBB, status);
   3306     fSets->addElement(fHY, status);
   3307     fSets->addElement(fH2, status);
   3308     fSets->addElement(fH3, status);
   3309     fSets->addElement(fCL, status);
   3310     fSets->addElement(fEX, status);
   3311     fSets->addElement(fIN, status);
   3312     fSets->addElement(fJL, status);
   3313     fSets->addElement(fJT, status);
   3314     fSets->addElement(fJV, status);
   3315     fSets->addElement(fNS, status);
   3316     fSets->addElement(fOP, status);
   3317     fSets->addElement(fQU, status);
   3318     fSets->addElement(fIS, status);
   3319     fSets->addElement(fNU, status);
   3320     fSets->addElement(fPO, status);
   3321     fSets->addElement(fPR, status);
   3322     fSets->addElement(fSY, status);
   3323     fSets->addElement(fAI, status);
   3324     fSets->addElement(fAL, status);
   3325     fSets->addElement(fID, status);
   3326     fSets->addElement(fWJ, status);
   3327     fSets->addElement(fSA, status);
   3328     fSets->addElement(fSG, status);
   3329 
   3330     const char *rules =
   3331             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
   3332             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
   3333             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
   3334             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
   3335             "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
   3336             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
   3337 
   3338     fNumberMatcher = new RegexMatcher(
   3339         UnicodeString(rules, -1, US_INV), 0, status);
   3340 
   3341     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   3342 
   3343     if (U_FAILURE(status)) {
   3344         deferredStatus = status;
   3345     }
   3346 }
   3347 
   3348 
   3349 void RBBILineMonkey::setText(const UnicodeString &s) {
   3350     fText       = &s;
   3351     fCharBI->setText(s);
   3352     fNumberMatcher->reset(s);
   3353 }
   3354 
   3355 //
   3356 //  rule9Adjust
   3357 //     Line Break TR rules 9 and 10 implementation.
   3358 //     This deals with combining marks and other sequences that
   3359 //     that must be treated as if they were something other than what they actually are.
   3360 //
   3361 //     This is factored out into a separate function because it must be applied twice for
   3362 //     each potential break, once to the chars before the position being checked, then
   3363 //     again to the text following the possible break.
   3364 //
   3365 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
   3366     if (pos == -1) {
   3367         // Invalid initial position.  Happens during the warmup iteration of the
   3368         //   main loop in next().
   3369         return;
   3370     }
   3371 
   3372     int32_t  nPos = *nextPos;
   3373 
   3374     // LB 9  Keep combining sequences together.
   3375     //  advance over any CM class chars.  Note that Line Break CM is different
   3376     //  from the normal Grapheme Extend property.
   3377     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
   3378           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
   3379         for (;;) {
   3380             *nextChar = fText->char32At(nPos);
   3381             if (!fCM->contains(*nextChar)) {
   3382                 break;
   3383             }
   3384             nPos = fText->moveIndex32(nPos, 1);
   3385         }
   3386     }
   3387 
   3388 
   3389     // LB 9 Treat X CM* as if it were x.
   3390     //       No explicit action required.
   3391 
   3392     // LB 10  Treat any remaining combining mark as AL
   3393     if (fCM->contains(*posChar)) {
   3394         *posChar = 0x41;   // thisChar = 'A';
   3395     }
   3396 
   3397     // Push the updated nextPos and nextChar back to our caller.
   3398     // This only makes a difference if posChar got bigger by consuming a
   3399     // combining sequence.
   3400     *nextPos  = nPos;
   3401     *nextChar = fText->char32At(nPos);
   3402 }
   3403 
   3404 
   3405 
   3406 int32_t RBBILineMonkey::next(int32_t startPos) {
   3407     UErrorCode status = U_ZERO_ERROR;
   3408     int32_t    pos;       //  Index of the char following a potential break position
   3409     UChar32    thisChar;  //  Character at above position "pos"
   3410 
   3411     int32_t    prevPos;   //  Index of the char preceding a potential break position
   3412     UChar32    prevChar;  //  Character at above position.  Note that prevChar
   3413                           //   and thisChar may not be adjacent because combining
   3414                           //   characters between them will be ignored.
   3415 
   3416     int32_t    nextPos;   //  Index of the next character following pos.
   3417                           //     Usually skips over combining marks.
   3418     int32_t    nextCPPos; //  Index of the code point following "pos."
   3419                           //     May point to a combining mark.
   3420     int32_t    tPos;      //  temp value.
   3421     UChar32    c;
   3422 
   3423     if (U_FAILURE(deferredStatus)) {
   3424         return -1;
   3425     }
   3426 
   3427     if (startPos >= fText->length()) {
   3428         return -1;
   3429     }
   3430 
   3431 
   3432     // Initial values for loop.  Loop will run the first time without finding breaks,
   3433     //                           while the invalid values shift out and the "this" and
   3434     //                           "prev" positions are filled in with good values.
   3435     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
   3436     thisChar = prevChar  = 0;
   3437     nextPos  = nextCPPos = startPos;
   3438 
   3439 
   3440     // Loop runs once per position in the test text, until a break position
   3441     //  is found.
   3442     for (;;) {
   3443         prevPos   = pos;
   3444         prevChar  = thisChar;
   3445 
   3446         pos       = nextPos;
   3447         thisChar  = fText->char32At(pos);
   3448 
   3449         nextCPPos = fText->moveIndex32(pos, 1);
   3450         nextPos   = nextCPPos;
   3451 
   3452         // Rule LB2 - Break at end of text.
   3453         if (pos >= fText->length()) {
   3454             break;
   3455         }
   3456 
   3457         // Rule LB 9 - adjust for combining sequences.
   3458         //             We do this one out-of-order because the adjustment does not change anything
   3459         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
   3460         //             be applied.
   3461         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
   3462         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
   3463         c = fText->char32At(nextPos);
   3464         rule9Adjust(pos,     &thisChar, &nextPos, &c);
   3465 
   3466         // If the loop is still warming up - if we haven't shifted the initial
   3467         //   -1 positions out of prevPos yet - loop back to advance the
   3468         //    position in the input without any further looking for breaks.
   3469         if (prevPos == -1) {
   3470             continue;
   3471         }
   3472 
   3473         // LB 4  Always break after hard line breaks,
   3474         if (fBK->contains(prevChar)) {
   3475             break;
   3476         }
   3477 
   3478         // LB 5  Break after CR, LF, NL, but not inside CR LF
   3479         if (prevChar == 0x0d && thisChar == 0x0a) {
   3480             continue;
   3481         }
   3482         if (prevChar == 0x0d ||
   3483             prevChar == 0x0a ||
   3484             prevChar == 0x85)  {
   3485             break;
   3486         }
   3487 
   3488         // LB 6  Don't break before hard line breaks
   3489         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
   3490             fBK->contains(thisChar)) {
   3491                 continue;
   3492         }
   3493 
   3494 
   3495         // LB 7  Don't break before spaces or zero-width space.
   3496         if (fSP->contains(thisChar)) {
   3497             continue;
   3498         }
   3499 
   3500         if (fZW->contains(thisChar)) {
   3501             continue;
   3502         }
   3503 
   3504         // LB 8  Break after zero width space
   3505         if (fZW->contains(prevChar)) {
   3506             break;
   3507         }
   3508 
   3509         // LB 9, 10  Already done, at top of loop.
   3510         //
   3511 
   3512 
   3513         // LB 11  Do not break before or after WORD JOINER and related characters.
   3514         //    x  WJ
   3515         //    WJ  x
   3516         //
   3517         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
   3518             continue;
   3519         }
   3520 
   3521         // LB 12
   3522         //    GL  x
   3523         if (fGL->contains(prevChar)) {
   3524             continue;
   3525         }
   3526 
   3527         // LB 12a
   3528         //    [^SP BA HY] x GL
   3529         if (!(fSP->contains(prevChar) ||
   3530               fBA->contains(prevChar) ||
   3531               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
   3532             continue;
   3533         }
   3534 
   3535 
   3536 
   3537         // LB 13  Don't break before closings.
   3538         //        NU x CL  and NU x IS are not matched here so that they will
   3539         //        fall into LB 17 and the more general number regular expression.
   3540         //
   3541         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
   3542                                         fEX->contains(thisChar) ||
   3543             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
   3544             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
   3545             continue;
   3546         }
   3547 
   3548         // LB 14 Don't break after OP SP*
   3549         //       Scan backwards, checking for this sequence.
   3550         //       The OP char could include combining marks, so we actually check for
   3551         //           OP CM* SP*
   3552         //       Another Twist: The Rule 67 fixes may have changed a SP CM
   3553         //       sequence into a ID char, so before scanning back through spaces,
   3554         //       verify that prevChar is indeed a space.  The prevChar variable
   3555         //       may differ from fText[prevPos]
   3556         tPos = prevPos;
   3557         if (fSP->contains(prevChar)) {
   3558             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3559                 tPos=fText->moveIndex32(tPos, -1);
   3560             }
   3561         }
   3562         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3563             tPos=fText->moveIndex32(tPos, -1);
   3564         }
   3565         if (fOP->contains(fText->char32At(tPos))) {
   3566             continue;
   3567         }
   3568 
   3569 
   3570         // LB 15    QU SP* x OP
   3571         if (fOP->contains(thisChar)) {
   3572             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
   3573             int tPos = prevPos;
   3574             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3575                 tPos = fText->moveIndex32(tPos, -1);
   3576             }
   3577             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3578                 tPos = fText->moveIndex32(tPos, -1);
   3579             }
   3580             if (fQU->contains(fText->char32At(tPos))) {
   3581                 continue;
   3582             }
   3583         }
   3584 
   3585 
   3586 
   3587         // LB 16   CL SP* x NS
   3588         //    Scan backwards for SP* CM* CL
   3589         if (fNS->contains(thisChar)) {
   3590             int tPos = prevPos;
   3591             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
   3592                 tPos = fText->moveIndex32(tPos, -1);
   3593             }
   3594             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
   3595                 tPos = fText->moveIndex32(tPos, -1);
   3596             }
   3597             if (fCL->contains(fText->char32At(tPos))) {
   3598                 continue;
   3599             }
   3600         }
   3601 
   3602 
   3603         // LB 17        B2 SP* x B2
   3604         if (fB2->contains(thisChar)) {
   3605             //  Scan backwards, checking for the B2 CM* SP* sequence.
   3606             tPos = prevPos;
   3607             if (fSP->contains(prevChar)) {
   3608                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
   3609                     tPos=fText->moveIndex32(tPos, -1);
   3610                 }
   3611             }
   3612             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
   3613                 tPos=fText->moveIndex32(tPos, -1);
   3614             }
   3615             if (fB2->contains(fText->char32At(tPos))) {
   3616                 continue;
   3617             }
   3618         }
   3619 
   3620 
   3621         // LB 18    break after space
   3622         if (fSP->contains(prevChar)) {
   3623             break;
   3624         }
   3625 
   3626         // LB 19
   3627         //    x   QU
   3628         //    QU  x
   3629         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
   3630             continue;
   3631         }
   3632 
   3633         // LB 20  Break around a CB
   3634         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
   3635             break;
   3636         }
   3637 
   3638         // LB 21
   3639         if (fBA->contains(thisChar) ||
   3640             fHY->contains(thisChar) ||
   3641             fNS->contains(thisChar) ||
   3642             fBB->contains(prevChar) )   {
   3643             continue;
   3644         }
   3645 
   3646         // LB 22
   3647         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
   3648             fID->contains(prevChar) && fIN->contains(thisChar) ||
   3649             fIN->contains(prevChar) && fIN->contains(thisChar) ||
   3650             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
   3651             continue;
   3652         }
   3653 
   3654 
   3655         // LB 23    ID x PO
   3656         //          AL x NU
   3657         //          NU x AL
   3658         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
   3659             fAL->contains(prevChar) && fNU->contains(thisChar) ||
   3660             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
   3661             continue;
   3662         }
   3663 
   3664         // LB 24  Do not break between prefix and letters or ideographs.
   3665         //        PR x ID
   3666         //        PR x AL
   3667         //        PO x AL
   3668         if (fPR->contains(prevChar) && fID->contains(thisChar) ||
   3669             fPR->contains(prevChar) && fAL->contains(thisChar) ||
   3670             fPO->contains(prevChar) && fAL->contains(thisChar) )   {
   3671             continue;
   3672         }
   3673 
   3674 
   3675 
   3676         // LB 25    Numbers
   3677         if (fNumberMatcher->lookingAt(prevPos, status)) {
   3678             if (U_FAILURE(status)) {
   3679                 break;
   3680             }
   3681             // Matched a number.  But could have been just a single digit, which would
   3682             //    not represent a "no break here" between prevChar and thisChar
   3683             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
   3684             if (numEndIdx > pos) {
   3685                 // Number match includes at least our two chars being checked
   3686                 if (numEndIdx > nextPos) {
   3687                     // Number match includes additional chars.  Update pos and nextPos
   3688                     //   so that next loop iteration will continue at the end of the number,
   3689                     //   checking for breaks between last char in number & whatever follows.
   3690                     pos = nextPos = numEndIdx;
   3691                     do {
   3692                         pos = fText->moveIndex32(pos, -1);
   3693                         thisChar = fText->char32At(pos);
   3694                     } while (fCM->contains(thisChar));
   3695                 }
   3696                 continue;
   3697             }
   3698         }
   3699 
   3700 
   3701         // LB 26 Do not break a Korean syllable.
   3702         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
   3703                                         fJV->contains(thisChar) ||
   3704                                         fH2->contains(thisChar) ||
   3705                                         fH3->contains(thisChar))) {
   3706                                             continue;
   3707                                         }
   3708 
   3709         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
   3710             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
   3711                 continue;
   3712         }
   3713 
   3714         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
   3715             fJT->contains(thisChar)) {
   3716                 continue;
   3717         }
   3718 
   3719         // LB 27 Treat a Korean Syllable Block the same as ID.
   3720         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3721             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3722             fIN->contains(thisChar)) {
   3723                 continue;
   3724             }
   3725         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
   3726             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
   3727             fPO->contains(thisChar)) {
   3728                 continue;
   3729             }
   3730         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
   3731             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
   3732                 continue;
   3733             }
   3734 
   3735 
   3736 
   3737         // LB 28  Do not break between alphabetics ("at").
   3738         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
   3739             continue;
   3740         }
   3741 
   3742         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
   3743         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
   3744             continue;
   3745         }
   3746 
   3747         // LB 31    Break everywhere else
   3748         break;
   3749 
   3750     }
   3751 
   3752     return pos;
   3753 }
   3754 
   3755 
   3756 UVector  *RBBILineMonkey::charClasses() {
   3757     return fSets;
   3758 }
   3759 
   3760 
   3761 RBBILineMonkey::~RBBILineMonkey() {
   3762     delete fSets;
   3763 
   3764     delete fBK;
   3765     delete fCR;
   3766     delete fLF;
   3767     delete fCM;
   3768     delete fNL;
   3769     delete fWJ;
   3770     delete fZW;
   3771     delete fGL;
   3772     delete fCB;
   3773     delete fSP;
   3774     delete fB2;
   3775     delete fBA;
   3776     delete fBB;
   3777     delete fHY;
   3778     delete fH2;
   3779     delete fH3;
   3780     delete fCL;
   3781     delete fEX;
   3782     delete fIN;
   3783     delete fJL;
   3784     delete fJV;
   3785     delete fJT;
   3786     delete fNS;
   3787     delete fOP;
   3788     delete fQU;
   3789     delete fIS;
   3790     delete fNU;
   3791     delete fPO;
   3792     delete fPR;
   3793     delete fSY;
   3794     delete fAI;
   3795     delete fAL;
   3796     delete fID;
   3797     delete fSA;
   3798     delete fSG;
   3799     delete fXX;
   3800 
   3801     delete fCharBI;
   3802     delete fNumberMatcher;
   3803 }
   3804 
   3805 
   3806 //-------------------------------------------------------------------------------------------
   3807 //
   3808 //   TestMonkey
   3809 //
   3810 //     params
   3811 //       seed=nnnnn        Random number starting seed.
   3812 //                         Setting the seed allows errors to be reproduced.
   3813 //       loop=nnn          Looping count.  Controls running time.
   3814 //                         -1:  run forever.
   3815 //                          0 or greater:  run length.
   3816 //
   3817 //       type = char | word | line | sent | title
   3818 //
   3819 //-------------------------------------------------------------------------------------------
   3820 
   3821 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
   3822     int32_t val = defaultVal;
   3823     name.append(" *= *(-?\\d+)");
   3824     UErrorCode status = U_ZERO_ERROR;
   3825     RegexMatcher m(name, params, 0, status);
   3826     if (m.find()) {
   3827         // The param exists.  Convert the string to an int.
   3828         char valString[100];
   3829         int32_t paramLength = m.end(1, status) - m.start(1, status);
   3830         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
   3831             paramLength = (int32_t)(sizeof(valString)-2);
   3832         }
   3833         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
   3834         val = strtol(valString,  NULL, 10);
   3835 
   3836         // Delete this parameter from the params string.
   3837         m.reset();
   3838         params = m.replaceFirst("", status);
   3839     }
   3840     U_ASSERT(U_SUCCESS(status));
   3841     return val;
   3842 }
   3843 #endif
   3844 
   3845 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
   3846                                     BreakIterator *bi,
   3847                                     int expected[],
   3848                                     int expectedcount)
   3849 {
   3850     int count = 0;
   3851     int i = 0;
   3852     int forward[50];
   3853     bi->setText(ustr);
   3854     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   3855         forward[count] = i;
   3856         if (count < expectedcount && expected[count] != i) {
   3857             test->errln("break forward test failed: expected %d but got %d",
   3858                         expected[count], i);
   3859             break;
   3860         }
   3861         count ++;
   3862     }
   3863     if (count != expectedcount) {
   3864         printStringBreaks(ustr, expected, expectedcount);
   3865         test->errln("break forward test failed: missed %d match",
   3866                     expectedcount - count);
   3867         return;
   3868     }
   3869     // testing boundaries
   3870     for (i = 1; i < expectedcount; i ++) {
   3871         int j = expected[i - 1];
   3872         if (!bi->isBoundary(j)) {
   3873             printStringBreaks(ustr, expected, expectedcount);
   3874             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
   3875             return;
   3876         }
   3877         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
   3878             if (bi->isBoundary(j)) {
   3879                 printStringBreaks(ustr, expected, expectedcount);
   3880                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
   3881                 return;
   3882             }
   3883         }
   3884     }
   3885 
   3886     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3887         count --;
   3888         if (forward[count] != i) {
   3889             test->errln("happy break test previous() failed: expected %d but got %d",
   3890                         forward[count], i);
   3891             break;
   3892         }
   3893     }
   3894     if (count != 0) {
   3895         printStringBreaks(ustr, expected, expectedcount);
   3896         test->errln("break test previous() failed: missed a match");
   3897         return;
   3898     }
   3899 
   3900     // testing preceding
   3901     for (i = 0; i < expectedcount - 1; i ++) {
   3902         // int j = expected[i] + 1;
   3903         int j = ustr.moveIndex32(expected[i], 1);
   3904         for (; j <= expected[i + 1]; j ++) {
   3905             if (bi->preceding(j) != expected[i]) {
   3906                 printStringBreaks(ustr, expected, expectedcount);
   3907                 test->errln("preceding(): Not expecting boundary at position %d", j);
   3908                 return;
   3909             }
   3910         }
   3911     }
   3912 }
   3913 
   3914 void RBBITest::TestWordBreaks(void)
   3915 {
   3916 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   3917 
   3918     Locale        locale("en");
   3919     UErrorCode    status = U_ZERO_ERROR;
   3920     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3921     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3922     static const char *strlist[] =
   3923     {
   3924     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3925     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
   3926     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3927     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3928     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
   3929     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3930     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3931     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
   3932     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3933     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3934     "\\u2027\\U000e0067\\u0a47\\u00b7",
   3935     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3936     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3937     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3938     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3939     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3940     "\\u0027\\u11af\\U000e0057\\u0602",
   3941     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3942     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   3943     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3944     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3945     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3946     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3947     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3948     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3949     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3950     "\\u58f4\\U000e0049\\u20e7\\u2027",
   3951     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3952     "\\ua183\\u102d\\u0bec\\u003a",
   3953     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3954     "\\u003a\\u0e57\\u0fad\\u002e",
   3955     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   3956     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   3957     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3958     "\\u003a\\u0664\\u00b7\\u1fba",
   3959     "\\u003b\\u0027\\u00b7\\u47a3",
   3960     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
   3961     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3962     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3963     };
   3964     int loop;
   3965     if (U_FAILURE(status)) {
   3966         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   3967         return;
   3968     }
   3969     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   3970         // printf("looping %d\n", loop);
   3971         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
   3972         // RBBICharMonkey monkey;
   3973         RBBIWordMonkey monkey;
   3974 
   3975         int expected[50];
   3976         int expectedcount = 0;
   3977 
   3978         monkey.setText(ustr);
   3979         int i;
   3980         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   3981             expected[expectedcount ++] = i;
   3982         }
   3983 
   3984         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   3985     }
   3986     delete bi;
   3987 #endif
   3988 }
   3989 
   3990 void RBBITest::TestWordBoundary(void)
   3991 {
   3992     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
   3993     Locale        locale("en");
   3994     UErrorCode    status = U_ZERO_ERROR;
   3995     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3996     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3997     UChar         str[50];
   3998     static const char *strlist[] =
   3999     {
   4000     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   4001     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   4002     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   4003     "\\u2027\\U000e0067\\u0a47\\u00b7",
   4004     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   4005     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   4006     "\\u0589\\U000e006e\\u0a42\\U000104a5",
   4007     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   4008     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   4009     "\\u0027\\u11af\\U000e0057\\u0602",
   4010     "\\U0001d7f2\\U000e007\\u0004\\u0589",
   4011     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
   4012     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   4013     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   4014     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   4015     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   4016     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   4017     "\\u0233\\U000e0020\\u0a69\\u0d6a",
   4018     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   4019     "\\u58f4\\U000e0049\\u20e7\\u2027",
   4020     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   4021     "\\ua183\\u102d\\u0bec\\u003a",
   4022     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   4023     "\\u003a\\u0e57\\u0fad\\u002e",
   4024     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
   4025     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
   4026     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
   4027     "\\u003a\\u0664\\u00b7\\u1fba",
   4028     "\\u003b\\u0027\\u00b7\\u47a3",
   4029     };
   4030     int loop;
   4031     if (U_FAILURE(status)) {
   4032         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4033         return;
   4034     }
   4035     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4036         // printf("looping %d\n", loop);
   4037         u_unescape(strlist[loop], str, 20);
   4038         UnicodeString ustr(str);
   4039         int forward[50];
   4040         int count = 0;
   4041 
   4042         bi->setText(ustr);
   4043         int prev = 0;
   4044         int i;
   4045         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
   4046             forward[count ++] = i;
   4047             if (i > prev) {
   4048                 int j;
   4049                 for (j = prev + 1; j < i; j ++) {
   4050                     if (bi->isBoundary(j)) {
   4051                         printStringBreaks(ustr, forward, count);
   4052                         errln("happy boundary test failed: expected %d not a boundary",
   4053                                j);
   4054                         return;
   4055                     }
   4056                 }
   4057             }
   4058             if (!bi->isBoundary(i)) {
   4059                 printStringBreaks(ustr, forward, count);
   4060                 errln("happy boundary test failed: expected %d a boundary",
   4061                        i);
   4062                 return;
   4063             }
   4064             prev = i;
   4065         }
   4066     }
   4067     delete bi;
   4068 }
   4069 
   4070 void RBBITest::TestLineBreaks(void)
   4071 {
   4072 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4073     Locale        locale("en");
   4074     UErrorCode    status = U_ZERO_ERROR;
   4075     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
   4076     const int32_t  STRSIZE = 50;
   4077     UChar         str[STRSIZE];
   4078     static const char *strlist[] =
   4079     {
   4080      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
   4081      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
   4082              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
   4083      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
   4084              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
   4085      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
   4086      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4087      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
   4088      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
   4089      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
   4090      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
   4091      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
   4092      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
   4093      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
   4094      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
   4095      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
   4096      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
   4097      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
   4098      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
   4099      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
   4100      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
   4101      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
   4102      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
   4103      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
   4104      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
   4105      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
   4106      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
   4107      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
   4108      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
   4109      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
   4110      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
   4111      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
   4112      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
   4113      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
   4114      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
   4115      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
   4116      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
   4117      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
   4118      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
   4119      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
   4120      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
   4121      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
   4122          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
   4123          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
   4124          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
   4125      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
   4126          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
   4127     };
   4128     int loop;
   4129     TEST_ASSERT_SUCCESS(status);
   4130     if (U_FAILURE(status)) {
   4131         return;
   4132     }
   4133     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4134         // printf("looping %d\n", loop);
   4135         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
   4136         if (t >= STRSIZE) {
   4137             TEST_ASSERT(FALSE);
   4138             continue;
   4139         }
   4140 
   4141 
   4142         UnicodeString ustr(str);
   4143         RBBILineMonkey monkey;
   4144         if (U_FAILURE(monkey.deferredStatus)) {
   4145             continue;
   4146         }
   4147 
   4148         const int EXPECTEDSIZE = 50;
   4149         int expected[EXPECTEDSIZE];
   4150         int expectedcount = 0;
   4151 
   4152         monkey.setText(ustr);
   4153         int i;
   4154         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4155             if (expectedcount >= EXPECTEDSIZE) {
   4156                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4157                 return;
   4158             }
   4159             expected[expectedcount ++] = i;
   4160         }
   4161 
   4162         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4163     }
   4164     delete bi;
   4165 #endif
   4166 }
   4167 
   4168 void RBBITest::TestSentBreaks(void)
   4169 {
   4170 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4171     Locale        locale("en");
   4172     UErrorCode    status = U_ZERO_ERROR;
   4173     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
   4174     UChar         str[200];
   4175     static const char *strlist[] =
   4176     {
   4177      "Now\ris\nthe\r\ntime\n\rfor\r\r",
   4178      "This\n",
   4179      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
   4180      "\"Sentence ending with a quote.\" Bye.",
   4181      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
   4182      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
   4183      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
   4184      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
   4185      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
   4186      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
   4187      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
   4188              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
   4189              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
   4190              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
   4191      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
   4192              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
   4193              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
   4194              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
   4195              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
   4196              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
   4197     };
   4198     int loop;
   4199     if (U_FAILURE(status)) {
   4200         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
   4201         return;
   4202     }
   4203     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
   4204         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
   4205         UnicodeString ustr(str);
   4206 
   4207         RBBISentMonkey monkey;
   4208         if (U_FAILURE(monkey.deferredStatus)) {
   4209             continue;
   4210         }
   4211 
   4212         const int EXPECTEDSIZE = 50;
   4213         int expected[EXPECTEDSIZE];
   4214         int expectedcount = 0;
   4215 
   4216         monkey.setText(ustr);
   4217         int i;
   4218         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
   4219             if (expectedcount >= EXPECTEDSIZE) {
   4220                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
   4221                 return;
   4222             }
   4223             expected[expectedcount ++] = i;
   4224         }
   4225 
   4226         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
   4227     }
   4228     delete bi;
   4229 #endif
   4230 }
   4231 
   4232 void RBBITest::TestMonkey(char *params) {
   4233 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4234 
   4235     UErrorCode     status    = U_ZERO_ERROR;
   4236     int32_t        loopCount = 500;
   4237     int32_t        seed      = 1;
   4238     UnicodeString  breakType = "all";
   4239     Locale         locale("en");
   4240     UBool          useUText  = FALSE;
   4241 
   4242     if (quick == FALSE) {
   4243         loopCount = 10000;
   4244     }
   4245 
   4246     if (params) {
   4247         UnicodeString p(params);
   4248         loopCount = getIntParam("loop", p, loopCount);
   4249         seed      = getIntParam("seed", p, seed);
   4250 
   4251         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
   4252         if (m.find()) {
   4253             breakType = m.group(1, status);
   4254             m.reset();
   4255             p = m.replaceFirst("", status);
   4256         }
   4257 
   4258         RegexMatcher u(" *utext", p, 0, status);
   4259         if (u.find()) {
   4260             useUText = TRUE;
   4261             u.reset();
   4262             p = u.replaceFirst("", status);
   4263         }
   4264 
   4265 
   4266         // m.reset(p);
   4267         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
   4268             // Each option is stripped out of the option string as it is processed.
   4269             // All options have been checked.  The option string should have been completely emptied..
   4270             char buf[100];
   4271             p.extract(buf, sizeof(buf), NULL, status);
   4272             buf[sizeof(buf)-1] = 0;
   4273             errln("Unrecognized or extra parameter:  %s\n", buf);
   4274             return;
   4275         }
   4276 
   4277     }
   4278 
   4279     if (breakType == "char" || breakType == "all") {
   4280         RBBICharMonkey  m;
   4281         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   4282         if (U_SUCCESS(status)) {
   4283             RunMonkey(bi, m, "char", seed, loopCount, useUText);
   4284             if (breakType == "all" && useUText==FALSE) {
   4285                 // Also run a quick test with UText when "all" is specified
   4286                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
   4287             }
   4288         }
   4289         else {
   4290             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
   4291         }
   4292         delete bi;
   4293     }
   4294 
   4295     if (breakType == "word" || breakType == "all") {
   4296         logln("Word Break Monkey Test");
   4297         RBBIWordMonkey  m;
   4298         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
   4299         if (U_SUCCESS(status)) {
   4300             RunMonkey(bi, m, "word", seed, loopCount, useUText);
   4301         }
   4302         else {
   4303             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
   4304         }
   4305         delete bi;
   4306     }
   4307 
   4308     if (breakType == "line" || breakType == "all") {
   4309         logln("Line Break Monkey Test");
   4310         RBBILineMonkey  m;
   4311         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
   4312         if (loopCount >= 10) {
   4313             loopCount = loopCount / 5;   // Line break runs slower than the others.
   4314         }
   4315         if (U_SUCCESS(status)) {
   4316             RunMonkey(bi, m, "line", seed, loopCount, useUText);
   4317         }
   4318         else {
   4319             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4320         }
   4321         delete bi;
   4322     }
   4323 
   4324     if (breakType == "sent" || breakType == "all"  ) {
   4325         logln("Sentence Break Monkey Test");
   4326         RBBISentMonkey  m;
   4327         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
   4328         if (loopCount >= 10) {
   4329             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
   4330         }
   4331         if (U_SUCCESS(status)) {
   4332             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
   4333         }
   4334         else {
   4335             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
   4336         }
   4337         delete bi;
   4338     }
   4339 
   4340 #endif
   4341 }
   4342 
   4343 //
   4344 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
   4345 //    Parameters:
   4346 //       bi      - the break iterator to use
   4347 //       mk      - MonkeyKind, abstraction for obtaining expected results
   4348 //       name    - Name of test (char, word, etc.) for use in error messages
   4349 //       seed    - Seed for starting random number generator (parameter from user)
   4350 //       numIterations
   4351 //
   4352 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
   4353                          int32_t numIterations, UBool useUText) {
   4354 
   4355 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
   4356 
   4357     const int32_t    TESTSTRINGLEN = 500;
   4358     UnicodeString    testText;
   4359     int32_t          numCharClasses;
   4360     UVector          *chClasses;
   4361     int              expected[TESTSTRINGLEN*2 + 1];
   4362     int              expectedCount = 0;
   4363     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
   4364     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
   4365     char             reverseBreaks[TESTSTRINGLEN*2+1];
   4366     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
   4367     char             followingBreaks[TESTSTRINGLEN*2+1];
   4368     char             precedingBreaks[TESTSTRINGLEN*2+1];
   4369     int              i;
   4370     int              loopCount = 0;
   4371 
   4372     m_seed = seed;
   4373 
   4374     numCharClasses = mk.charClasses()->size();
   4375     chClasses      = mk.charClasses();
   4376 
   4377     // Check for errors that occured during the construction of the MonkeyKind object.
   4378     //  Can't report them where they occured because errln() is a method coming from intlTest,
   4379     //  and is not visible outside of RBBITest :-(
   4380     if (U_FAILURE(mk.deferredStatus)) {
   4381         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
   4382         return;
   4383     }
   4384 
   4385     // Verify that the character classes all have at least one member.
   4386     for (i=0; i<numCharClasses; i++) {
   4387         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
   4388         if (s == NULL || s->size() == 0) {
   4389             errln("Character Class #%d is null or of zero size.", i);
   4390             return;
   4391         }
   4392     }
   4393 
   4394     while (loopCount < numIterations || numIterations == -1) {
   4395         if (numIterations == -1 && loopCount % 10 == 0) {
   4396             // If test is running in an infinite loop, display a periodic tic so
   4397             //   we can tell that it is making progress.
   4398             fprintf(stderr, ".");
   4399         }
   4400         // Save current random number seed, so that we can recreate the random numbers
   4401         //   for this loop iteration in event of an error.
   4402         seed = m_seed;
   4403 
   4404         // Populate a test string with data.
   4405         testText.truncate(0);
   4406         for (i=0; i<TESTSTRINGLEN; i++) {
   4407             int32_t  aClassNum = m_rand() % numCharClasses;
   4408             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
   4409             int32_t   charIdx = m_rand() % classSet->size();
   4410             UChar32   c = classSet->charAt(charIdx);
   4411             if (c < 0) {   // TODO:  deal with sets containing strings.
   4412                 errln("c < 0");
   4413                 break;
   4414             }
   4415             testText.append(c);
   4416         }
   4417 
   4418         // Calculate the expected results for this test string.
   4419         mk.setText(testText);
   4420         memset(expectedBreaks, 0, sizeof(expectedBreaks));
   4421         expectedBreaks[0] = 1;
   4422         int32_t breakPos = 0;
   4423         expectedCount = 0;
   4424         for (;;) {
   4425             breakPos = mk.next(breakPos);
   4426             if (breakPos == -1) {
   4427                 break;
   4428             }
   4429             if (breakPos > testText.length()) {
   4430                 errln("breakPos > testText.length()");
   4431             }
   4432             expectedBreaks[breakPos] = 1;
   4433             U_ASSERT(expectedCount<testText.length());
   4434             expected[expectedCount ++] = breakPos;
   4435         }
   4436 
   4437         // Find the break positions using forward iteration
   4438         memset(forwardBreaks, 0, sizeof(forwardBreaks));
   4439         if (useUText) {
   4440             UErrorCode status = U_ZERO_ERROR;
   4441             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
   4442             // testUText = utext_openUnicodeString(testUText, &testText, &status);
   4443             bi->setText(testUText, status);
   4444             TEST_ASSERT_SUCCESS(status);
   4445             utext_close(testUText);   // The break iterator does a shallow clone of the UText
   4446                                       //  This UText can be closed immediately, so long as the
   4447                                       //  testText string continues to exist.
   4448         } else {
   4449             bi->setText(testText);
   4450         }
   4451 
   4452         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
   4453             if (i < 0 || i > testText.length()) {
   4454                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4455                 break;
   4456             }
   4457             forwardBreaks[i] = 1;
   4458         }
   4459 
   4460         // Find the break positions using reverse iteration
   4461         memset(reverseBreaks, 0, sizeof(reverseBreaks));
   4462         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
   4463             if (i < 0 || i > testText.length()) {
   4464                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
   4465                 break;
   4466             }
   4467             reverseBreaks[i] = 1;
   4468         }
   4469 
   4470         // Find the break positions using isBoundary() tests.
   4471         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
   4472         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
   4473         for (i=0; i<=testText.length(); i++) {
   4474             isBoundaryBreaks[i] = bi->isBoundary(i);
   4475         }
   4476 
   4477 
   4478         // Find the break positions using the following() function.
   4479         // printf(".");
   4480         memset(followingBreaks, 0, sizeof(followingBreaks));
   4481         int32_t   lastBreakPos = 0;
   4482         followingBreaks[0] = 1;
   4483         for (i=0; i<testText.length(); i++) {
   4484             breakPos = bi->following(i);
   4485             if (breakPos <= i ||
   4486                 breakPos < lastBreakPos ||
   4487                 breakPos > testText.length() ||
   4488                 breakPos > lastBreakPos && lastBreakPos > i ) {
   4489                 errln("%s break monkey test: "
   4490                     "Out of range value returned by BreakIterator::following().\n"
   4491                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
   4492                          name, seed, i, breakPos, lastBreakPos);
   4493                 break;
   4494             }
   4495             followingBreaks[breakPos] = 1;
   4496             lastBreakPos = breakPos;
   4497         }
   4498 
   4499         // Find the break positions using the preceding() function.
   4500         memset(precedingBreaks, 0, sizeof(precedingBreaks));
   4501         lastBreakPos = testText.length();
   4502         precedingBreaks[testText.length()] = 1;
   4503         for (i=testText.length(); i>0; i--) {
   4504             breakPos = bi->preceding(i);
   4505             if (breakPos >= i ||
   4506                 breakPos > lastBreakPos ||
   4507                 breakPos < 0 && testText.getChar32Start(i)>0 ||
   4508                 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
   4509                 errln("%s break monkey test: "
   4510                     "Out of range value returned by BreakIterator::preceding().\n"
   4511                     "index=%d;  prev returned %d; lastBreak=%d" ,
   4512                     name,  i, breakPos, lastBreakPos);
   4513                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
   4514                     precedingBreaks[i] = 2;   // Forces an error.
   4515                 }
   4516             } else {
   4517                 if (breakPos >= 0) {
   4518                     precedingBreaks[breakPos] = 1;
   4519                 }
   4520                 lastBreakPos = breakPos;
   4521             }
   4522         }
   4523 
   4524         // Compare the expected and actual results.
   4525         for (i=0; i<=testText.length(); i++) {
   4526             const char *errorType = NULL;
   4527             if  (forwardBreaks[i] != expectedBreaks[i]) {
   4528                 errorType = "next()";
   4529             } else if (reverseBreaks[i] != forwardBreaks[i]) {
   4530                 errorType = "previous()";
   4531             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
   4532                 errorType = "isBoundary()";
   4533             } else if (followingBreaks[i] != expectedBreaks[i]) {
   4534                 errorType = "following()";
   4535             } else if (precedingBreaks[i] != expectedBreaks[i]) {
   4536                 errorType = "preceding()";
   4537             }
   4538 
   4539 
   4540             if (errorType != NULL) {
   4541                 // Format a range of the test text that includes the failure as
   4542                 //  a data item that can be included in the rbbi test data file.
   4543 
   4544                 // Start of the range is the last point where expected and actual results
   4545                 //   both agreed that there was a break position.
   4546                 int startContext = i;
   4547                 int32_t count = 0;
   4548                 for (;;) {
   4549                     if (startContext==0) { break; }
   4550                     startContext --;
   4551                     if (expectedBreaks[startContext] != 0) {
   4552                         if (count == 2) break;
   4553                         count ++;
   4554                     }
   4555                 }
   4556 
   4557                 // End of range is two expected breaks past the start position.
   4558                 int endContext = i + 1;
   4559                 int ci;
   4560                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
   4561                     for (;;) {
   4562                         if (endContext >= testText.length()) {break;}
   4563                         if (expectedBreaks[endContext-1] != 0) {
   4564                             if (count == 0) break;
   4565                             count --;
   4566                         }
   4567                         endContext ++;
   4568                     }
   4569                 }
   4570 
   4571                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
   4572                 UnicodeString errorText = "<data>";
   4573                 /***if (strcmp(errorType, "next()") == 0) {
   4574                     startContext = 0;
   4575                     endContext = testText.length();
   4576 
   4577                     printStringBreaks(testText, expected, expectedCount);
   4578                 }***/
   4579 
   4580                 for (ci=startContext; ci<endContext;) {
   4581                     UnicodeString hexChars("0123456789abcdef");
   4582                     UChar32  c;
   4583                     int      bn;
   4584                     c = testText.char32At(ci);
   4585                     if (ci == i) {
   4586                         // This is the location of the error.
   4587                         errorText.append("<?>");
   4588                     } else if (expectedBreaks[ci] != 0) {
   4589                         // This a non-error expected break position.
   4590                         errorText.append("\\");
   4591                     }
   4592                     if (c < 0x10000) {
   4593                         errorText.append("\\u");
   4594                         for (bn=12; bn>=0; bn-=4) {
   4595                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4596                         }
   4597                     } else {
   4598                         errorText.append("\\U");
   4599                         for (bn=28; bn>=0; bn-=4) {
   4600                             errorText.append(hexChars.charAt((c>>bn)&0xf));
   4601                         }
   4602                     }
   4603                     ci = testText.moveIndex32(ci, 1);
   4604                 }
   4605                 errorText.append("\\");
   4606                 errorText.append("</data>\n");
   4607 
   4608                 // Output the error
   4609                 char  charErrorTxt[500];
   4610                 UErrorCode status = U_ZERO_ERROR;
   4611                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
   4612                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
   4613                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
   4614                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
   4615                     errorType, seed, i, charErrorTxt);
   4616                 break;
   4617             }
   4618         }
   4619 
   4620         loopCount++;
   4621     }
   4622 #endif
   4623 }
   4624 
   4625 //
   4626 //  TestDebug    -  A place-holder test for debugging purposes.
   4627 //                  For putting in fragments of other tests that can be invoked
   4628 //                  for tracing  without a lot of unwanted extra stuff happening.
   4629 //
   4630 void RBBITest::TestDebug(void) {
   4631 #if 0
   4632     UErrorCode   status = U_ZERO_ERROR;
   4633     int pos = 0;
   4634     int ruleStatus = 0;
   4635 
   4636     RuleBasedBreakIterator* bi =
   4637        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
   4638        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
   4639        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
   4640     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
   4641     // UnicodeString s("Aaa.  Bcd");
   4642     s = s.unescape();
   4643     bi->setText(s);
   4644     UBool r = bi->isBoundary(8);
   4645     printf("%s", r?"true":"false");
   4646     return;
   4647     pos = bi->last();
   4648     do {
   4649         // ruleStatus = bi->getRuleStatus();
   4650         printf("%d\t%d\n", pos, ruleStatus);
   4651         pos = bi->previous();
   4652     } while (pos != BreakIterator::DONE);
   4653 #endif
   4654 }
   4655 
   4656 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   4657