Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2010 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "unicode/utypeinfo.h"  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 //-------------------------------------------------------------------------------
     90 //
     91 //   Constructor   from a UDataMemory handle to precompiled break rules
     92 //                 stored in an ICU data file.
     93 //
     94 //-------------------------------------------------------------------------------
     95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
     96 {
     97     init();
     98     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     99     if (U_FAILURE(status)) {return;}
    100     if(fData == 0) {
    101         status = U_MEMORY_ALLOCATION_ERROR;
    102         return;
    103     }
    104 }
    105 
    106 
    107 
    108 //-------------------------------------------------------------------------------
    109 //
    110 //   Constructor       from a set of rules supplied as a string.
    111 //
    112 //-------------------------------------------------------------------------------
    113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    114                                                 UParseError          &parseError,
    115                                                 UErrorCode           &status)
    116 {
    117     init();
    118     if (U_FAILURE(status)) {return;}
    119     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    120         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    121     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    122     //        creates and returns a complete RBBI.  From here, in a constructor, we
    123     //        can't just return the object created by the builder factory, hence
    124     //        the assignment of the factory created object to "this".
    125     if (U_SUCCESS(status)) {
    126         *this = *bi;
    127         delete bi;
    128     }
    129 }
    130 
    131 
    132 //-------------------------------------------------------------------------------
    133 //
    134 // Default Constructor.      Create an empty shell that can be set up later.
    135 //                           Used when creating a RuleBasedBreakIterator from a set
    136 //                           of rules.
    137 //-------------------------------------------------------------------------------
    138 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    139     init();
    140 }
    141 
    142 
    143 //-------------------------------------------------------------------------------
    144 //
    145 //   Copy constructor.  Will produce a break iterator with the same behavior,
    146 //                      and which iterates over the same text, as the one passed in.
    147 //
    148 //-------------------------------------------------------------------------------
    149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    150 : BreakIterator(other)
    151 {
    152     this->init();
    153     *this = other;
    154 }
    155 
    156 
    157 /**
    158  * Destructor
    159  */
    160 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    161     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    162         // fCharIter was adopted from the outside.
    163         delete fCharIter;
    164     }
    165     fCharIter = NULL;
    166     delete fSCharIter;
    167     fCharIter = NULL;
    168     delete fDCharIter;
    169     fDCharIter = NULL;
    170 
    171     utext_close(fText);
    172 
    173     if (fData != NULL) {
    174         fData->removeReference();
    175         fData = NULL;
    176     }
    177     if (fCachedBreakPositions) {
    178         uprv_free(fCachedBreakPositions);
    179         fCachedBreakPositions = NULL;
    180     }
    181     if (fLanguageBreakEngines) {
    182         delete fLanguageBreakEngines;
    183         fLanguageBreakEngines = NULL;
    184     }
    185     if (fUnhandledBreakEngine) {
    186         delete fUnhandledBreakEngine;
    187         fUnhandledBreakEngine = NULL;
    188     }
    189 }
    190 
    191 /**
    192  * Assignment operator.  Sets this iterator to have the same behavior,
    193  * and iterate over the same text, as the one passed in.
    194  */
    195 RuleBasedBreakIterator&
    196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    197     if (this == &that) {
    198         return *this;
    199     }
    200     reset();    // Delete break cache information
    201     fBreakType = that.fBreakType;
    202     if (fLanguageBreakEngines != NULL) {
    203         delete fLanguageBreakEngines;
    204         fLanguageBreakEngines = NULL;   // Just rebuild for now
    205     }
    206     // TODO: clone fLanguageBreakEngines from "that"
    207     UErrorCode status = U_ZERO_ERROR;
    208     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    209 
    210     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    211         delete fCharIter;
    212     }
    213     fCharIter = NULL;
    214 
    215     if (that.fCharIter != NULL ) {
    216         // This is a little bit tricky - it will intially appear that
    217         //  this->fCharIter is adopted, even if that->fCharIter was
    218         //  not adopted.  That's ok.
    219         fCharIter = that.fCharIter->clone();
    220     }
    221 
    222     if (fData != NULL) {
    223         fData->removeReference();
    224         fData = NULL;
    225     }
    226     if (that.fData != NULL) {
    227         fData = that.fData->addReference();
    228     }
    229 
    230     return *this;
    231 }
    232 
    233 
    234 
    235 //-----------------------------------------------------------------------------
    236 //
    237 //    init()      Shared initialization routine.   Used by all the constructors.
    238 //                Initializes all fields, leaving the object in a consistent state.
    239 //
    240 //-----------------------------------------------------------------------------
    241 void RuleBasedBreakIterator::init() {
    242     UErrorCode  status    = U_ZERO_ERROR;
    243     fBufferClone          = FALSE;
    244     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    245     fCharIter             = NULL;
    246     fSCharIter            = NULL;
    247     fDCharIter            = NULL;
    248     fData                 = NULL;
    249     fLastRuleStatusIndex  = 0;
    250     fLastStatusIndexValid = TRUE;
    251     fDictionaryCharCount  = 0;
    252     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    253                                         //   dictionary behavior for Break Iterators that are
    254                                         //   built from rules.  Even better would be the ability to
    255                                         //   declare the type in the rules.
    256 
    257     fCachedBreakPositions    = NULL;
    258     fLanguageBreakEngines    = NULL;
    259     fUnhandledBreakEngine    = NULL;
    260     fNumCachedBreakPositions = 0;
    261     fPositionInCache         = 0;
    262 
    263 #ifdef RBBI_DEBUG
    264     static UBool debugInitDone = FALSE;
    265     if (debugInitDone == FALSE) {
    266         char *debugEnv = getenv("U_RBBIDEBUG");
    267         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    268             fTrace = TRUE;
    269         }
    270         debugInitDone = TRUE;
    271     }
    272 #endif
    273 }
    274 
    275 
    276 
    277 //-----------------------------------------------------------------------------
    278 //
    279 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    280 //            behavior, and iterating over the same text, as this one.
    281 //            Virtual function: does the right thing with subclasses.
    282 //
    283 //-----------------------------------------------------------------------------
    284 BreakIterator*
    285 RuleBasedBreakIterator::clone(void) const {
    286     return new RuleBasedBreakIterator(*this);
    287 }
    288 
    289 /**
    290  * Equality operator.  Returns TRUE if both BreakIterators are of the
    291  * same class, have the same behavior, and iterate over the same text.
    292  */
    293 UBool
    294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    295     if (typeid(*this) != typeid(that)) {
    296         return FALSE;
    297     }
    298 
    299     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    300 
    301     if (!utext_equals(fText, that2.fText)) {
    302         // The two break iterators are operating on different text,
    303         //   or have a different interation position.
    304         return FALSE;
    305     };
    306 
    307     // TODO:  need a check for when in a dictionary region at different offsets.
    308 
    309     if (that2.fData == fData ||
    310         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    311             // The two break iterators are using the same rules.
    312             return TRUE;
    313         }
    314     return FALSE;
    315 }
    316 
    317 /**
    318  * Compute a hash code for this BreakIterator
    319  * @return A hash code
    320  */
    321 int32_t
    322 RuleBasedBreakIterator::hashCode(void) const {
    323     int32_t   hash = 0;
    324     if (fData != NULL) {
    325         hash = fData->hashCode();
    326     }
    327     return hash;
    328 }
    329 
    330 
    331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    332     if (U_FAILURE(status)) {
    333         return;
    334     }
    335     reset();
    336     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    337 
    338     // Set up a dummy CharacterIterator to be returned if anyone
    339     //   calls getText().  With input from UText, there is no reasonable
    340     //   way to return a characterIterator over the actual input text.
    341     //   Return one over an empty string instead - this is the closest
    342     //   we can come to signaling a failure.
    343     //   (GetText() is obsolete, this failure is sort of OK)
    344     if (fDCharIter == NULL) {
    345         static const UChar c = 0;
    346         fDCharIter = new UCharCharacterIterator(&c, 0);
    347         if (fDCharIter == NULL) {
    348             status = U_MEMORY_ALLOCATION_ERROR;
    349             return;
    350         }
    351     }
    352 
    353     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    354         // existing fCharIter was adopted from the outside.  Delete it now.
    355         delete fCharIter;
    356     }
    357     fCharIter = fDCharIter;
    358 
    359     this->first();
    360 }
    361 
    362 
    363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    364     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    365     return result;
    366 }
    367 
    368 
    369 
    370 /**
    371  * Returns the description used to create this iterator
    372  */
    373 const UnicodeString&
    374 RuleBasedBreakIterator::getRules() const {
    375     if (fData != NULL) {
    376         return fData->getRuleSourceString();
    377     } else {
    378         static const UnicodeString *s;
    379         if (s == NULL) {
    380             // TODO:  something more elegant here.
    381             //        perhaps API should return the string by value.
    382             //        Note:  thread unsafe init & leak are semi-ok, better than
    383             //               what was before.  Sould be cleaned up, though.
    384             s = new UnicodeString;
    385         }
    386         return *s;
    387     }
    388 }
    389 
    390 //=======================================================================
    391 // BreakIterator overrides
    392 //=======================================================================
    393 
    394 /**
    395  * Return a CharacterIterator over the text being analyzed.
    396  */
    397 CharacterIterator&
    398 RuleBasedBreakIterator::getText() const {
    399     return *fCharIter;
    400 }
    401 
    402 /**
    403  * Set the iterator to analyze a new piece of text.  This function resets
    404  * the current iteration position to the beginning of the text.
    405  * @param newText An iterator over the text to analyze.
    406  */
    407 void
    408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    409     // If we are holding a CharacterIterator adopted from a
    410     //   previous call to this function, delete it now.
    411     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    412         delete fCharIter;
    413     }
    414 
    415     fCharIter = newText;
    416     UErrorCode status = U_ZERO_ERROR;
    417     reset();
    418     if (newText==NULL || newText->startIndex() != 0) {
    419         // startIndex !=0 wants to be an error, but there's no way to report it.
    420         // Make the iterator text be an empty string.
    421         fText = utext_openUChars(fText, NULL, 0, &status);
    422     } else {
    423         fText = utext_openCharacterIterator(fText, newText, &status);
    424     }
    425     this->first();
    426 }
    427 
    428 /**
    429  * Set the iterator to analyze a new piece of text.  This function resets
    430  * the current iteration position to the beginning of the text.
    431  * @param newText An iterator over the text to analyze.
    432  */
    433 void
    434 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    435     UErrorCode status = U_ZERO_ERROR;
    436     reset();
    437     fText = utext_openConstUnicodeString(fText, &newText, &status);
    438 
    439     // Set up a character iterator on the string.
    440     //   Needed in case someone calls getText().
    441     //  Can not, unfortunately, do this lazily on the (probably never)
    442     //  call to getText(), because getText is const.
    443     if (fSCharIter == NULL) {
    444         fSCharIter = new StringCharacterIterator(newText);
    445     } else {
    446         fSCharIter->setText(newText);
    447     }
    448 
    449     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    450         // old fCharIter was adopted from the outside.  Delete it.
    451         delete fCharIter;
    452     }
    453     fCharIter = fSCharIter;
    454 
    455     this->first();
    456 }
    457 
    458 
    459 
    460 /**
    461  * Sets the current iteration position to the beginning of the text.
    462  * @return The offset of the beginning of the text.
    463  */
    464 int32_t RuleBasedBreakIterator::first(void) {
    465     reset();
    466     fLastRuleStatusIndex  = 0;
    467     fLastStatusIndexValid = TRUE;
    468     //if (fText == NULL)
    469     //    return BreakIterator::DONE;
    470 
    471     utext_setNativeIndex(fText, 0);
    472     return 0;
    473 }
    474 
    475 /**
    476  * Sets the current iteration position to the end of the text.
    477  * @return The text's past-the-end offset.
    478  */
    479 int32_t RuleBasedBreakIterator::last(void) {
    480     reset();
    481     if (fText == NULL) {
    482         fLastRuleStatusIndex  = 0;
    483         fLastStatusIndexValid = TRUE;
    484         return BreakIterator::DONE;
    485     }
    486 
    487     fLastStatusIndexValid = FALSE;
    488     int32_t pos = (int32_t)utext_nativeLength(fText);
    489     utext_setNativeIndex(fText, pos);
    490     return pos;
    491 }
    492 
    493 /**
    494  * Advances the iterator either forward or backward the specified number of steps.
    495  * Negative values move backward, and positive values move forward.  This is
    496  * equivalent to repeatedly calling next() or previous().
    497  * @param n The number of steps to move.  The sign indicates the direction
    498  * (negative is backwards, and positive is forwards).
    499  * @return The character offset of the boundary position n boundaries away from
    500  * the current one.
    501  */
    502 int32_t RuleBasedBreakIterator::next(int32_t n) {
    503     int32_t result = current();
    504     while (n > 0) {
    505         result = next();
    506         --n;
    507     }
    508     while (n < 0) {
    509         result = previous();
    510         ++n;
    511     }
    512     return result;
    513 }
    514 
    515 /**
    516  * Advances the iterator to the next boundary position.
    517  * @return The position of the first boundary after this one.
    518  */
    519 int32_t RuleBasedBreakIterator::next(void) {
    520     // if we have cached break positions and we're still in the range
    521     // covered by them, just move one step forward in the cache
    522     if (fCachedBreakPositions != NULL) {
    523         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    524             ++fPositionInCache;
    525             int32_t pos = fCachedBreakPositions[fPositionInCache];
    526             utext_setNativeIndex(fText, pos);
    527             return pos;
    528         }
    529         else {
    530             reset();
    531         }
    532     }
    533 
    534     int32_t startPos = current();
    535     int32_t result = handleNext(fData->fForwardTable);
    536     if (fDictionaryCharCount > 0) {
    537         result = checkDictionary(startPos, result, FALSE);
    538     }
    539     return result;
    540 }
    541 
    542 /**
    543  * Advances the iterator backwards, to the last boundary preceding this one.
    544  * @return The position of the last boundary position preceding this one.
    545  */
    546 int32_t RuleBasedBreakIterator::previous(void) {
    547     int32_t result;
    548     int32_t startPos;
    549 
    550     // if we have cached break positions and we're still in the range
    551     // covered by them, just move one step backward in the cache
    552     if (fCachedBreakPositions != NULL) {
    553         if (fPositionInCache > 0) {
    554             --fPositionInCache;
    555             // If we're at the beginning of the cache, need to reevaluate the
    556             // rule status
    557             if (fPositionInCache <= 0) {
    558                 fLastStatusIndexValid = FALSE;
    559             }
    560             int32_t pos = fCachedBreakPositions[fPositionInCache];
    561             utext_setNativeIndex(fText, pos);
    562             return pos;
    563         }
    564         else {
    565             reset();
    566         }
    567     }
    568 
    569     // if we're already sitting at the beginning of the text, return DONE
    570     if (fText == NULL || (startPos = current()) == 0) {
    571         fLastRuleStatusIndex  = 0;
    572         fLastStatusIndexValid = TRUE;
    573         return BreakIterator::DONE;
    574     }
    575 
    576     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    577         result = handlePrevious(fData->fReverseTable);
    578         if (fDictionaryCharCount > 0) {
    579             result = checkDictionary(result, startPos, TRUE);
    580         }
    581         return result;
    582     }
    583 
    584     // old rule syntax
    585     // set things up.  handlePrevious() will back us up to some valid
    586     // break position before the current position (we back our internal
    587     // iterator up one step to prevent handlePrevious() from returning
    588     // the current position), but not necessarily the last one before
    589 
    590     // where we started
    591 
    592     int32_t start = current();
    593 
    594     UTEXT_PREVIOUS32(fText);
    595     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    596     if (lastResult == UBRK_DONE) {
    597         lastResult = 0;
    598         utext_setNativeIndex(fText, 0);
    599     }
    600     result = lastResult;
    601     int32_t lastTag       = 0;
    602     UBool   breakTagValid = FALSE;
    603 
    604     // iterate forward from the known break position until we pass our
    605     // starting point.  The last break position before the starting
    606     // point is our return value
    607 
    608     for (;;) {
    609         result         = next();
    610         if (result == BreakIterator::DONE || result >= start) {
    611             break;
    612         }
    613         lastResult     = result;
    614         lastTag        = fLastRuleStatusIndex;
    615         breakTagValid  = TRUE;
    616     }
    617 
    618     // fLastBreakTag wants to have the value for section of text preceding
    619     // the result position that we are to return (in lastResult.)  If
    620     // the backwards rules overshot and the above loop had to do two or more
    621     // next()s to move up to the desired return position, we will have a valid
    622     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    623     // we wont have a tag value for that position, which is only set by handleNext().
    624 
    625     // set the current iteration position to be the last break position
    626     // before where we started, and then return that value
    627     utext_setNativeIndex(fText, lastResult);
    628     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    629     fLastStatusIndexValid = breakTagValid;
    630 
    631     // No need to check the dictionary; it will have been handled by
    632     // next()
    633 
    634     return lastResult;
    635 }
    636 
    637 /**
    638  * Sets the iterator to refer to the first boundary position following
    639  * the specified position.
    640  * @offset The position from which to begin searching for a break position.
    641  * @return The position of the first break after the current position.
    642  */
    643 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    644     // if we have cached break positions and offset is in the range
    645     // covered by them, use them
    646     // TODO: could use binary search
    647     // TODO: what if offset is outside range, but break is not?
    648     if (fCachedBreakPositions != NULL) {
    649         if (offset >= fCachedBreakPositions[0]
    650                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    651             fPositionInCache = 0;
    652             // We are guaranteed not to leave the array due to range test above
    653             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    654                 ++fPositionInCache;
    655             }
    656             int32_t pos = fCachedBreakPositions[fPositionInCache];
    657             utext_setNativeIndex(fText, pos);
    658             return pos;
    659         }
    660         else {
    661             reset();
    662         }
    663     }
    664 
    665     // if the offset passed in is already past the end of the text,
    666     // just return DONE; if it's before the beginning, return the
    667     // text's starting offset
    668     fLastRuleStatusIndex  = 0;
    669     fLastStatusIndexValid = TRUE;
    670     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    671         last();
    672         return next();
    673     }
    674     else if (offset < 0) {
    675         return first();
    676     }
    677 
    678     // otherwise, set our internal iteration position (temporarily)
    679     // to the position passed in.  If this is the _beginning_ position,
    680     // then we can just use next() to get our return value
    681 
    682     int32_t result = 0;
    683 
    684     if (fData->fSafeRevTable != NULL) {
    685         // new rule syntax
    686         utext_setNativeIndex(fText, offset);
    687         // move forward one codepoint to prepare for moving back to a
    688         // safe point.
    689         // this handles offset being between a supplementary character
    690         UTEXT_NEXT32(fText);
    691         // handlePrevious will move most of the time to < 1 boundary away
    692         handlePrevious(fData->fSafeRevTable);
    693         int32_t result = next();
    694         while (result <= offset) {
    695             result = next();
    696         }
    697         return result;
    698     }
    699     if (fData->fSafeFwdTable != NULL) {
    700         // backup plan if forward safe table is not available
    701         utext_setNativeIndex(fText, offset);
    702         UTEXT_PREVIOUS32(fText);
    703         // handle next will give result >= offset
    704         handleNext(fData->fSafeFwdTable);
    705         // previous will give result 0 or 1 boundary away from offset,
    706         // most of the time
    707         // we have to
    708         int32_t oldresult = previous();
    709         while (oldresult > offset) {
    710             int32_t result = previous();
    711             if (result <= offset) {
    712                 return oldresult;
    713             }
    714             oldresult = result;
    715         }
    716         int32_t result = next();
    717         if (result <= offset) {
    718             return next();
    719         }
    720         return result;
    721     }
    722     // otherwise, we have to sync up first.  Use handlePrevious() to back
    723     // up to a known break position before the specified position (if
    724     // we can determine that the specified position is a break position,
    725     // we don't back up at all).  This may or may not be the last break
    726     // position at or before our starting position.  Advance forward
    727     // from here until we've passed the starting position.  The position
    728     // we stop on will be the first break position after the specified one.
    729     // old rule syntax
    730 
    731     utext_setNativeIndex(fText, offset);
    732     if (offset==0 ||
    733         (offset==1  && utext_getNativeIndex(fText)==0)) {
    734         return next();
    735     }
    736     result = previous();
    737 
    738     while (result != BreakIterator::DONE && result <= offset) {
    739         result = next();
    740     }
    741 
    742     return result;
    743 }
    744 
    745 /**
    746  * Sets the iterator to refer to the last boundary position before the
    747  * specified position.
    748  * @offset The position to begin searching for a break from.
    749  * @return The position of the last boundary before the starting position.
    750  */
    751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    752     // if we have cached break positions and offset is in the range
    753     // covered by them, use them
    754     if (fCachedBreakPositions != NULL) {
    755         // TODO: binary search?
    756         // TODO: What if offset is outside range, but break is not?
    757         if (offset > fCachedBreakPositions[0]
    758                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    759             fPositionInCache = 0;
    760             while (fPositionInCache < fNumCachedBreakPositions
    761                    && offset > fCachedBreakPositions[fPositionInCache])
    762                 ++fPositionInCache;
    763             --fPositionInCache;
    764             // If we're at the beginning of the cache, need to reevaluate the
    765             // rule status
    766             if (fPositionInCache <= 0) {
    767                 fLastStatusIndexValid = FALSE;
    768             }
    769             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    770             return fCachedBreakPositions[fPositionInCache];
    771         }
    772         else {
    773             reset();
    774         }
    775     }
    776 
    777     // if the offset passed in is already past the end of the text,
    778     // just return DONE; if it's before the beginning, return the
    779     // text's starting offset
    780     if (fText == NULL || offset > utext_nativeLength(fText)) {
    781         // return BreakIterator::DONE;
    782         return last();
    783     }
    784     else if (offset < 0) {
    785         return first();
    786     }
    787 
    788     // if we start by updating the current iteration position to the
    789     // position specified by the caller, we can just use previous()
    790     // to carry out this operation
    791 
    792     if (fData->fSafeFwdTable != NULL) {
    793         // new rule syntax
    794         utext_setNativeIndex(fText, offset);
    795         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    796         if (newOffset != offset) {
    797             // Will come here if specified offset was not a code point boundary AND
    798             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    799             //   indices to the containing code point.
    800             // For breakitereator::preceding only, these non-code-point indices need to be moved
    801             //   up to refer to the following codepoint.
    802             UTEXT_NEXT32(fText);
    803             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    804         }
    805 
    806         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    807         //        rather than adjusting the position unconditionally?
    808         //        (Change would interact with safe rules.)
    809         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    810         //        affects only preceding(), seems cleaner, but is slightly different.
    811         UTEXT_PREVIOUS32(fText);
    812         handleNext(fData->fSafeFwdTable);
    813         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    814         while (result >= offset) {
    815             result = previous();
    816         }
    817         return result;
    818     }
    819     if (fData->fSafeRevTable != NULL) {
    820         // backup plan if forward safe table is not available
    821         //  TODO:  check whether this path can be discarded
    822         //         It's probably OK to say that rules must supply both safe tables
    823         //            if they use safe tables at all.  We have certainly never described
    824         //            to anyone how to work with just one safe table.
    825         utext_setNativeIndex(fText, offset);
    826         UTEXT_NEXT32(fText);
    827 
    828         // handle previous will give result <= offset
    829         handlePrevious(fData->fSafeRevTable);
    830 
    831         // next will give result 0 or 1 boundary away from offset,
    832         // most of the time
    833         // we have to
    834         int32_t oldresult = next();
    835         while (oldresult < offset) {
    836             int32_t result = next();
    837             if (result >= offset) {
    838                 return oldresult;
    839             }
    840             oldresult = result;
    841         }
    842         int32_t result = previous();
    843         if (result >= offset) {
    844             return previous();
    845         }
    846         return result;
    847     }
    848 
    849     // old rule syntax
    850     utext_setNativeIndex(fText, offset);
    851     return previous();
    852 }
    853 
    854 /**
    855  * Returns true if the specfied position is a boundary position.  As a side
    856  * effect, leaves the iterator pointing to the first boundary position at
    857  * or after "offset".
    858  * @param offset the offset to check.
    859  * @return True if "offset" is a boundary position.
    860  */
    861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    862     // the beginning index of the iterator is always a boundary position by definition
    863     if (offset == 0) {
    864         first();       // For side effects on current position, tag values.
    865         return TRUE;
    866     }
    867 
    868     if (offset == (int32_t)utext_nativeLength(fText)) {
    869         last();       // For side effects on current position, tag values.
    870         return TRUE;
    871     }
    872 
    873     // out-of-range indexes are never boundary positions
    874     if (offset < 0) {
    875         first();       // For side effects on current position, tag values.
    876         return FALSE;
    877     }
    878 
    879     if (offset > utext_nativeLength(fText)) {
    880         last();        // For side effects on current position, tag values.
    881         return FALSE;
    882     }
    883 
    884     // otherwise, we can use following() on the position before the specified
    885     // one and return true if the position we get back is the one the user
    886     // specified
    887     utext_previous32From(fText, offset);
    888     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    889     UBool    result  = following(backOne) == offset;
    890     return result;
    891 }
    892 
    893 /**
    894  * Returns the current iteration position.
    895  * @return The current iteration position.
    896  */
    897 int32_t RuleBasedBreakIterator::current(void) const {
    898     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    899     return pos;
    900 }
    901 
    902 //=======================================================================
    903 // implementation
    904 //=======================================================================
    905 
    906 //
    907 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    908 //                 of user text.  A variable with this enum type keeps track of where we
    909 //                 are.  The state machine only fetches user input while in the RUN mode.
    910 //
    911 enum RBBIRunMode {
    912     RBBI_START,     // state machine processing is before first char of input
    913     RBBI_RUN,       // state machine processing is in the user text
    914     RBBI_END        // state machine processing is after end of user text.
    915 };
    916 
    917 
    918 //-----------------------------------------------------------------------------------
    919 //
    920 //  handleNext(stateTable)
    921 //     This method is the actual implementation of the rbbi next() method.
    922 //     This method initializes the state machine to state 1
    923 //     and advances through the text character by character until we reach the end
    924 //     of the text or the state machine transitions to state 0.  We update our return
    925 //     value every time the state machine passes through an accepting state.
    926 //
    927 //-----------------------------------------------------------------------------------
    928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    929     int32_t             state;
    930     int16_t             category        = 0;
    931     RBBIRunMode         mode;
    932 
    933     RBBIStateTableRow  *row;
    934     UChar32             c;
    935     int32_t             lookaheadStatus = 0;
    936     int32_t             lookaheadTagIdx = 0;
    937     int32_t             result          = 0;
    938     int32_t             initialPosition = 0;
    939     int32_t             lookaheadResult = 0;
    940     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    941     const char         *tableData       = statetable->fTableData;
    942     uint32_t            tableRowLen     = statetable->fRowLen;
    943 
    944     #ifdef RBBI_DEBUG
    945         if (fTrace) {
    946             RBBIDebugPuts("Handle Next   pos   char  state category");
    947         }
    948     #endif
    949 
    950     // No matter what, handleNext alway correctly sets the break tag value.
    951     fLastStatusIndexValid = TRUE;
    952     fLastRuleStatusIndex = 0;
    953 
    954     // if we're already at the end of the text, return DONE.
    955     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    956     result          = initialPosition;
    957     c               = UTEXT_NEXT32(fText);
    958     if (fData == NULL || c==U_SENTINEL) {
    959         return BreakIterator::DONE;
    960     }
    961 
    962     //  Set the initial state for the state machine
    963     state = START_STATE;
    964     row = (RBBIStateTableRow *)
    965             //(statetable->fTableData + (statetable->fRowLen * state));
    966             (tableData + tableRowLen * state);
    967 
    968 
    969     mode     = RBBI_RUN;
    970     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    971         category = 2;
    972         mode     = RBBI_START;
    973     }
    974 
    975 
    976     // loop until we reach the end of the text or transition to state 0
    977     //
    978     for (;;) {
    979         if (c == U_SENTINEL) {
    980             // Reached end of input string.
    981             if (mode == RBBI_END) {
    982                 // We have already run the loop one last time with the
    983                 //   character set to the psueudo {eof} value.  Now it is time
    984                 //   to unconditionally bail out.
    985                 if (lookaheadResult > result) {
    986                     // We ran off the end of the string with a pending look-ahead match.
    987                     // Treat this as if the look-ahead condition had been met, and return
    988                     //  the match at the / position from the look-ahead rule.
    989                     result               = lookaheadResult;
    990                     fLastRuleStatusIndex = lookaheadTagIdx;
    991                     lookaheadStatus = 0;
    992                 }
    993                 break;
    994             }
    995             // Run the loop one last time with the fake end-of-input character category.
    996             mode = RBBI_END;
    997             category = 1;
    998         }
    999 
   1000         //
   1001         // Get the char category.  An incoming category of 1 or 2 means that
   1002         //      we are preset for doing the beginning or end of input, and
   1003         //      that we shouldn't get a category from an actual text input character.
   1004         //
   1005         if (mode == RBBI_RUN) {
   1006             // look up the current character's character category, which tells us
   1007             // which column in the state table to look at.
   1008             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1009             //        not the size of the character going in, which is a UChar32.
   1010             //
   1011             UTRIE_GET16(&fData->fTrie, c, category);
   1012 
   1013             // Check the dictionary bit in the character's category.
   1014             //    Counter is only used by dictionary based iterators (subclasses).
   1015             //    Chars that need to be handled by a dictionary have a flag bit set
   1016             //    in their category values.
   1017             //
   1018             if ((category & 0x4000) != 0)  {
   1019                 fDictionaryCharCount++;
   1020                 //  And off the dictionary flag bit.
   1021                 category &= ~0x4000;
   1022             }
   1023         }
   1024 
   1025         #ifdef RBBI_DEBUG
   1026             if (fTrace) {
   1027                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1028                 if (0x20<=c && c<0x7f) {
   1029                     RBBIDebugPrintf("\"%c\"  ", c);
   1030                 } else {
   1031                     RBBIDebugPrintf("%5x  ", c);
   1032                 }
   1033                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1034             }
   1035         #endif
   1036 
   1037         // State Transition - move machine to its next state
   1038         //
   1039         state = row->fNextState[category];
   1040         row = (RBBIStateTableRow *)
   1041             // (statetable->fTableData + (statetable->fRowLen * state));
   1042             (tableData + tableRowLen * state);
   1043 
   1044 
   1045         if (row->fAccepting == -1) {
   1046             // Match found, common case.
   1047             if (mode != RBBI_START) {
   1048                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1049             }
   1050             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1051         }
   1052 
   1053         if (row->fLookAhead != 0) {
   1054             if (lookaheadStatus != 0
   1055                 && row->fAccepting == lookaheadStatus) {
   1056                 // Lookahead match is completed.
   1057                 result               = lookaheadResult;
   1058                 fLastRuleStatusIndex = lookaheadTagIdx;
   1059                 lookaheadStatus      = 0;
   1060                 // TODO:  make a standalone hard break in a rule work.
   1061                 if (lookAheadHardBreak) {
   1062                     UTEXT_SETNATIVEINDEX(fText, result);
   1063                     return result;
   1064                 }
   1065                 // Look-ahead completed, but other rules may match further.  Continue on
   1066                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1067                 goto continueOn;
   1068             }
   1069 
   1070             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1071             lookaheadResult = r;
   1072             lookaheadStatus = row->fLookAhead;
   1073             lookaheadTagIdx = row->fTagIdx;
   1074             goto continueOn;
   1075         }
   1076 
   1077 
   1078         if (row->fAccepting != 0) {
   1079             // Because this is an accepting state, any in-progress look-ahead match
   1080             //   is no longer relavant.  Clear out the pending lookahead status.
   1081             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1082         }
   1083 
   1084 continueOn:
   1085         if (state == STOP_STATE) {
   1086             // This is the normal exit from the lookup state machine.
   1087             // We have advanced through the string until it is certain that no
   1088             //   longer match is possible, no matter what characters follow.
   1089             break;
   1090         }
   1091 
   1092         // Advance to the next character.
   1093         // If this is a beginning-of-input loop iteration, don't advance
   1094         //    the input position.  The next iteration will be processing the
   1095         //    first real input character.
   1096         if (mode == RBBI_RUN) {
   1097             c = UTEXT_NEXT32(fText);
   1098         } else {
   1099             if (mode == RBBI_START) {
   1100                 mode = RBBI_RUN;
   1101             }
   1102         }
   1103 
   1104 
   1105     }
   1106 
   1107     // The state machine is done.  Check whether it found a match...
   1108 
   1109     // If the iterator failed to advance in the match engine, force it ahead by one.
   1110     //   (This really indicates a defect in the break rules.  They should always match
   1111     //    at least one character.)
   1112     if (result == initialPosition) {
   1113         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1114         UTEXT_NEXT32(fText);
   1115         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1116     }
   1117 
   1118     // Leave the iterator at our result position.
   1119     UTEXT_SETNATIVEINDEX(fText, result);
   1120     #ifdef RBBI_DEBUG
   1121         if (fTrace) {
   1122             RBBIDebugPrintf("result = %d\n\n", result);
   1123         }
   1124     #endif
   1125     return result;
   1126 }
   1127 
   1128 
   1129 
   1130 //-----------------------------------------------------------------------------------
   1131 //
   1132 //  handlePrevious()
   1133 //
   1134 //      Iterate backwards, according to the logic of the reverse rules.
   1135 //      This version handles the exact style backwards rules.
   1136 //
   1137 //      The logic of this function is very similar to handleNext(), above.
   1138 //
   1139 //-----------------------------------------------------------------------------------
   1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1141     int32_t             state;
   1142     int16_t             category        = 0;
   1143     RBBIRunMode         mode;
   1144     RBBIStateTableRow  *row;
   1145     UChar32             c;
   1146     int32_t             lookaheadStatus = 0;
   1147     int32_t             result          = 0;
   1148     int32_t             initialPosition = 0;
   1149     int32_t             lookaheadResult = 0;
   1150     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1151 
   1152     #ifdef RBBI_DEBUG
   1153         if (fTrace) {
   1154             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1155         }
   1156     #endif
   1157 
   1158     // handlePrevious() never gets the rule status.
   1159     // Flag the status as invalid; if the user ever asks for status, we will need
   1160     // to back up, then re-find the break position using handleNext(), which does
   1161     // get the status value.
   1162     fLastStatusIndexValid = FALSE;
   1163     fLastRuleStatusIndex = 0;
   1164 
   1165     // if we're already at the start of the text, return DONE.
   1166     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1167         return BreakIterator::DONE;
   1168     }
   1169 
   1170     //  Set up the starting char.
   1171     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1172     result          = initialPosition;
   1173     c               = UTEXT_PREVIOUS32(fText);
   1174 
   1175     //  Set the initial state for the state machine
   1176     state = START_STATE;
   1177     row = (RBBIStateTableRow *)
   1178             (statetable->fTableData + (statetable->fRowLen * state));
   1179     category = 3;
   1180     mode     = RBBI_RUN;
   1181     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1182         category = 2;
   1183         mode     = RBBI_START;
   1184     }
   1185 
   1186 
   1187     // loop until we reach the start of the text or transition to state 0
   1188     //
   1189     for (;;) {
   1190         if (c == U_SENTINEL) {
   1191             // Reached end of input string.
   1192             if (mode == RBBI_END) {
   1193                 // We have already run the loop one last time with the
   1194                 //   character set to the psueudo {eof} value.  Now it is time
   1195                 //   to unconditionally bail out.
   1196                 if (lookaheadResult < result) {
   1197                     // We ran off the end of the string with a pending look-ahead match.
   1198                     // Treat this as if the look-ahead condition had been met, and return
   1199                     //  the match at the / position from the look-ahead rule.
   1200                     result               = lookaheadResult;
   1201                     lookaheadStatus = 0;
   1202                 } else if (result == initialPosition) {
   1203                     // Ran off start, no match found.
   1204                     // move one index one (towards the start, since we are doing a previous())
   1205                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1206                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1207                 }
   1208                 break;
   1209             }
   1210             // Run the loop one last time with the fake end-of-input character category.
   1211             mode = RBBI_END;
   1212             category = 1;
   1213         }
   1214 
   1215         //
   1216         // Get the char category.  An incoming category of 1 or 2 means that
   1217         //      we are preset for doing the beginning or end of input, and
   1218         //      that we shouldn't get a category from an actual text input character.
   1219         //
   1220         if (mode == RBBI_RUN) {
   1221             // look up the current character's character category, which tells us
   1222             // which column in the state table to look at.
   1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1224             //        not the size of the character going in, which is a UChar32.
   1225             //
   1226             UTRIE_GET16(&fData->fTrie, c, category);
   1227 
   1228             // Check the dictionary bit in the character's category.
   1229             //    Counter is only used by dictionary based iterators (subclasses).
   1230             //    Chars that need to be handled by a dictionary have a flag bit set
   1231             //    in their category values.
   1232             //
   1233             if ((category & 0x4000) != 0)  {
   1234                 fDictionaryCharCount++;
   1235                 //  And off the dictionary flag bit.
   1236                 category &= ~0x4000;
   1237             }
   1238         }
   1239 
   1240         #ifdef RBBI_DEBUG
   1241             if (fTrace) {
   1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1243                 if (0x20<=c && c<0x7f) {
   1244                     RBBIDebugPrintf("\"%c\"  ", c);
   1245                 } else {
   1246                     RBBIDebugPrintf("%5x  ", c);
   1247                 }
   1248                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1249             }
   1250         #endif
   1251 
   1252         // State Transition - move machine to its next state
   1253         //
   1254         state = row->fNextState[category];
   1255         row = (RBBIStateTableRow *)
   1256             (statetable->fTableData + (statetable->fRowLen * state));
   1257 
   1258         if (row->fAccepting == -1) {
   1259             // Match found, common case.
   1260             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1261         }
   1262 
   1263         if (row->fLookAhead != 0) {
   1264             if (lookaheadStatus != 0
   1265                 && row->fAccepting == lookaheadStatus) {
   1266                 // Lookahead match is completed.
   1267                 result               = lookaheadResult;
   1268                 lookaheadStatus      = 0;
   1269                 // TODO:  make a standalone hard break in a rule work.
   1270                 if (lookAheadHardBreak) {
   1271                     UTEXT_SETNATIVEINDEX(fText, result);
   1272                     return result;
   1273                 }
   1274                 // Look-ahead completed, but other rules may match further.  Continue on
   1275                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1276                 goto continueOn;
   1277             }
   1278 
   1279             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1280             lookaheadResult = r;
   1281             lookaheadStatus = row->fLookAhead;
   1282             goto continueOn;
   1283         }
   1284 
   1285 
   1286         if (row->fAccepting != 0) {
   1287             // Because this is an accepting state, any in-progress look-ahead match
   1288             //   is no longer relavant.  Clear out the pending lookahead status.
   1289             lookaheadStatus = 0;
   1290         }
   1291 
   1292 continueOn:
   1293         if (state == STOP_STATE) {
   1294             // This is the normal exit from the lookup state machine.
   1295             // We have advanced through the string until it is certain that no
   1296             //   longer match is possible, no matter what characters follow.
   1297             break;
   1298         }
   1299 
   1300         // Move (backwards) to the next character to process.
   1301         // If this is a beginning-of-input loop iteration, don't advance
   1302         //    the input position.  The next iteration will be processing the
   1303         //    first real input character.
   1304         if (mode == RBBI_RUN) {
   1305             c = UTEXT_PREVIOUS32(fText);
   1306         } else {
   1307             if (mode == RBBI_START) {
   1308                 mode = RBBI_RUN;
   1309             }
   1310         }
   1311     }
   1312 
   1313     // The state machine is done.  Check whether it found a match...
   1314 
   1315     // If the iterator failed to advance in the match engine, force it ahead by one.
   1316     //   (This really indicates a defect in the break rules.  They should always match
   1317     //    at least one character.)
   1318     if (result == initialPosition) {
   1319         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1320         UTEXT_PREVIOUS32(fText);
   1321         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1322     }
   1323 
   1324     // Leave the iterator at our result position.
   1325     UTEXT_SETNATIVEINDEX(fText, result);
   1326     #ifdef RBBI_DEBUG
   1327         if (fTrace) {
   1328             RBBIDebugPrintf("result = %d\n\n", result);
   1329         }
   1330     #endif
   1331     return result;
   1332 }
   1333 
   1334 
   1335 void
   1336 RuleBasedBreakIterator::reset()
   1337 {
   1338     if (fCachedBreakPositions) {
   1339         uprv_free(fCachedBreakPositions);
   1340     }
   1341     fCachedBreakPositions = NULL;
   1342     fNumCachedBreakPositions = 0;
   1343     fDictionaryCharCount = 0;
   1344     fPositionInCache = 0;
   1345 }
   1346 
   1347 
   1348 
   1349 //-------------------------------------------------------------------------------
   1350 //
   1351 //   getRuleStatus()   Return the break rule tag associated with the current
   1352 //                     iterator position.  If the iterator arrived at its current
   1353 //                     position by iterating forwards, the value will have been
   1354 //                     cached by the handleNext() function.
   1355 //
   1356 //                     If no cached status value is available, the status is
   1357 //                     found by doing a previous() followed by a next(), which
   1358 //                     leaves the iterator where it started, and computes the
   1359 //                     status while doing the next().
   1360 //
   1361 //-------------------------------------------------------------------------------
   1362 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1363     if (fLastStatusIndexValid == FALSE) {
   1364         //  No cached status is available.
   1365         if (fText == NULL || current() == 0) {
   1366             //  At start of text, or there is no text.  Status is always zero.
   1367             fLastRuleStatusIndex = 0;
   1368             fLastStatusIndexValid = TRUE;
   1369         } else {
   1370             //  Not at start of text.  Find status the tedious way.
   1371             int32_t pa = current();
   1372             previous();
   1373             if (fNumCachedBreakPositions > 0) {
   1374                 reset();                // Blow off the dictionary cache
   1375             }
   1376             int32_t pb = next();
   1377             if (pa != pb) {
   1378                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1379                 //       unused local variables on gcc.  Logically, it isn't needed.
   1380                 U_ASSERT(pa == pb);
   1381             }
   1382         }
   1383     }
   1384     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1385 }
   1386 
   1387 
   1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1389     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1390     nonConstThis->makeRuleStatusValid();
   1391 
   1392     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1393     //                                                 (the number of status values.)
   1394     //   This function returns the last (largest) of the array of status values.
   1395     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1396     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1397 
   1398     return tagVal;
   1399 }
   1400 
   1401 
   1402 
   1403 
   1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1405              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1406 {
   1407     if (U_FAILURE(status)) {
   1408         return 0;
   1409     }
   1410 
   1411     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1412     nonConstThis->makeRuleStatusValid();
   1413     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1414     int32_t  numValsToCopy = numVals;
   1415     if (numVals > capacity) {
   1416         status = U_BUFFER_OVERFLOW_ERROR;
   1417         numValsToCopy = capacity;
   1418     }
   1419     int i;
   1420     for (i=0; i<numValsToCopy; i++) {
   1421         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1422     }
   1423     return numVals;
   1424 }
   1425 
   1426 
   1427 
   1428 //-------------------------------------------------------------------------------
   1429 //
   1430 //   getBinaryRules        Access to the compiled form of the rules,
   1431 //                         for use by build system tools that save the data
   1432 //                         for standard iterator types.
   1433 //
   1434 //-------------------------------------------------------------------------------
   1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1436     const uint8_t  *retPtr = NULL;
   1437     length = 0;
   1438 
   1439     if (fData != NULL) {
   1440         retPtr = (const uint8_t *)fData->fHeader;
   1441         length = fData->fHeader->fLength;
   1442     }
   1443     return retPtr;
   1444 }
   1445 
   1446 
   1447 
   1448 
   1449 //-------------------------------------------------------------------------------
   1450 //
   1451 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1452 //                    Saving one heap allocation isn't worth the trouble.
   1453 //                    Cloning shouldn't be done in tight loops, and
   1454 //                    making the clone copy involves other heap operations anyway.
   1455 //                    And the application code for correctly dealing with buffer
   1456 //                    size problems and the eventual object destruction is ugly.
   1457 //
   1458 //-------------------------------------------------------------------------------
   1459 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1460                                    int32_t &bufferSize,
   1461                                    UErrorCode &status)
   1462 {
   1463     if (U_FAILURE(status)){
   1464         return NULL;
   1465     }
   1466 
   1467     //
   1468     //  If user buffer size is zero this is a preflight operation to
   1469     //    obtain the needed buffer size, allowing for worst case misalignment.
   1470     //
   1471     if (bufferSize == 0) {
   1472         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1473         return NULL;
   1474     }
   1475 
   1476 
   1477     //
   1478     //  Check the alignment and size of the user supplied buffer.
   1479     //  Allocate heap memory if the user supplied memory is insufficient.
   1480     //
   1481     char    *buf   = (char *)stackBuffer;
   1482     uint32_t s      = bufferSize;
   1483 
   1484     if (stackBuffer == NULL) {
   1485         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1486     }
   1487     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1488         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1489         s   -= offsetUp;
   1490         buf += offsetUp;
   1491     }
   1492     if (s < sizeof(RuleBasedBreakIterator)) {
   1493         // Not enough room in the caller-supplied buffer.
   1494         // Do a plain-vanilla heap based clone and return that, along with
   1495         //   a warning that the clone was allocated.
   1496         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1497         if (clonedBI == 0) {
   1498             status = U_MEMORY_ALLOCATION_ERROR;
   1499         } else {
   1500             status = U_SAFECLONE_ALLOCATED_WARNING;
   1501         }
   1502         return clonedBI;
   1503     }
   1504 
   1505     //
   1506     //  Clone the source BI into the caller-supplied buffer.
   1507     //
   1508     RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
   1509     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1510 
   1511     return clone;
   1512 }
   1513 
   1514 
   1515 //-------------------------------------------------------------------------------
   1516 //
   1517 //  isDictionaryChar      Return true if the category lookup for this char
   1518 //                        indicates that it is in the set of dictionary lookup
   1519 //                        chars.
   1520 //
   1521 //                        This function is intended for use by dictionary based
   1522 //                        break iterators.
   1523 //
   1524 //-------------------------------------------------------------------------------
   1525 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1526     if (fData == NULL) {
   1527         return FALSE;
   1528     }
   1529     uint16_t category;
   1530     UTRIE_GET16(&fData->fTrie, c, category);
   1531     return (category & 0x4000) != 0;
   1532 }*/
   1533 
   1534 
   1535 //-------------------------------------------------------------------------------
   1536 //
   1537 //  checkDictionary       This function handles all processing of characters in
   1538 //                        the "dictionary" set. It will determine the appropriate
   1539 //                        course of action, and possibly set up a cache in the
   1540 //                        process.
   1541 //
   1542 //-------------------------------------------------------------------------------
   1543 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1544                             int32_t endPos,
   1545                             UBool reverse) {
   1546     // Reset the old break cache first.
   1547     reset();
   1548 
   1549     // note: code segment below assumes that dictionary chars are in the
   1550     // startPos-endPos range
   1551     // value returned should be next character in sequence
   1552     if ((endPos - startPos) <= 1) {
   1553         return (reverse ? startPos : endPos);
   1554     }
   1555 
   1556     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
   1557     //      because native indexes are different from UTF-16 indexes.
   1558     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
   1559     //      It wont give the right breaks, but it's better than a crash.
   1560     //
   1561     //      Check the type of the UText by checking its pFuncs field, which
   1562     //      is UText's function dispatch table.  It will be the same for all
   1563     //      UTF-8 UTexts and different for any other UText type.
   1564     //
   1565     //      We have no other type of UText available with non-UTF-16 native indexing.
   1566     //      This whole check will go away once the dictionary code is fixed.
   1567     static const void *utext_utf8Funcs;
   1568     if (utext_utf8Funcs == NULL) {
   1569         // Cache the UTF-8 UText function pointer value.
   1570         UErrorCode status = U_ZERO_ERROR;
   1571         UText tempUText = UTEXT_INITIALIZER;
   1572         utext_openUTF8(&tempUText, NULL, 0, &status);
   1573         utext_utf8Funcs = tempUText.pFuncs;
   1574         utext_close(&tempUText);
   1575     }
   1576     if (fText->pFuncs == utext_utf8Funcs) {
   1577         return (reverse ? startPos : endPos);
   1578     }
   1579 
   1580     // Starting from the starting point, scan towards the proposed result,
   1581     // looking for the first dictionary character (which may be the one
   1582     // we're on, if we're starting in the middle of a range).
   1583     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1584     if (reverse) {
   1585         UTEXT_PREVIOUS32(fText);
   1586     }
   1587 
   1588     int32_t rangeStart = startPos;
   1589     int32_t rangeEnd = endPos;
   1590 
   1591     uint16_t    category;
   1592     int32_t     current;
   1593     UErrorCode  status = U_ZERO_ERROR;
   1594     UStack      breaks(status);
   1595     int32_t     foundBreakCount = 0;
   1596     UChar32     c = utext_current32(fText);
   1597 
   1598     UTRIE_GET16(&fData->fTrie, c, category);
   1599 
   1600     // Is the character we're starting on a dictionary character? If so, we
   1601     // need to back up to include the entire run; otherwise the results of
   1602     // the break algorithm will differ depending on where we start. Since
   1603     // the result is cached and there is typically a non-dictionary break
   1604     // within a small number of words, there should be little performance impact.
   1605     if (category & 0x4000) {
   1606         if (reverse) {
   1607             do {
   1608                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1609                 c = utext_current32(fText);
   1610                 UTRIE_GET16(&fData->fTrie, c, category);
   1611             } while (c != U_SENTINEL && (category & 0x4000));
   1612             // Back up to the last dictionary character
   1613             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1614             if (c == U_SENTINEL) {
   1615                 // c = fText->last32();
   1616                 //   TODO:  why was this if needed?
   1617                 c = UTEXT_PREVIOUS32(fText);
   1618             }
   1619             else {
   1620                 c = UTEXT_PREVIOUS32(fText);
   1621             }
   1622         }
   1623         else {
   1624             do {
   1625                 c = UTEXT_PREVIOUS32(fText);
   1626                 UTRIE_GET16(&fData->fTrie, c, category);
   1627             }
   1628             while (c != U_SENTINEL && (category & 0x4000));
   1629             // Back up to the last dictionary character
   1630             if (c == U_SENTINEL) {
   1631                 // c = fText->first32();
   1632                 c = utext_current32(fText);
   1633             }
   1634             else {
   1635                 utext_next32(fText);
   1636                 c = utext_current32(fText);
   1637             }
   1638             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1639         }
   1640         UTRIE_GET16(&fData->fTrie, c, category);
   1641     }
   1642 
   1643     // Loop through the text, looking for ranges of dictionary characters.
   1644     // For each span, find the appropriate break engine, and ask it to find
   1645     // any breaks within the span.
   1646     // Note: we always do this in the forward direction, so that the break
   1647     // cache is built in the right order.
   1648     if (reverse) {
   1649         utext_setNativeIndex(fText, rangeStart);
   1650         c = utext_current32(fText);
   1651         UTRIE_GET16(&fData->fTrie, c, category);
   1652     }
   1653     while(U_SUCCESS(status)) {
   1654         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1655             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1656             c = utext_current32(fText);
   1657             UTRIE_GET16(&fData->fTrie, c, category);
   1658         }
   1659         if (current >= rangeEnd) {
   1660             break;
   1661         }
   1662 
   1663         // We now have a dictionary character. Get the appropriate language object
   1664         // to deal with it.
   1665         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1666 
   1667         // Ask the language object if there are any breaks. It will leave the text
   1668         // pointer on the other side of its range, ready to search for the next one.
   1669         if (lbe != NULL) {
   1670             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1671         }
   1672 
   1673         // Reload the loop variables for the next go-round
   1674         c = utext_current32(fText);
   1675         UTRIE_GET16(&fData->fTrie, c, category);
   1676     }
   1677 
   1678     // If we found breaks, build a new break cache. The first and last entries must
   1679     // be the original starting and ending position.
   1680     if (foundBreakCount > 0) {
   1681         int32_t totalBreaks = foundBreakCount;
   1682         if (startPos < breaks.elementAti(0)) {
   1683             totalBreaks += 1;
   1684         }
   1685         if (endPos > breaks.peeki()) {
   1686             totalBreaks += 1;
   1687         }
   1688         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1689         if (fCachedBreakPositions != NULL) {
   1690             int32_t out = 0;
   1691             fNumCachedBreakPositions = totalBreaks;
   1692             if (startPos < breaks.elementAti(0)) {
   1693                 fCachedBreakPositions[out++] = startPos;
   1694             }
   1695             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1696                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1697             }
   1698             if (endPos > fCachedBreakPositions[out-1]) {
   1699                 fCachedBreakPositions[out] = endPos;
   1700             }
   1701             // If there are breaks, then by definition, we are replacing the original
   1702             // proposed break by one of the breaks we found. Use following() and
   1703             // preceding() to do the work. They should never recurse in this case.
   1704             if (reverse) {
   1705                 return preceding(endPos);
   1706             }
   1707             else {
   1708                 return following(startPos);
   1709             }
   1710         }
   1711         // If the allocation failed, just fall through to the "no breaks found" case.
   1712     }
   1713 
   1714     // If we get here, there were no language-based breaks. Set the text pointer
   1715     // to the original proposed break.
   1716     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1717     return (reverse ? startPos : endPos);
   1718 }
   1719 
   1720 U_NAMESPACE_END
   1721 
   1722 // defined in ucln_cmn.h
   1723 
   1724 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1725 
   1726 /**
   1727  * Release all static memory held by breakiterator.
   1728  */
   1729 U_CDECL_BEGIN
   1730 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1731     if (gLanguageBreakFactories) {
   1732         delete gLanguageBreakFactories;
   1733         gLanguageBreakFactories = NULL;
   1734     }
   1735     return TRUE;
   1736 }
   1737 U_CDECL_END
   1738 
   1739 U_CDECL_BEGIN
   1740 static void U_CALLCONV _deleteFactory(void *obj) {
   1741     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1742 }
   1743 U_CDECL_END
   1744 U_NAMESPACE_BEGIN
   1745 
   1746 static const LanguageBreakEngine*
   1747 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1748 {
   1749     UBool       needsInit;
   1750     UErrorCode  status = U_ZERO_ERROR;
   1751     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1752 
   1753     if (needsInit) {
   1754         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1755         if (factories != NULL && U_SUCCESS(status)) {
   1756             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1757             factories->push(builtIn, status);
   1758 #ifdef U_LOCAL_SERVICE_HOOK
   1759             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1760             if (extra != NULL) {
   1761                 factories->push(extra, status);
   1762             }
   1763 #endif
   1764         }
   1765         umtx_lock(NULL);
   1766         if (gLanguageBreakFactories == NULL) {
   1767             gLanguageBreakFactories = factories;
   1768             factories = NULL;
   1769             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1770         }
   1771         umtx_unlock(NULL);
   1772         delete factories;
   1773     }
   1774 
   1775     if (gLanguageBreakFactories == NULL) {
   1776         return NULL;
   1777     }
   1778 
   1779     int32_t i = gLanguageBreakFactories->size();
   1780     const LanguageBreakEngine *lbe = NULL;
   1781     while (--i >= 0) {
   1782         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1783         lbe = factory->getEngineFor(c, breakType);
   1784         if (lbe != NULL) {
   1785             break;
   1786         }
   1787     }
   1788     return lbe;
   1789 }
   1790 
   1791 
   1792 //-------------------------------------------------------------------------------
   1793 //
   1794 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1795 //                          the characer c.
   1796 //
   1797 //-------------------------------------------------------------------------------
   1798 const LanguageBreakEngine *
   1799 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1800     const LanguageBreakEngine *lbe = NULL;
   1801     UErrorCode status = U_ZERO_ERROR;
   1802 
   1803     if (fLanguageBreakEngines == NULL) {
   1804         fLanguageBreakEngines = new UStack(status);
   1805         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1806             delete fLanguageBreakEngines;
   1807             fLanguageBreakEngines = 0;
   1808             return NULL;
   1809         }
   1810     }
   1811 
   1812     int32_t i = fLanguageBreakEngines->size();
   1813     while (--i >= 0) {
   1814         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1815         if (lbe->handles(c, fBreakType)) {
   1816             return lbe;
   1817         }
   1818     }
   1819 
   1820     // No existing dictionary took the character. See if a factory wants to
   1821     // give us a new LanguageBreakEngine for this character.
   1822     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1823 
   1824     // If we got one, use it and push it on our stack.
   1825     if (lbe != NULL) {
   1826         fLanguageBreakEngines->push((void *)lbe, status);
   1827         // Even if we can't remember it, we can keep looking it up, so
   1828         // return it even if the push fails.
   1829         return lbe;
   1830     }
   1831 
   1832     // No engine is forthcoming for this character. Add it to the
   1833     // reject set. Create the reject break engine if needed.
   1834     if (fUnhandledBreakEngine == NULL) {
   1835         fUnhandledBreakEngine = new UnhandledEngine(status);
   1836         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1837             status = U_MEMORY_ALLOCATION_ERROR;
   1838         }
   1839         // Put it last so that scripts for which we have an engine get tried
   1840         // first.
   1841         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1842         // If we can't insert it, or creation failed, get rid of it
   1843         if (U_FAILURE(status)) {
   1844             delete fUnhandledBreakEngine;
   1845             fUnhandledBreakEngine = 0;
   1846             return NULL;
   1847         }
   1848     }
   1849 
   1850     // Tell the reject engine about the character; at its discretion, it may
   1851     // add more than just the one character.
   1852     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1853 
   1854     return fUnhandledBreakEngine;
   1855 }
   1856 
   1857 
   1858 
   1859 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1860     return fBreakType;
   1861 }*/
   1862 
   1863 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1864     fBreakType = type;
   1865     reset();
   1866 }
   1867 
   1868 U_NAMESPACE_END
   1869 
   1870 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1871