Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2011 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include <typeinfo>  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 
     90 //
     91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
     92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
     93 //
     94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
     95                        uint32_t       ruleLength,
     96                        UErrorCode     &status) {
     97     init();
     98     if (U_FAILURE(status)) {
     99         return;
    100     }
    101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
    102         status = U_ILLEGAL_ARGUMENT_ERROR;
    103         return;
    104     }
    105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
    106     if (data->fLength > ruleLength) {
    107         status = U_ILLEGAL_ARGUMENT_ERROR;
    108         return;
    109     }
    110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
    111     if (U_FAILURE(status)) {return;}
    112     if(fData == 0) {
    113         status = U_MEMORY_ALLOCATION_ERROR;
    114         return;
    115     }
    116 }
    117 
    118 
    119 //-------------------------------------------------------------------------------
    120 //
    121 //   Constructor   from a UDataMemory handle to precompiled break rules
    122 //                 stored in an ICU data file.
    123 //
    124 //-------------------------------------------------------------------------------
    125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
    126 {
    127     init();
    128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
    129     if (U_FAILURE(status)) {return;}
    130     if(fData == 0) {
    131         status = U_MEMORY_ALLOCATION_ERROR;
    132         return;
    133     }
    134 }
    135 
    136 
    137 
    138 //-------------------------------------------------------------------------------
    139 //
    140 //   Constructor       from a set of rules supplied as a string.
    141 //
    142 //-------------------------------------------------------------------------------
    143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    144                                                 UParseError          &parseError,
    145                                                 UErrorCode           &status)
    146 {
    147     init();
    148     if (U_FAILURE(status)) {return;}
    149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    152     //        creates and returns a complete RBBI.  From here, in a constructor, we
    153     //        can't just return the object created by the builder factory, hence
    154     //        the assignment of the factory created object to "this".
    155     if (U_SUCCESS(status)) {
    156         *this = *bi;
    157         delete bi;
    158     }
    159 }
    160 
    161 
    162 //-------------------------------------------------------------------------------
    163 //
    164 // Default Constructor.      Create an empty shell that can be set up later.
    165 //                           Used when creating a RuleBasedBreakIterator from a set
    166 //                           of rules.
    167 //-------------------------------------------------------------------------------
    168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    169     init();
    170 }
    171 
    172 
    173 //-------------------------------------------------------------------------------
    174 //
    175 //   Copy constructor.  Will produce a break iterator with the same behavior,
    176 //                      and which iterates over the same text, as the one passed in.
    177 //
    178 //-------------------------------------------------------------------------------
    179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    180 : BreakIterator(other)
    181 {
    182     this->init();
    183     *this = other;
    184 }
    185 
    186 
    187 /**
    188  * Destructor
    189  */
    190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    192         // fCharIter was adopted from the outside.
    193         delete fCharIter;
    194     }
    195     fCharIter = NULL;
    196     delete fSCharIter;
    197     fCharIter = NULL;
    198     delete fDCharIter;
    199     fDCharIter = NULL;
    200 
    201     utext_close(fText);
    202 
    203     if (fData != NULL) {
    204         fData->removeReference();
    205         fData = NULL;
    206     }
    207     if (fCachedBreakPositions) {
    208         uprv_free(fCachedBreakPositions);
    209         fCachedBreakPositions = NULL;
    210     }
    211     if (fLanguageBreakEngines) {
    212         delete fLanguageBreakEngines;
    213         fLanguageBreakEngines = NULL;
    214     }
    215     if (fUnhandledBreakEngine) {
    216         delete fUnhandledBreakEngine;
    217         fUnhandledBreakEngine = NULL;
    218     }
    219 }
    220 
    221 /**
    222  * Assignment operator.  Sets this iterator to have the same behavior,
    223  * and iterate over the same text, as the one passed in.
    224  */
    225 RuleBasedBreakIterator&
    226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    227     if (this == &that) {
    228         return *this;
    229     }
    230     reset();    // Delete break cache information
    231     fBreakType = that.fBreakType;
    232     if (fLanguageBreakEngines != NULL) {
    233         delete fLanguageBreakEngines;
    234         fLanguageBreakEngines = NULL;   // Just rebuild for now
    235     }
    236     // TODO: clone fLanguageBreakEngines from "that"
    237     UErrorCode status = U_ZERO_ERROR;
    238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    239 
    240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    241         delete fCharIter;
    242     }
    243     fCharIter = NULL;
    244 
    245     if (that.fCharIter != NULL ) {
    246         // This is a little bit tricky - it will intially appear that
    247         //  this->fCharIter is adopted, even if that->fCharIter was
    248         //  not adopted.  That's ok.
    249         fCharIter = that.fCharIter->clone();
    250     }
    251 
    252     if (fData != NULL) {
    253         fData->removeReference();
    254         fData = NULL;
    255     }
    256     if (that.fData != NULL) {
    257         fData = that.fData->addReference();
    258     }
    259 
    260     return *this;
    261 }
    262 
    263 
    264 
    265 //-----------------------------------------------------------------------------
    266 //
    267 //    init()      Shared initialization routine.   Used by all the constructors.
    268 //                Initializes all fields, leaving the object in a consistent state.
    269 //
    270 //-----------------------------------------------------------------------------
    271 void RuleBasedBreakIterator::init() {
    272     UErrorCode  status    = U_ZERO_ERROR;
    273     fBufferClone          = FALSE;
    274     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    275     fCharIter             = NULL;
    276     fSCharIter            = NULL;
    277     fDCharIter            = NULL;
    278     fData                 = NULL;
    279     fLastRuleStatusIndex  = 0;
    280     fLastStatusIndexValid = TRUE;
    281     fDictionaryCharCount  = 0;
    282     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    283                                         //   dictionary behavior for Break Iterators that are
    284                                         //   built from rules.  Even better would be the ability to
    285                                         //   declare the type in the rules.
    286 
    287     fCachedBreakPositions    = NULL;
    288     fLanguageBreakEngines    = NULL;
    289     fUnhandledBreakEngine    = NULL;
    290     fNumCachedBreakPositions = 0;
    291     fPositionInCache         = 0;
    292 
    293 #ifdef RBBI_DEBUG
    294     static UBool debugInitDone = FALSE;
    295     if (debugInitDone == FALSE) {
    296         char *debugEnv = getenv("U_RBBIDEBUG");
    297         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    298             fTrace = TRUE;
    299         }
    300         debugInitDone = TRUE;
    301     }
    302 #endif
    303 }
    304 
    305 
    306 
    307 //-----------------------------------------------------------------------------
    308 //
    309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    310 //            behavior, and iterating over the same text, as this one.
    311 //            Virtual function: does the right thing with subclasses.
    312 //
    313 //-----------------------------------------------------------------------------
    314 BreakIterator*
    315 RuleBasedBreakIterator::clone(void) const {
    316     return new RuleBasedBreakIterator(*this);
    317 }
    318 
    319 /**
    320  * Equality operator.  Returns TRUE if both BreakIterators are of the
    321  * same class, have the same behavior, and iterate over the same text.
    322  */
    323 UBool
    324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    325     if (typeid(*this) != typeid(that)) {
    326         return FALSE;
    327     }
    328 
    329     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    330 
    331     if (!utext_equals(fText, that2.fText)) {
    332         // The two break iterators are operating on different text,
    333         //   or have a different interation position.
    334         return FALSE;
    335     };
    336 
    337     // TODO:  need a check for when in a dictionary region at different offsets.
    338 
    339     if (that2.fData == fData ||
    340         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    341             // The two break iterators are using the same rules.
    342             return TRUE;
    343         }
    344     return FALSE;
    345 }
    346 
    347 /**
    348  * Compute a hash code for this BreakIterator
    349  * @return A hash code
    350  */
    351 int32_t
    352 RuleBasedBreakIterator::hashCode(void) const {
    353     int32_t   hash = 0;
    354     if (fData != NULL) {
    355         hash = fData->hashCode();
    356     }
    357     return hash;
    358 }
    359 
    360 
    361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    362     if (U_FAILURE(status)) {
    363         return;
    364     }
    365     reset();
    366     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    367 
    368     // Set up a dummy CharacterIterator to be returned if anyone
    369     //   calls getText().  With input from UText, there is no reasonable
    370     //   way to return a characterIterator over the actual input text.
    371     //   Return one over an empty string instead - this is the closest
    372     //   we can come to signaling a failure.
    373     //   (GetText() is obsolete, this failure is sort of OK)
    374     if (fDCharIter == NULL) {
    375         static const UChar c = 0;
    376         fDCharIter = new UCharCharacterIterator(&c, 0);
    377         if (fDCharIter == NULL) {
    378             status = U_MEMORY_ALLOCATION_ERROR;
    379             return;
    380         }
    381     }
    382 
    383     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    384         // existing fCharIter was adopted from the outside.  Delete it now.
    385         delete fCharIter;
    386     }
    387     fCharIter = fDCharIter;
    388 
    389     this->first();
    390 }
    391 
    392 
    393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    394     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    395     return result;
    396 }
    397 
    398 
    399 
    400 /**
    401  * Returns the description used to create this iterator
    402  */
    403 const UnicodeString&
    404 RuleBasedBreakIterator::getRules() const {
    405     if (fData != NULL) {
    406         return fData->getRuleSourceString();
    407     } else {
    408         static const UnicodeString *s;
    409         if (s == NULL) {
    410             // TODO:  something more elegant here.
    411             //        perhaps API should return the string by value.
    412             //        Note:  thread unsafe init & leak are semi-ok, better than
    413             //               what was before.  Sould be cleaned up, though.
    414             s = new UnicodeString;
    415         }
    416         return *s;
    417     }
    418 }
    419 
    420 //=======================================================================
    421 // BreakIterator overrides
    422 //=======================================================================
    423 
    424 /**
    425  * Return a CharacterIterator over the text being analyzed.
    426  */
    427 CharacterIterator&
    428 RuleBasedBreakIterator::getText() const {
    429     return *fCharIter;
    430 }
    431 
    432 /**
    433  * Set the iterator to analyze a new piece of text.  This function resets
    434  * the current iteration position to the beginning of the text.
    435  * @param newText An iterator over the text to analyze.
    436  */
    437 void
    438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    439     // If we are holding a CharacterIterator adopted from a
    440     //   previous call to this function, delete it now.
    441     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    442         delete fCharIter;
    443     }
    444 
    445     fCharIter = newText;
    446     UErrorCode status = U_ZERO_ERROR;
    447     reset();
    448     if (newText==NULL || newText->startIndex() != 0) {
    449         // startIndex !=0 wants to be an error, but there's no way to report it.
    450         // Make the iterator text be an empty string.
    451         fText = utext_openUChars(fText, NULL, 0, &status);
    452     } else {
    453         fText = utext_openCharacterIterator(fText, newText, &status);
    454     }
    455     this->first();
    456 }
    457 
    458 /**
    459  * Set the iterator to analyze a new piece of text.  This function resets
    460  * the current iteration position to the beginning of the text.
    461  * @param newText An iterator over the text to analyze.
    462  */
    463 void
    464 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    465     UErrorCode status = U_ZERO_ERROR;
    466     reset();
    467     fText = utext_openConstUnicodeString(fText, &newText, &status);
    468 
    469     // Set up a character iterator on the string.
    470     //   Needed in case someone calls getText().
    471     //  Can not, unfortunately, do this lazily on the (probably never)
    472     //  call to getText(), because getText is const.
    473     if (fSCharIter == NULL) {
    474         fSCharIter = new StringCharacterIterator(newText);
    475     } else {
    476         fSCharIter->setText(newText);
    477     }
    478 
    479     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    480         // old fCharIter was adopted from the outside.  Delete it.
    481         delete fCharIter;
    482     }
    483     fCharIter = fSCharIter;
    484 
    485     this->first();
    486 }
    487 
    488 
    489 
    490 /**
    491  * Sets the current iteration position to the beginning of the text.
    492  * @return The offset of the beginning of the text.
    493  */
    494 int32_t RuleBasedBreakIterator::first(void) {
    495     reset();
    496     fLastRuleStatusIndex  = 0;
    497     fLastStatusIndexValid = TRUE;
    498     //if (fText == NULL)
    499     //    return BreakIterator::DONE;
    500 
    501     utext_setNativeIndex(fText, 0);
    502     return 0;
    503 }
    504 
    505 /**
    506  * Sets the current iteration position to the end of the text.
    507  * @return The text's past-the-end offset.
    508  */
    509 int32_t RuleBasedBreakIterator::last(void) {
    510     reset();
    511     if (fText == NULL) {
    512         fLastRuleStatusIndex  = 0;
    513         fLastStatusIndexValid = TRUE;
    514         return BreakIterator::DONE;
    515     }
    516 
    517     fLastStatusIndexValid = FALSE;
    518     int32_t pos = (int32_t)utext_nativeLength(fText);
    519     utext_setNativeIndex(fText, pos);
    520     return pos;
    521 }
    522 
    523 /**
    524  * Advances the iterator either forward or backward the specified number of steps.
    525  * Negative values move backward, and positive values move forward.  This is
    526  * equivalent to repeatedly calling next() or previous().
    527  * @param n The number of steps to move.  The sign indicates the direction
    528  * (negative is backwards, and positive is forwards).
    529  * @return The character offset of the boundary position n boundaries away from
    530  * the current one.
    531  */
    532 int32_t RuleBasedBreakIterator::next(int32_t n) {
    533     int32_t result = current();
    534     while (n > 0) {
    535         result = next();
    536         --n;
    537     }
    538     while (n < 0) {
    539         result = previous();
    540         ++n;
    541     }
    542     return result;
    543 }
    544 
    545 /**
    546  * Advances the iterator to the next boundary position.
    547  * @return The position of the first boundary after this one.
    548  */
    549 int32_t RuleBasedBreakIterator::next(void) {
    550     // if we have cached break positions and we're still in the range
    551     // covered by them, just move one step forward in the cache
    552     if (fCachedBreakPositions != NULL) {
    553         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    554             ++fPositionInCache;
    555             int32_t pos = fCachedBreakPositions[fPositionInCache];
    556             utext_setNativeIndex(fText, pos);
    557             return pos;
    558         }
    559         else {
    560             reset();
    561         }
    562     }
    563 
    564     int32_t startPos = current();
    565     int32_t result = handleNext(fData->fForwardTable);
    566     if (fDictionaryCharCount > 0) {
    567         result = checkDictionary(startPos, result, FALSE);
    568     }
    569     return result;
    570 }
    571 
    572 /**
    573  * Advances the iterator backwards, to the last boundary preceding this one.
    574  * @return The position of the last boundary position preceding this one.
    575  */
    576 int32_t RuleBasedBreakIterator::previous(void) {
    577     int32_t result;
    578     int32_t startPos;
    579 
    580     // if we have cached break positions and we're still in the range
    581     // covered by them, just move one step backward in the cache
    582     if (fCachedBreakPositions != NULL) {
    583         if (fPositionInCache > 0) {
    584             --fPositionInCache;
    585             // If we're at the beginning of the cache, need to reevaluate the
    586             // rule status
    587             if (fPositionInCache <= 0) {
    588                 fLastStatusIndexValid = FALSE;
    589             }
    590             int32_t pos = fCachedBreakPositions[fPositionInCache];
    591             utext_setNativeIndex(fText, pos);
    592             return pos;
    593         }
    594         else {
    595             reset();
    596         }
    597     }
    598 
    599     // if we're already sitting at the beginning of the text, return DONE
    600     if (fText == NULL || (startPos = current()) == 0) {
    601         fLastRuleStatusIndex  = 0;
    602         fLastStatusIndexValid = TRUE;
    603         return BreakIterator::DONE;
    604     }
    605 
    606     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    607         result = handlePrevious(fData->fReverseTable);
    608         if (fDictionaryCharCount > 0) {
    609             result = checkDictionary(result, startPos, TRUE);
    610         }
    611         return result;
    612     }
    613 
    614     // old rule syntax
    615     // set things up.  handlePrevious() will back us up to some valid
    616     // break position before the current position (we back our internal
    617     // iterator up one step to prevent handlePrevious() from returning
    618     // the current position), but not necessarily the last one before
    619 
    620     // where we started
    621 
    622     int32_t start = current();
    623 
    624     (void)UTEXT_PREVIOUS32(fText);
    625     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    626     if (lastResult == UBRK_DONE) {
    627         lastResult = 0;
    628         utext_setNativeIndex(fText, 0);
    629     }
    630     result = lastResult;
    631     int32_t lastTag       = 0;
    632     UBool   breakTagValid = FALSE;
    633 
    634     // iterate forward from the known break position until we pass our
    635     // starting point.  The last break position before the starting
    636     // point is our return value
    637 
    638     for (;;) {
    639         result         = next();
    640         if (result == BreakIterator::DONE || result >= start) {
    641             break;
    642         }
    643         lastResult     = result;
    644         lastTag        = fLastRuleStatusIndex;
    645         breakTagValid  = TRUE;
    646     }
    647 
    648     // fLastBreakTag wants to have the value for section of text preceding
    649     // the result position that we are to return (in lastResult.)  If
    650     // the backwards rules overshot and the above loop had to do two or more
    651     // next()s to move up to the desired return position, we will have a valid
    652     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    653     // we wont have a tag value for that position, which is only set by handleNext().
    654 
    655     // set the current iteration position to be the last break position
    656     // before where we started, and then return that value
    657     utext_setNativeIndex(fText, lastResult);
    658     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    659     fLastStatusIndexValid = breakTagValid;
    660 
    661     // No need to check the dictionary; it will have been handled by
    662     // next()
    663 
    664     return lastResult;
    665 }
    666 
    667 /**
    668  * Sets the iterator to refer to the first boundary position following
    669  * the specified position.
    670  * @offset The position from which to begin searching for a break position.
    671  * @return The position of the first break after the current position.
    672  */
    673 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    674     // if we have cached break positions and offset is in the range
    675     // covered by them, use them
    676     // TODO: could use binary search
    677     // TODO: what if offset is outside range, but break is not?
    678     if (fCachedBreakPositions != NULL) {
    679         if (offset >= fCachedBreakPositions[0]
    680                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    681             fPositionInCache = 0;
    682             // We are guaranteed not to leave the array due to range test above
    683             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    684                 ++fPositionInCache;
    685             }
    686             int32_t pos = fCachedBreakPositions[fPositionInCache];
    687             utext_setNativeIndex(fText, pos);
    688             return pos;
    689         }
    690         else {
    691             reset();
    692         }
    693     }
    694 
    695     // if the offset passed in is already past the end of the text,
    696     // just return DONE; if it's before the beginning, return the
    697     // text's starting offset
    698     fLastRuleStatusIndex  = 0;
    699     fLastStatusIndexValid = TRUE;
    700     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    701         last();
    702         return next();
    703     }
    704     else if (offset < 0) {
    705         return first();
    706     }
    707 
    708     // otherwise, set our internal iteration position (temporarily)
    709     // to the position passed in.  If this is the _beginning_ position,
    710     // then we can just use next() to get our return value
    711 
    712     int32_t result = 0;
    713 
    714     if (fData->fSafeRevTable != NULL) {
    715         // new rule syntax
    716         utext_setNativeIndex(fText, offset);
    717         // move forward one codepoint to prepare for moving back to a
    718         // safe point.
    719         // this handles offset being between a supplementary character
    720         (void)UTEXT_NEXT32(fText);
    721         // handlePrevious will move most of the time to < 1 boundary away
    722         handlePrevious(fData->fSafeRevTable);
    723         int32_t result = next();
    724         while (result <= offset) {
    725             result = next();
    726         }
    727         return result;
    728     }
    729     if (fData->fSafeFwdTable != NULL) {
    730         // backup plan if forward safe table is not available
    731         utext_setNativeIndex(fText, offset);
    732         (void)UTEXT_PREVIOUS32(fText);
    733         // handle next will give result >= offset
    734         handleNext(fData->fSafeFwdTable);
    735         // previous will give result 0 or 1 boundary away from offset,
    736         // most of the time
    737         // we have to
    738         int32_t oldresult = previous();
    739         while (oldresult > offset) {
    740             int32_t result = previous();
    741             if (result <= offset) {
    742                 return oldresult;
    743             }
    744             oldresult = result;
    745         }
    746         int32_t result = next();
    747         if (result <= offset) {
    748             return next();
    749         }
    750         return result;
    751     }
    752     // otherwise, we have to sync up first.  Use handlePrevious() to back
    753     // up to a known break position before the specified position (if
    754     // we can determine that the specified position is a break position,
    755     // we don't back up at all).  This may or may not be the last break
    756     // position at or before our starting position.  Advance forward
    757     // from here until we've passed the starting position.  The position
    758     // we stop on will be the first break position after the specified one.
    759     // old rule syntax
    760 
    761     utext_setNativeIndex(fText, offset);
    762     if (offset==0 ||
    763         (offset==1  && utext_getNativeIndex(fText)==0)) {
    764         return next();
    765     }
    766     result = previous();
    767 
    768     while (result != BreakIterator::DONE && result <= offset) {
    769         result = next();
    770     }
    771 
    772     return result;
    773 }
    774 
    775 /**
    776  * Sets the iterator to refer to the last boundary position before the
    777  * specified position.
    778  * @offset The position to begin searching for a break from.
    779  * @return The position of the last boundary before the starting position.
    780  */
    781 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    782     // if we have cached break positions and offset is in the range
    783     // covered by them, use them
    784     if (fCachedBreakPositions != NULL) {
    785         // TODO: binary search?
    786         // TODO: What if offset is outside range, but break is not?
    787         if (offset > fCachedBreakPositions[0]
    788                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    789             fPositionInCache = 0;
    790             while (fPositionInCache < fNumCachedBreakPositions
    791                    && offset > fCachedBreakPositions[fPositionInCache])
    792                 ++fPositionInCache;
    793             --fPositionInCache;
    794             // If we're at the beginning of the cache, need to reevaluate the
    795             // rule status
    796             if (fPositionInCache <= 0) {
    797                 fLastStatusIndexValid = FALSE;
    798             }
    799             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    800             return fCachedBreakPositions[fPositionInCache];
    801         }
    802         else {
    803             reset();
    804         }
    805     }
    806 
    807     // if the offset passed in is already past the end of the text,
    808     // just return DONE; if it's before the beginning, return the
    809     // text's starting offset
    810     if (fText == NULL || offset > utext_nativeLength(fText)) {
    811         // return BreakIterator::DONE;
    812         return last();
    813     }
    814     else if (offset < 0) {
    815         return first();
    816     }
    817 
    818     // if we start by updating the current iteration position to the
    819     // position specified by the caller, we can just use previous()
    820     // to carry out this operation
    821 
    822     if (fData->fSafeFwdTable != NULL) {
    823         // new rule syntax
    824         utext_setNativeIndex(fText, offset);
    825         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    826         if (newOffset != offset) {
    827             // Will come here if specified offset was not a code point boundary AND
    828             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    829             //   indices to the containing code point.
    830             // For breakitereator::preceding only, these non-code-point indices need to be moved
    831             //   up to refer to the following codepoint.
    832             (void)UTEXT_NEXT32(fText);
    833             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    834         }
    835 
    836         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    837         //        rather than adjusting the position unconditionally?
    838         //        (Change would interact with safe rules.)
    839         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    840         //        affects only preceding(), seems cleaner, but is slightly different.
    841         (void)UTEXT_PREVIOUS32(fText);
    842         handleNext(fData->fSafeFwdTable);
    843         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    844         while (result >= offset) {
    845             result = previous();
    846         }
    847         return result;
    848     }
    849     if (fData->fSafeRevTable != NULL) {
    850         // backup plan if forward safe table is not available
    851         //  TODO:  check whether this path can be discarded
    852         //         It's probably OK to say that rules must supply both safe tables
    853         //            if they use safe tables at all.  We have certainly never described
    854         //            to anyone how to work with just one safe table.
    855         utext_setNativeIndex(fText, offset);
    856         (void)UTEXT_NEXT32(fText);
    857 
    858         // handle previous will give result <= offset
    859         handlePrevious(fData->fSafeRevTable);
    860 
    861         // next will give result 0 or 1 boundary away from offset,
    862         // most of the time
    863         // we have to
    864         int32_t oldresult = next();
    865         while (oldresult < offset) {
    866             int32_t result = next();
    867             if (result >= offset) {
    868                 return oldresult;
    869             }
    870             oldresult = result;
    871         }
    872         int32_t result = previous();
    873         if (result >= offset) {
    874             return previous();
    875         }
    876         return result;
    877     }
    878 
    879     // old rule syntax
    880     utext_setNativeIndex(fText, offset);
    881     return previous();
    882 }
    883 
    884 /**
    885  * Returns true if the specfied position is a boundary position.  As a side
    886  * effect, leaves the iterator pointing to the first boundary position at
    887  * or after "offset".
    888  * @param offset the offset to check.
    889  * @return True if "offset" is a boundary position.
    890  */
    891 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    892     // the beginning index of the iterator is always a boundary position by definition
    893     if (offset == 0) {
    894         first();       // For side effects on current position, tag values.
    895         return TRUE;
    896     }
    897 
    898     if (offset == (int32_t)utext_nativeLength(fText)) {
    899         last();       // For side effects on current position, tag values.
    900         return TRUE;
    901     }
    902 
    903     // out-of-range indexes are never boundary positions
    904     if (offset < 0) {
    905         first();       // For side effects on current position, tag values.
    906         return FALSE;
    907     }
    908 
    909     if (offset > utext_nativeLength(fText)) {
    910         last();        // For side effects on current position, tag values.
    911         return FALSE;
    912     }
    913 
    914     // otherwise, we can use following() on the position before the specified
    915     // one and return true if the position we get back is the one the user
    916     // specified
    917     utext_previous32From(fText, offset);
    918     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    919     UBool    result  = following(backOne) == offset;
    920     return result;
    921 }
    922 
    923 /**
    924  * Returns the current iteration position.
    925  * @return The current iteration position.
    926  */
    927 int32_t RuleBasedBreakIterator::current(void) const {
    928     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    929     return pos;
    930 }
    931 
    932 //=======================================================================
    933 // implementation
    934 //=======================================================================
    935 
    936 //
    937 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    938 //                 of user text.  A variable with this enum type keeps track of where we
    939 //                 are.  The state machine only fetches user input while in the RUN mode.
    940 //
    941 enum RBBIRunMode {
    942     RBBI_START,     // state machine processing is before first char of input
    943     RBBI_RUN,       // state machine processing is in the user text
    944     RBBI_END        // state machine processing is after end of user text.
    945 };
    946 
    947 
    948 //-----------------------------------------------------------------------------------
    949 //
    950 //  handleNext(stateTable)
    951 //     This method is the actual implementation of the rbbi next() method.
    952 //     This method initializes the state machine to state 1
    953 //     and advances through the text character by character until we reach the end
    954 //     of the text or the state machine transitions to state 0.  We update our return
    955 //     value every time the state machine passes through an accepting state.
    956 //
    957 //-----------------------------------------------------------------------------------
    958 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    959     int32_t             state;
    960     int16_t             category        = 0;
    961     RBBIRunMode         mode;
    962 
    963     RBBIStateTableRow  *row;
    964     UChar32             c;
    965     int32_t             lookaheadStatus = 0;
    966     int32_t             lookaheadTagIdx = 0;
    967     int32_t             result          = 0;
    968     int32_t             initialPosition = 0;
    969     int32_t             lookaheadResult = 0;
    970     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    971     const char         *tableData       = statetable->fTableData;
    972     uint32_t            tableRowLen     = statetable->fRowLen;
    973 
    974     #ifdef RBBI_DEBUG
    975         if (fTrace) {
    976             RBBIDebugPuts("Handle Next   pos   char  state category");
    977         }
    978     #endif
    979 
    980     // No matter what, handleNext alway correctly sets the break tag value.
    981     fLastStatusIndexValid = TRUE;
    982     fLastRuleStatusIndex = 0;
    983 
    984     // if we're already at the end of the text, return DONE.
    985     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    986     result          = initialPosition;
    987     c               = UTEXT_NEXT32(fText);
    988     if (fData == NULL || c==U_SENTINEL) {
    989         return BreakIterator::DONE;
    990     }
    991 
    992     //  Set the initial state for the state machine
    993     state = START_STATE;
    994     row = (RBBIStateTableRow *)
    995             //(statetable->fTableData + (statetable->fRowLen * state));
    996             (tableData + tableRowLen * state);
    997 
    998 
    999     mode     = RBBI_RUN;
   1000     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1001         category = 2;
   1002         mode     = RBBI_START;
   1003     }
   1004 
   1005 
   1006     // loop until we reach the end of the text or transition to state 0
   1007     //
   1008     for (;;) {
   1009         if (c == U_SENTINEL) {
   1010             // Reached end of input string.
   1011             if (mode == RBBI_END) {
   1012                 // We have already run the loop one last time with the
   1013                 //   character set to the psueudo {eof} value.  Now it is time
   1014                 //   to unconditionally bail out.
   1015                 if (lookaheadResult > result) {
   1016                     // We ran off the end of the string with a pending look-ahead match.
   1017                     // Treat this as if the look-ahead condition had been met, and return
   1018                     //  the match at the / position from the look-ahead rule.
   1019                     result               = lookaheadResult;
   1020                     fLastRuleStatusIndex = lookaheadTagIdx;
   1021                     lookaheadStatus = 0;
   1022                 }
   1023                 break;
   1024             }
   1025             // Run the loop one last time with the fake end-of-input character category.
   1026             mode = RBBI_END;
   1027             category = 1;
   1028         }
   1029 
   1030         //
   1031         // Get the char category.  An incoming category of 1 or 2 means that
   1032         //      we are preset for doing the beginning or end of input, and
   1033         //      that we shouldn't get a category from an actual text input character.
   1034         //
   1035         if (mode == RBBI_RUN) {
   1036             // look up the current character's character category, which tells us
   1037             // which column in the state table to look at.
   1038             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1039             //        not the size of the character going in, which is a UChar32.
   1040             //
   1041             UTRIE_GET16(&fData->fTrie, c, category);
   1042 
   1043             // Check the dictionary bit in the character's category.
   1044             //    Counter is only used by dictionary based iterators (subclasses).
   1045             //    Chars that need to be handled by a dictionary have a flag bit set
   1046             //    in their category values.
   1047             //
   1048             if ((category & 0x4000) != 0)  {
   1049                 fDictionaryCharCount++;
   1050                 //  And off the dictionary flag bit.
   1051                 category &= ~0x4000;
   1052             }
   1053         }
   1054 
   1055         #ifdef RBBI_DEBUG
   1056             if (fTrace) {
   1057                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1058                 if (0x20<=c && c<0x7f) {
   1059                     RBBIDebugPrintf("\"%c\"  ", c);
   1060                 } else {
   1061                     RBBIDebugPrintf("%5x  ", c);
   1062                 }
   1063                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1064             }
   1065         #endif
   1066 
   1067         // State Transition - move machine to its next state
   1068         //
   1069         state = row->fNextState[category];
   1070         row = (RBBIStateTableRow *)
   1071             // (statetable->fTableData + (statetable->fRowLen * state));
   1072             (tableData + tableRowLen * state);
   1073 
   1074 
   1075         if (row->fAccepting == -1) {
   1076             // Match found, common case.
   1077             if (mode != RBBI_START) {
   1078                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1079             }
   1080             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1081         }
   1082 
   1083         if (row->fLookAhead != 0) {
   1084             if (lookaheadStatus != 0
   1085                 && row->fAccepting == lookaheadStatus) {
   1086                 // Lookahead match is completed.
   1087                 result               = lookaheadResult;
   1088                 fLastRuleStatusIndex = lookaheadTagIdx;
   1089                 lookaheadStatus      = 0;
   1090                 // TODO:  make a standalone hard break in a rule work.
   1091                 if (lookAheadHardBreak) {
   1092                     UTEXT_SETNATIVEINDEX(fText, result);
   1093                     return result;
   1094                 }
   1095                 // Look-ahead completed, but other rules may match further.  Continue on
   1096                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1097                 goto continueOn;
   1098             }
   1099 
   1100             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1101             lookaheadResult = r;
   1102             lookaheadStatus = row->fLookAhead;
   1103             lookaheadTagIdx = row->fTagIdx;
   1104             goto continueOn;
   1105         }
   1106 
   1107 
   1108         if (row->fAccepting != 0) {
   1109             // Because this is an accepting state, any in-progress look-ahead match
   1110             //   is no longer relavant.  Clear out the pending lookahead status.
   1111             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1112         }
   1113 
   1114 continueOn:
   1115         if (state == STOP_STATE) {
   1116             // This is the normal exit from the lookup state machine.
   1117             // We have advanced through the string until it is certain that no
   1118             //   longer match is possible, no matter what characters follow.
   1119             break;
   1120         }
   1121 
   1122         // Advance to the next character.
   1123         // If this is a beginning-of-input loop iteration, don't advance
   1124         //    the input position.  The next iteration will be processing the
   1125         //    first real input character.
   1126         if (mode == RBBI_RUN) {
   1127             c = UTEXT_NEXT32(fText);
   1128         } else {
   1129             if (mode == RBBI_START) {
   1130                 mode = RBBI_RUN;
   1131             }
   1132         }
   1133 
   1134 
   1135     }
   1136 
   1137     // The state machine is done.  Check whether it found a match...
   1138 
   1139     // If the iterator failed to advance in the match engine, force it ahead by one.
   1140     //   (This really indicates a defect in the break rules.  They should always match
   1141     //    at least one character.)
   1142     if (result == initialPosition) {
   1143         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1144         UTEXT_NEXT32(fText);
   1145         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1146     }
   1147 
   1148     // Leave the iterator at our result position.
   1149     UTEXT_SETNATIVEINDEX(fText, result);
   1150     #ifdef RBBI_DEBUG
   1151         if (fTrace) {
   1152             RBBIDebugPrintf("result = %d\n\n", result);
   1153         }
   1154     #endif
   1155     return result;
   1156 }
   1157 
   1158 
   1159 
   1160 //-----------------------------------------------------------------------------------
   1161 //
   1162 //  handlePrevious()
   1163 //
   1164 //      Iterate backwards, according to the logic of the reverse rules.
   1165 //      This version handles the exact style backwards rules.
   1166 //
   1167 //      The logic of this function is very similar to handleNext(), above.
   1168 //
   1169 //-----------------------------------------------------------------------------------
   1170 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1171     int32_t             state;
   1172     int16_t             category        = 0;
   1173     RBBIRunMode         mode;
   1174     RBBIStateTableRow  *row;
   1175     UChar32             c;
   1176     int32_t             lookaheadStatus = 0;
   1177     int32_t             result          = 0;
   1178     int32_t             initialPosition = 0;
   1179     int32_t             lookaheadResult = 0;
   1180     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1181 
   1182     #ifdef RBBI_DEBUG
   1183         if (fTrace) {
   1184             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1185         }
   1186     #endif
   1187 
   1188     // handlePrevious() never gets the rule status.
   1189     // Flag the status as invalid; if the user ever asks for status, we will need
   1190     // to back up, then re-find the break position using handleNext(), which does
   1191     // get the status value.
   1192     fLastStatusIndexValid = FALSE;
   1193     fLastRuleStatusIndex = 0;
   1194 
   1195     // if we're already at the start of the text, return DONE.
   1196     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1197         return BreakIterator::DONE;
   1198     }
   1199 
   1200     //  Set up the starting char.
   1201     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1202     result          = initialPosition;
   1203     c               = UTEXT_PREVIOUS32(fText);
   1204 
   1205     //  Set the initial state for the state machine
   1206     state = START_STATE;
   1207     row = (RBBIStateTableRow *)
   1208             (statetable->fTableData + (statetable->fRowLen * state));
   1209     category = 3;
   1210     mode     = RBBI_RUN;
   1211     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1212         category = 2;
   1213         mode     = RBBI_START;
   1214     }
   1215 
   1216 
   1217     // loop until we reach the start of the text or transition to state 0
   1218     //
   1219     for (;;) {
   1220         if (c == U_SENTINEL) {
   1221             // Reached end of input string.
   1222             if (mode == RBBI_END) {
   1223                 // We have already run the loop one last time with the
   1224                 //   character set to the psueudo {eof} value.  Now it is time
   1225                 //   to unconditionally bail out.
   1226                 if (lookaheadResult < result) {
   1227                     // We ran off the end of the string with a pending look-ahead match.
   1228                     // Treat this as if the look-ahead condition had been met, and return
   1229                     //  the match at the / position from the look-ahead rule.
   1230                     result               = lookaheadResult;
   1231                     lookaheadStatus = 0;
   1232                 } else if (result == initialPosition) {
   1233                     // Ran off start, no match found.
   1234                     // move one index one (towards the start, since we are doing a previous())
   1235                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1236                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1237                 }
   1238                 break;
   1239             }
   1240             // Run the loop one last time with the fake end-of-input character category.
   1241             mode = RBBI_END;
   1242             category = 1;
   1243         }
   1244 
   1245         //
   1246         // Get the char category.  An incoming category of 1 or 2 means that
   1247         //      we are preset for doing the beginning or end of input, and
   1248         //      that we shouldn't get a category from an actual text input character.
   1249         //
   1250         if (mode == RBBI_RUN) {
   1251             // look up the current character's character category, which tells us
   1252             // which column in the state table to look at.
   1253             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1254             //        not the size of the character going in, which is a UChar32.
   1255             //
   1256             UTRIE_GET16(&fData->fTrie, c, category);
   1257 
   1258             // Check the dictionary bit in the character's category.
   1259             //    Counter is only used by dictionary based iterators (subclasses).
   1260             //    Chars that need to be handled by a dictionary have a flag bit set
   1261             //    in their category values.
   1262             //
   1263             if ((category & 0x4000) != 0)  {
   1264                 fDictionaryCharCount++;
   1265                 //  And off the dictionary flag bit.
   1266                 category &= ~0x4000;
   1267             }
   1268         }
   1269 
   1270         #ifdef RBBI_DEBUG
   1271             if (fTrace) {
   1272                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1273                 if (0x20<=c && c<0x7f) {
   1274                     RBBIDebugPrintf("\"%c\"  ", c);
   1275                 } else {
   1276                     RBBIDebugPrintf("%5x  ", c);
   1277                 }
   1278                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1279             }
   1280         #endif
   1281 
   1282         // State Transition - move machine to its next state
   1283         //
   1284         state = row->fNextState[category];
   1285         row = (RBBIStateTableRow *)
   1286             (statetable->fTableData + (statetable->fRowLen * state));
   1287 
   1288         if (row->fAccepting == -1) {
   1289             // Match found, common case.
   1290             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1291         }
   1292 
   1293         if (row->fLookAhead != 0) {
   1294             if (lookaheadStatus != 0
   1295                 && row->fAccepting == lookaheadStatus) {
   1296                 // Lookahead match is completed.
   1297                 result               = lookaheadResult;
   1298                 lookaheadStatus      = 0;
   1299                 // TODO:  make a standalone hard break in a rule work.
   1300                 if (lookAheadHardBreak) {
   1301                     UTEXT_SETNATIVEINDEX(fText, result);
   1302                     return result;
   1303                 }
   1304                 // Look-ahead completed, but other rules may match further.  Continue on
   1305                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1306                 goto continueOn;
   1307             }
   1308 
   1309             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1310             lookaheadResult = r;
   1311             lookaheadStatus = row->fLookAhead;
   1312             goto continueOn;
   1313         }
   1314 
   1315 
   1316         if (row->fAccepting != 0) {
   1317             // Because this is an accepting state, any in-progress look-ahead match
   1318             //   is no longer relavant.  Clear out the pending lookahead status.
   1319             lookaheadStatus = 0;
   1320         }
   1321 
   1322 continueOn:
   1323         if (state == STOP_STATE) {
   1324             // This is the normal exit from the lookup state machine.
   1325             // We have advanced through the string until it is certain that no
   1326             //   longer match is possible, no matter what characters follow.
   1327             break;
   1328         }
   1329 
   1330         // Move (backwards) to the next character to process.
   1331         // If this is a beginning-of-input loop iteration, don't advance
   1332         //    the input position.  The next iteration will be processing the
   1333         //    first real input character.
   1334         if (mode == RBBI_RUN) {
   1335             c = UTEXT_PREVIOUS32(fText);
   1336         } else {
   1337             if (mode == RBBI_START) {
   1338                 mode = RBBI_RUN;
   1339             }
   1340         }
   1341     }
   1342 
   1343     // The state machine is done.  Check whether it found a match...
   1344 
   1345     // If the iterator failed to advance in the match engine, force it ahead by one.
   1346     //   (This really indicates a defect in the break rules.  They should always match
   1347     //    at least one character.)
   1348     if (result == initialPosition) {
   1349         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1350         UTEXT_PREVIOUS32(fText);
   1351         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1352     }
   1353 
   1354     // Leave the iterator at our result position.
   1355     UTEXT_SETNATIVEINDEX(fText, result);
   1356     #ifdef RBBI_DEBUG
   1357         if (fTrace) {
   1358             RBBIDebugPrintf("result = %d\n\n", result);
   1359         }
   1360     #endif
   1361     return result;
   1362 }
   1363 
   1364 
   1365 void
   1366 RuleBasedBreakIterator::reset()
   1367 {
   1368     if (fCachedBreakPositions) {
   1369         uprv_free(fCachedBreakPositions);
   1370     }
   1371     fCachedBreakPositions = NULL;
   1372     fNumCachedBreakPositions = 0;
   1373     fDictionaryCharCount = 0;
   1374     fPositionInCache = 0;
   1375 }
   1376 
   1377 
   1378 
   1379 //-------------------------------------------------------------------------------
   1380 //
   1381 //   getRuleStatus()   Return the break rule tag associated with the current
   1382 //                     iterator position.  If the iterator arrived at its current
   1383 //                     position by iterating forwards, the value will have been
   1384 //                     cached by the handleNext() function.
   1385 //
   1386 //                     If no cached status value is available, the status is
   1387 //                     found by doing a previous() followed by a next(), which
   1388 //                     leaves the iterator where it started, and computes the
   1389 //                     status while doing the next().
   1390 //
   1391 //-------------------------------------------------------------------------------
   1392 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1393     if (fLastStatusIndexValid == FALSE) {
   1394         //  No cached status is available.
   1395         if (fText == NULL || current() == 0) {
   1396             //  At start of text, or there is no text.  Status is always zero.
   1397             fLastRuleStatusIndex = 0;
   1398             fLastStatusIndexValid = TRUE;
   1399         } else {
   1400             //  Not at start of text.  Find status the tedious way.
   1401             int32_t pa = current();
   1402             previous();
   1403             if (fNumCachedBreakPositions > 0) {
   1404                 reset();                // Blow off the dictionary cache
   1405             }
   1406             int32_t pb = next();
   1407             if (pa != pb) {
   1408                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1409                 //       unused local variables on gcc.  Logically, it isn't needed.
   1410                 U_ASSERT(pa == pb);
   1411             }
   1412         }
   1413     }
   1414     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1415 }
   1416 
   1417 
   1418 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1419     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1420     nonConstThis->makeRuleStatusValid();
   1421 
   1422     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1423     //                                                 (the number of status values.)
   1424     //   This function returns the last (largest) of the array of status values.
   1425     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1426     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1427 
   1428     return tagVal;
   1429 }
   1430 
   1431 
   1432 
   1433 
   1434 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1435              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1436 {
   1437     if (U_FAILURE(status)) {
   1438         return 0;
   1439     }
   1440 
   1441     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1442     nonConstThis->makeRuleStatusValid();
   1443     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1444     int32_t  numValsToCopy = numVals;
   1445     if (numVals > capacity) {
   1446         status = U_BUFFER_OVERFLOW_ERROR;
   1447         numValsToCopy = capacity;
   1448     }
   1449     int i;
   1450     for (i=0; i<numValsToCopy; i++) {
   1451         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1452     }
   1453     return numVals;
   1454 }
   1455 
   1456 
   1457 
   1458 //-------------------------------------------------------------------------------
   1459 //
   1460 //   getBinaryRules        Access to the compiled form of the rules,
   1461 //                         for use by build system tools that save the data
   1462 //                         for standard iterator types.
   1463 //
   1464 //-------------------------------------------------------------------------------
   1465 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1466     const uint8_t  *retPtr = NULL;
   1467     length = 0;
   1468 
   1469     if (fData != NULL) {
   1470         retPtr = (const uint8_t *)fData->fHeader;
   1471         length = fData->fHeader->fLength;
   1472     }
   1473     return retPtr;
   1474 }
   1475 
   1476 
   1477 
   1478 
   1479 //-------------------------------------------------------------------------------
   1480 //
   1481 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1482 //                    Saving one heap allocation isn't worth the trouble.
   1483 //                    Cloning shouldn't be done in tight loops, and
   1484 //                    making the clone copy involves other heap operations anyway.
   1485 //                    And the application code for correctly dealing with buffer
   1486 //                    size problems and the eventual object destruction is ugly.
   1487 //
   1488 //-------------------------------------------------------------------------------
   1489 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1490                                    int32_t &bufferSize,
   1491                                    UErrorCode &status)
   1492 {
   1493     if (U_FAILURE(status)){
   1494         return NULL;
   1495     }
   1496 
   1497     //
   1498     //  If user buffer size is zero this is a preflight operation to
   1499     //    obtain the needed buffer size, allowing for worst case misalignment.
   1500     //
   1501     if (bufferSize == 0) {
   1502         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1503         return NULL;
   1504     }
   1505 
   1506 
   1507     //
   1508     //  Check the alignment and size of the user supplied buffer.
   1509     //  Allocate heap memory if the user supplied memory is insufficient.
   1510     //
   1511     char    *buf   = (char *)stackBuffer;
   1512     uint32_t s      = bufferSize;
   1513 
   1514     if (stackBuffer == NULL) {
   1515         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1516     }
   1517     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1518         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1519         s   -= offsetUp;
   1520         buf += offsetUp;
   1521     }
   1522     if (s < sizeof(RuleBasedBreakIterator)) {
   1523         // Not enough room in the caller-supplied buffer.
   1524         // Do a plain-vanilla heap based clone and return that, along with
   1525         //   a warning that the clone was allocated.
   1526         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1527         if (clonedBI == 0) {
   1528             status = U_MEMORY_ALLOCATION_ERROR;
   1529         } else {
   1530             status = U_SAFECLONE_ALLOCATED_WARNING;
   1531         }
   1532         return clonedBI;
   1533     }
   1534 
   1535     //
   1536     //  Clone the source BI into the caller-supplied buffer.
   1537     //
   1538     RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
   1539     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1540 
   1541     return clone;
   1542 }
   1543 
   1544 
   1545 //-------------------------------------------------------------------------------
   1546 //
   1547 //  isDictionaryChar      Return true if the category lookup for this char
   1548 //                        indicates that it is in the set of dictionary lookup
   1549 //                        chars.
   1550 //
   1551 //                        This function is intended for use by dictionary based
   1552 //                        break iterators.
   1553 //
   1554 //-------------------------------------------------------------------------------
   1555 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1556     if (fData == NULL) {
   1557         return FALSE;
   1558     }
   1559     uint16_t category;
   1560     UTRIE_GET16(&fData->fTrie, c, category);
   1561     return (category & 0x4000) != 0;
   1562 }*/
   1563 
   1564 
   1565 //-------------------------------------------------------------------------------
   1566 //
   1567 //  checkDictionary       This function handles all processing of characters in
   1568 //                        the "dictionary" set. It will determine the appropriate
   1569 //                        course of action, and possibly set up a cache in the
   1570 //                        process.
   1571 //
   1572 //-------------------------------------------------------------------------------
   1573 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1574                             int32_t endPos,
   1575                             UBool reverse) {
   1576     // Reset the old break cache first.
   1577     uint32_t dictionaryCount = fDictionaryCharCount;
   1578     reset();
   1579 
   1580     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
   1581         return (reverse ? startPos : endPos);
   1582     }
   1583 
   1584     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
   1585     //      because native indexes are different from UTF-16 indexes.
   1586     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
   1587     //      It wont give the right breaks, but it's better than a crash.
   1588     //
   1589     //      Check the type of the UText by checking its pFuncs field, which
   1590     //      is UText's function dispatch table.  It will be the same for all
   1591     //      UTF-8 UTexts and different for any other UText type.
   1592     //
   1593     //      We have no other type of UText available with non-UTF-16 native indexing.
   1594     //      This whole check will go away once the dictionary code is fixed.
   1595     static const void *utext_utf8Funcs;
   1596     if (utext_utf8Funcs == NULL) {
   1597         // Cache the UTF-8 UText function pointer value.
   1598         UErrorCode status = U_ZERO_ERROR;
   1599         UText tempUText = UTEXT_INITIALIZER;
   1600         utext_openUTF8(&tempUText, NULL, 0, &status);
   1601         utext_utf8Funcs = tempUText.pFuncs;
   1602         utext_close(&tempUText);
   1603     }
   1604     if (fText->pFuncs == utext_utf8Funcs) {
   1605         return (reverse ? startPos : endPos);
   1606     }
   1607 
   1608     // Starting from the starting point, scan towards the proposed result,
   1609     // looking for the first dictionary character (which may be the one
   1610     // we're on, if we're starting in the middle of a range).
   1611     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1612     if (reverse) {
   1613         UTEXT_PREVIOUS32(fText);
   1614     }
   1615 
   1616     int32_t rangeStart = startPos;
   1617     int32_t rangeEnd = endPos;
   1618 
   1619     uint16_t    category;
   1620     int32_t     current;
   1621     UErrorCode  status = U_ZERO_ERROR;
   1622     UStack      breaks(status);
   1623     int32_t     foundBreakCount = 0;
   1624     UChar32     c = utext_current32(fText);
   1625 
   1626     UTRIE_GET16(&fData->fTrie, c, category);
   1627 
   1628     // Is the character we're starting on a dictionary character? If so, we
   1629     // need to back up to include the entire run; otherwise the results of
   1630     // the break algorithm will differ depending on where we start. Since
   1631     // the result is cached and there is typically a non-dictionary break
   1632     // within a small number of words, there should be little performance impact.
   1633     if (category & 0x4000) {
   1634         if (reverse) {
   1635             do {
   1636                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1637                 c = utext_current32(fText);
   1638                 UTRIE_GET16(&fData->fTrie, c, category);
   1639             } while (c != U_SENTINEL && (category & 0x4000));
   1640             // Back up to the last dictionary character
   1641             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1642             if (c == U_SENTINEL) {
   1643                 // c = fText->last32();
   1644                 //   TODO:  why was this if needed?
   1645                 c = UTEXT_PREVIOUS32(fText);
   1646             }
   1647             else {
   1648                 c = UTEXT_PREVIOUS32(fText);
   1649             }
   1650         }
   1651         else {
   1652             do {
   1653                 c = UTEXT_PREVIOUS32(fText);
   1654                 UTRIE_GET16(&fData->fTrie, c, category);
   1655             }
   1656             while (c != U_SENTINEL && (category & 0x4000));
   1657             // Back up to the last dictionary character
   1658             if (c == U_SENTINEL) {
   1659                 // c = fText->first32();
   1660                 c = utext_current32(fText);
   1661             }
   1662             else {
   1663                 utext_next32(fText);
   1664                 c = utext_current32(fText);
   1665             }
   1666             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1667         }
   1668         UTRIE_GET16(&fData->fTrie, c, category);
   1669     }
   1670 
   1671     // Loop through the text, looking for ranges of dictionary characters.
   1672     // For each span, find the appropriate break engine, and ask it to find
   1673     // any breaks within the span.
   1674     // Note: we always do this in the forward direction, so that the break
   1675     // cache is built in the right order.
   1676     if (reverse) {
   1677         utext_setNativeIndex(fText, rangeStart);
   1678         c = utext_current32(fText);
   1679         UTRIE_GET16(&fData->fTrie, c, category);
   1680     }
   1681     while(U_SUCCESS(status)) {
   1682         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1683             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1684             c = utext_current32(fText);
   1685             UTRIE_GET16(&fData->fTrie, c, category);
   1686         }
   1687         if (current >= rangeEnd) {
   1688             break;
   1689         }
   1690 
   1691         // We now have a dictionary character. Get the appropriate language object
   1692         // to deal with it.
   1693         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1694 
   1695         // Ask the language object if there are any breaks. It will leave the text
   1696         // pointer on the other side of its range, ready to search for the next one.
   1697         if (lbe != NULL) {
   1698             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1699         }
   1700 
   1701         // Reload the loop variables for the next go-round
   1702         c = utext_current32(fText);
   1703         UTRIE_GET16(&fData->fTrie, c, category);
   1704     }
   1705 
   1706     // If we found breaks, build a new break cache. The first and last entries must
   1707     // be the original starting and ending position.
   1708     if (foundBreakCount > 0) {
   1709         int32_t totalBreaks = foundBreakCount;
   1710         if (startPos < breaks.elementAti(0)) {
   1711             totalBreaks += 1;
   1712         }
   1713         if (endPos > breaks.peeki()) {
   1714             totalBreaks += 1;
   1715         }
   1716         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1717         if (fCachedBreakPositions != NULL) {
   1718             int32_t out = 0;
   1719             fNumCachedBreakPositions = totalBreaks;
   1720             if (startPos < breaks.elementAti(0)) {
   1721                 fCachedBreakPositions[out++] = startPos;
   1722             }
   1723             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1724                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1725             }
   1726             if (endPos > fCachedBreakPositions[out-1]) {
   1727                 fCachedBreakPositions[out] = endPos;
   1728             }
   1729             // If there are breaks, then by definition, we are replacing the original
   1730             // proposed break by one of the breaks we found. Use following() and
   1731             // preceding() to do the work. They should never recurse in this case.
   1732             if (reverse) {
   1733                 return preceding(endPos - 1);
   1734             }
   1735             else {
   1736                 return following(startPos);
   1737             }
   1738         }
   1739         // If the allocation failed, just fall through to the "no breaks found" case.
   1740     }
   1741 
   1742     // If we get here, there were no language-based breaks. Set the text pointer
   1743     // to the original proposed break.
   1744     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1745     return (reverse ? startPos : endPos);
   1746 }
   1747 
   1748 U_NAMESPACE_END
   1749 
   1750 // defined in ucln_cmn.h
   1751 
   1752 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1753 
   1754 /**
   1755  * Release all static memory held by breakiterator.
   1756  */
   1757 U_CDECL_BEGIN
   1758 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1759     if (gLanguageBreakFactories) {
   1760         delete gLanguageBreakFactories;
   1761         gLanguageBreakFactories = NULL;
   1762     }
   1763     return TRUE;
   1764 }
   1765 U_CDECL_END
   1766 
   1767 U_CDECL_BEGIN
   1768 static void U_CALLCONV _deleteFactory(void *obj) {
   1769     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1770 }
   1771 U_CDECL_END
   1772 U_NAMESPACE_BEGIN
   1773 
   1774 static const LanguageBreakEngine*
   1775 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1776 {
   1777     UBool       needsInit;
   1778     UErrorCode  status = U_ZERO_ERROR;
   1779     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1780 
   1781     if (needsInit) {
   1782         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1783         if (factories != NULL && U_SUCCESS(status)) {
   1784             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1785             factories->push(builtIn, status);
   1786 #ifdef U_LOCAL_SERVICE_HOOK
   1787             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1788             if (extra != NULL) {
   1789                 factories->push(extra, status);
   1790             }
   1791 #endif
   1792         }
   1793         umtx_lock(NULL);
   1794         if (gLanguageBreakFactories == NULL) {
   1795             gLanguageBreakFactories = factories;
   1796             factories = NULL;
   1797             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1798         }
   1799         umtx_unlock(NULL);
   1800         delete factories;
   1801     }
   1802 
   1803     if (gLanguageBreakFactories == NULL) {
   1804         return NULL;
   1805     }
   1806 
   1807     int32_t i = gLanguageBreakFactories->size();
   1808     const LanguageBreakEngine *lbe = NULL;
   1809     while (--i >= 0) {
   1810         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1811         lbe = factory->getEngineFor(c, breakType);
   1812         if (lbe != NULL) {
   1813             break;
   1814         }
   1815     }
   1816     return lbe;
   1817 }
   1818 
   1819 
   1820 //-------------------------------------------------------------------------------
   1821 //
   1822 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1823 //                          the characer c.
   1824 //
   1825 //-------------------------------------------------------------------------------
   1826 const LanguageBreakEngine *
   1827 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1828     const LanguageBreakEngine *lbe = NULL;
   1829     UErrorCode status = U_ZERO_ERROR;
   1830 
   1831     if (fLanguageBreakEngines == NULL) {
   1832         fLanguageBreakEngines = new UStack(status);
   1833         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1834             delete fLanguageBreakEngines;
   1835             fLanguageBreakEngines = 0;
   1836             return NULL;
   1837         }
   1838     }
   1839 
   1840     int32_t i = fLanguageBreakEngines->size();
   1841     while (--i >= 0) {
   1842         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1843         if (lbe->handles(c, fBreakType)) {
   1844             return lbe;
   1845         }
   1846     }
   1847 
   1848     // No existing dictionary took the character. See if a factory wants to
   1849     // give us a new LanguageBreakEngine for this character.
   1850     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1851 
   1852     // If we got one, use it and push it on our stack.
   1853     if (lbe != NULL) {
   1854         fLanguageBreakEngines->push((void *)lbe, status);
   1855         // Even if we can't remember it, we can keep looking it up, so
   1856         // return it even if the push fails.
   1857         return lbe;
   1858     }
   1859 
   1860     // No engine is forthcoming for this character. Add it to the
   1861     // reject set. Create the reject break engine if needed.
   1862     if (fUnhandledBreakEngine == NULL) {
   1863         fUnhandledBreakEngine = new UnhandledEngine(status);
   1864         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1865             status = U_MEMORY_ALLOCATION_ERROR;
   1866         }
   1867         // Put it last so that scripts for which we have an engine get tried
   1868         // first.
   1869         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1870         // If we can't insert it, or creation failed, get rid of it
   1871         if (U_FAILURE(status)) {
   1872             delete fUnhandledBreakEngine;
   1873             fUnhandledBreakEngine = 0;
   1874             return NULL;
   1875         }
   1876     }
   1877 
   1878     // Tell the reject engine about the character; at its discretion, it may
   1879     // add more than just the one character.
   1880     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1881 
   1882     return fUnhandledBreakEngine;
   1883 }
   1884 
   1885 
   1886 
   1887 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1888     return fBreakType;
   1889 }*/
   1890 
   1891 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1892     fBreakType = type;
   1893     reset();
   1894 }
   1895 
   1896 U_NAMESPACE_END
   1897 
   1898 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1899