Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2014 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "utypeinfo.h"  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 
     90 //
     91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
     92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
     93 //
     94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
     95                        uint32_t       ruleLength,
     96                        UErrorCode     &status) {
     97     init();
     98     if (U_FAILURE(status)) {
     99         return;
    100     }
    101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
    102         status = U_ILLEGAL_ARGUMENT_ERROR;
    103         return;
    104     }
    105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
    106     if (data->fLength > ruleLength) {
    107         status = U_ILLEGAL_ARGUMENT_ERROR;
    108         return;
    109     }
    110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
    111     if (U_FAILURE(status)) {return;}
    112     if(fData == 0) {
    113         status = U_MEMORY_ALLOCATION_ERROR;
    114         return;
    115     }
    116 }
    117 
    118 
    119 //-------------------------------------------------------------------------------
    120 //
    121 //   Constructor   from a UDataMemory handle to precompiled break rules
    122 //                 stored in an ICU data file.
    123 //
    124 //-------------------------------------------------------------------------------
    125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
    126 {
    127     init();
    128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
    129     if (U_FAILURE(status)) {return;}
    130     if(fData == 0) {
    131         status = U_MEMORY_ALLOCATION_ERROR;
    132         return;
    133     }
    134 }
    135 
    136 
    137 
    138 //-------------------------------------------------------------------------------
    139 //
    140 //   Constructor       from a set of rules supplied as a string.
    141 //
    142 //-------------------------------------------------------------------------------
    143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    144                                                 UParseError          &parseError,
    145                                                 UErrorCode           &status)
    146 {
    147     init();
    148     if (U_FAILURE(status)) {return;}
    149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    152     //        creates and returns a complete RBBI.  From here, in a constructor, we
    153     //        can't just return the object created by the builder factory, hence
    154     //        the assignment of the factory created object to "this".
    155     if (U_SUCCESS(status)) {
    156         *this = *bi;
    157         delete bi;
    158     }
    159 }
    160 
    161 
    162 //-------------------------------------------------------------------------------
    163 //
    164 // Default Constructor.      Create an empty shell that can be set up later.
    165 //                           Used when creating a RuleBasedBreakIterator from a set
    166 //                           of rules.
    167 //-------------------------------------------------------------------------------
    168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    169     init();
    170 }
    171 
    172 
    173 //-------------------------------------------------------------------------------
    174 //
    175 //   Copy constructor.  Will produce a break iterator with the same behavior,
    176 //                      and which iterates over the same text, as the one passed in.
    177 //
    178 //-------------------------------------------------------------------------------
    179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    180 : BreakIterator(other)
    181 {
    182     this->init();
    183     *this = other;
    184 }
    185 
    186 
    187 /**
    188  * Destructor
    189  */
    190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    192         // fCharIter was adopted from the outside.
    193         delete fCharIter;
    194     }
    195     fCharIter = NULL;
    196     delete fSCharIter;
    197     fCharIter = NULL;
    198     delete fDCharIter;
    199     fDCharIter = NULL;
    200 
    201     utext_close(fText);
    202 
    203     if (fData != NULL) {
    204         fData->removeReference();
    205         fData = NULL;
    206     }
    207     if (fCachedBreakPositions) {
    208         uprv_free(fCachedBreakPositions);
    209         fCachedBreakPositions = NULL;
    210     }
    211     if (fLanguageBreakEngines) {
    212         delete fLanguageBreakEngines;
    213         fLanguageBreakEngines = NULL;
    214     }
    215     if (fUnhandledBreakEngine) {
    216         delete fUnhandledBreakEngine;
    217         fUnhandledBreakEngine = NULL;
    218     }
    219 }
    220 
    221 /**
    222  * Assignment operator.  Sets this iterator to have the same behavior,
    223  * and iterate over the same text, as the one passed in.
    224  */
    225 RuleBasedBreakIterator&
    226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    227     if (this == &that) {
    228         return *this;
    229     }
    230     reset();    // Delete break cache information
    231     fBreakType = that.fBreakType;
    232     if (fLanguageBreakEngines != NULL) {
    233         delete fLanguageBreakEngines;
    234         fLanguageBreakEngines = NULL;   // Just rebuild for now
    235     }
    236     // TODO: clone fLanguageBreakEngines from "that"
    237     UErrorCode status = U_ZERO_ERROR;
    238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    239 
    240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    241         delete fCharIter;
    242     }
    243     fCharIter = NULL;
    244 
    245     if (that.fCharIter != NULL ) {
    246         // This is a little bit tricky - it will intially appear that
    247         //  this->fCharIter is adopted, even if that->fCharIter was
    248         //  not adopted.  That's ok.
    249         fCharIter = that.fCharIter->clone();
    250     }
    251 
    252     if (fData != NULL) {
    253         fData->removeReference();
    254         fData = NULL;
    255     }
    256     if (that.fData != NULL) {
    257         fData = that.fData->addReference();
    258     }
    259 
    260     return *this;
    261 }
    262 
    263 
    264 
    265 //-----------------------------------------------------------------------------
    266 //
    267 //    init()      Shared initialization routine.   Used by all the constructors.
    268 //                Initializes all fields, leaving the object in a consistent state.
    269 //
    270 //-----------------------------------------------------------------------------
    271 void RuleBasedBreakIterator::init() {
    272     UErrorCode  status    = U_ZERO_ERROR;
    273     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    274     fCharIter             = NULL;
    275     fSCharIter            = NULL;
    276     fDCharIter            = NULL;
    277     fData                 = NULL;
    278     fLastRuleStatusIndex  = 0;
    279     fLastStatusIndexValid = TRUE;
    280     fDictionaryCharCount  = 0;
    281     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    282                                         //   dictionary behavior for Break Iterators that are
    283                                         //   built from rules.  Even better would be the ability to
    284                                         //   declare the type in the rules.
    285 
    286     fCachedBreakPositions    = NULL;
    287     fLanguageBreakEngines    = NULL;
    288     fUnhandledBreakEngine    = NULL;
    289     fNumCachedBreakPositions = 0;
    290     fPositionInCache         = 0;
    291 
    292 #ifdef RBBI_DEBUG
    293     static UBool debugInitDone = FALSE;
    294     if (debugInitDone == FALSE) {
    295         char *debugEnv = getenv("U_RBBIDEBUG");
    296         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    297             fTrace = TRUE;
    298         }
    299         debugInitDone = TRUE;
    300     }
    301 #endif
    302 }
    303 
    304 
    305 
    306 //-----------------------------------------------------------------------------
    307 //
    308 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    309 //            behavior, and iterating over the same text, as this one.
    310 //            Virtual function: does the right thing with subclasses.
    311 //
    312 //-----------------------------------------------------------------------------
    313 BreakIterator*
    314 RuleBasedBreakIterator::clone(void) const {
    315     return new RuleBasedBreakIterator(*this);
    316 }
    317 
    318 /**
    319  * Equality operator.  Returns TRUE if both BreakIterators are of the
    320  * same class, have the same behavior, and iterate over the same text.
    321  */
    322 UBool
    323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    324     if (typeid(*this) != typeid(that)) {
    325         return FALSE;
    326     }
    327 
    328     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    329 
    330     if (!utext_equals(fText, that2.fText)) {
    331         // The two break iterators are operating on different text,
    332         //   or have a different interation position.
    333         return FALSE;
    334     };
    335 
    336     // TODO:  need a check for when in a dictionary region at different offsets.
    337 
    338     if (that2.fData == fData ||
    339         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    340             // The two break iterators are using the same rules.
    341             return TRUE;
    342         }
    343     return FALSE;
    344 }
    345 
    346 /**
    347  * Compute a hash code for this BreakIterator
    348  * @return A hash code
    349  */
    350 int32_t
    351 RuleBasedBreakIterator::hashCode(void) const {
    352     int32_t   hash = 0;
    353     if (fData != NULL) {
    354         hash = fData->hashCode();
    355     }
    356     return hash;
    357 }
    358 
    359 
    360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    361     if (U_FAILURE(status)) {
    362         return;
    363     }
    364     reset();
    365     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    366 
    367     // Set up a dummy CharacterIterator to be returned if anyone
    368     //   calls getText().  With input from UText, there is no reasonable
    369     //   way to return a characterIterator over the actual input text.
    370     //   Return one over an empty string instead - this is the closest
    371     //   we can come to signaling a failure.
    372     //   (GetText() is obsolete, this failure is sort of OK)
    373     if (fDCharIter == NULL) {
    374         static const UChar c = 0;
    375         fDCharIter = new UCharCharacterIterator(&c, 0);
    376         if (fDCharIter == NULL) {
    377             status = U_MEMORY_ALLOCATION_ERROR;
    378             return;
    379         }
    380     }
    381 
    382     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    383         // existing fCharIter was adopted from the outside.  Delete it now.
    384         delete fCharIter;
    385     }
    386     fCharIter = fDCharIter;
    387 
    388     this->first();
    389 }
    390 
    391 
    392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    393     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    394     return result;
    395 }
    396 
    397 
    398 
    399 /**
    400  * Returns the description used to create this iterator
    401  */
    402 const UnicodeString&
    403 RuleBasedBreakIterator::getRules() const {
    404     if (fData != NULL) {
    405         return fData->getRuleSourceString();
    406     } else {
    407         static const UnicodeString *s;
    408         if (s == NULL) {
    409             // TODO:  something more elegant here.
    410             //        perhaps API should return the string by value.
    411             //        Note:  thread unsafe init & leak are semi-ok, better than
    412             //               what was before.  Sould be cleaned up, though.
    413             s = new UnicodeString;
    414         }
    415         return *s;
    416     }
    417 }
    418 
    419 //=======================================================================
    420 // BreakIterator overrides
    421 //=======================================================================
    422 
    423 /**
    424  * Return a CharacterIterator over the text being analyzed.
    425  */
    426 CharacterIterator&
    427 RuleBasedBreakIterator::getText() const {
    428     return *fCharIter;
    429 }
    430 
    431 /**
    432  * Set the iterator to analyze a new piece of text.  This function resets
    433  * the current iteration position to the beginning of the text.
    434  * @param newText An iterator over the text to analyze.
    435  */
    436 void
    437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    438     // If we are holding a CharacterIterator adopted from a
    439     //   previous call to this function, delete it now.
    440     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    441         delete fCharIter;
    442     }
    443 
    444     fCharIter = newText;
    445     UErrorCode status = U_ZERO_ERROR;
    446     reset();
    447     if (newText==NULL || newText->startIndex() != 0) {
    448         // startIndex !=0 wants to be an error, but there's no way to report it.
    449         // Make the iterator text be an empty string.
    450         fText = utext_openUChars(fText, NULL, 0, &status);
    451     } else {
    452         fText = utext_openCharacterIterator(fText, newText, &status);
    453     }
    454     this->first();
    455 }
    456 
    457 /**
    458  * Set the iterator to analyze a new piece of text.  This function resets
    459  * the current iteration position to the beginning of the text.
    460  * @param newText An iterator over the text to analyze.
    461  */
    462 void
    463 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    464     UErrorCode status = U_ZERO_ERROR;
    465     reset();
    466     fText = utext_openConstUnicodeString(fText, &newText, &status);
    467 
    468     // Set up a character iterator on the string.
    469     //   Needed in case someone calls getText().
    470     //  Can not, unfortunately, do this lazily on the (probably never)
    471     //  call to getText(), because getText is const.
    472     if (fSCharIter == NULL) {
    473         fSCharIter = new StringCharacterIterator(newText);
    474     } else {
    475         fSCharIter->setText(newText);
    476     }
    477 
    478     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    479         // old fCharIter was adopted from the outside.  Delete it.
    480         delete fCharIter;
    481     }
    482     fCharIter = fSCharIter;
    483 
    484     this->first();
    485 }
    486 
    487 
    488 /**
    489  *  Provide a new UText for the input text.  Must reference text with contents identical
    490  *  to the original.
    491  *  Intended for use with text data originating in Java (garbage collected) environments
    492  *  where the data may be moved in memory at arbitrary times.
    493  */
    494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    495     if (U_FAILURE(status)) {
    496         return *this;
    497     }
    498     if (input == NULL) {
    499         status = U_ILLEGAL_ARGUMENT_ERROR;
    500         return *this;
    501     }
    502     int64_t pos = utext_getNativeIndex(fText);
    503     //  Shallow read-only clone of the new UText into the existing input UText
    504     fText = utext_clone(fText, input, FALSE, TRUE, &status);
    505     if (U_FAILURE(status)) {
    506         return *this;
    507     }
    508     utext_setNativeIndex(fText, pos);
    509     if (utext_getNativeIndex(fText) != pos) {
    510         // Sanity check.  The new input utext is supposed to have the exact same
    511         // contents as the old.  If we can't set to the same position, it doesn't.
    512         // The contents underlying the old utext might be invalid at this point,
    513         // so it's not safe to check directly.
    514         status = U_ILLEGAL_ARGUMENT_ERROR;
    515     }
    516     return *this;
    517 }
    518 
    519 
    520 /**
    521  * Sets the current iteration position to the beginning of the text, position zero.
    522  * @return The new iterator position, which is zero.
    523  */
    524 int32_t RuleBasedBreakIterator::first(void) {
    525     reset();
    526     fLastRuleStatusIndex  = 0;
    527     fLastStatusIndexValid = TRUE;
    528     //if (fText == NULL)
    529     //    return BreakIterator::DONE;
    530 
    531     utext_setNativeIndex(fText, 0);
    532     return 0;
    533 }
    534 
    535 /**
    536  * Sets the current iteration position to the end of the text.
    537  * @return The text's past-the-end offset.
    538  */
    539 int32_t RuleBasedBreakIterator::last(void) {
    540     reset();
    541     if (fText == NULL) {
    542         fLastRuleStatusIndex  = 0;
    543         fLastStatusIndexValid = TRUE;
    544         return BreakIterator::DONE;
    545     }
    546 
    547     fLastStatusIndexValid = FALSE;
    548     int32_t pos = (int32_t)utext_nativeLength(fText);
    549     utext_setNativeIndex(fText, pos);
    550     return pos;
    551 }
    552 
    553 /**
    554  * Advances the iterator either forward or backward the specified number of steps.
    555  * Negative values move backward, and positive values move forward.  This is
    556  * equivalent to repeatedly calling next() or previous().
    557  * @param n The number of steps to move.  The sign indicates the direction
    558  * (negative is backwards, and positive is forwards).
    559  * @return The character offset of the boundary position n boundaries away from
    560  * the current one.
    561  */
    562 int32_t RuleBasedBreakIterator::next(int32_t n) {
    563     int32_t result = current();
    564     while (n > 0) {
    565         result = next();
    566         --n;
    567     }
    568     while (n < 0) {
    569         result = previous();
    570         ++n;
    571     }
    572     return result;
    573 }
    574 
    575 /**
    576  * Advances the iterator to the next boundary position.
    577  * @return The position of the first boundary after this one.
    578  */
    579 int32_t RuleBasedBreakIterator::next(void) {
    580     // if we have cached break positions and we're still in the range
    581     // covered by them, just move one step forward in the cache
    582     if (fCachedBreakPositions != NULL) {
    583         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    584             ++fPositionInCache;
    585             int32_t pos = fCachedBreakPositions[fPositionInCache];
    586             utext_setNativeIndex(fText, pos);
    587             return pos;
    588         }
    589         else {
    590             reset();
    591         }
    592     }
    593 
    594     int32_t startPos = current();
    595     fDictionaryCharCount = 0;
    596     int32_t result = handleNext(fData->fForwardTable);
    597     if (fDictionaryCharCount > 0) {
    598         result = checkDictionary(startPos, result, FALSE);
    599     }
    600     return result;
    601 }
    602 
    603 /**
    604  * Advances the iterator backwards, to the last boundary preceding this one.
    605  * @return The position of the last boundary position preceding this one.
    606  */
    607 int32_t RuleBasedBreakIterator::previous(void) {
    608     int32_t result;
    609     int32_t startPos;
    610 
    611     // if we have cached break positions and we're still in the range
    612     // covered by them, just move one step backward in the cache
    613     if (fCachedBreakPositions != NULL) {
    614         if (fPositionInCache > 0) {
    615             --fPositionInCache;
    616             // If we're at the beginning of the cache, need to reevaluate the
    617             // rule status
    618             if (fPositionInCache <= 0) {
    619                 fLastStatusIndexValid = FALSE;
    620             }
    621             int32_t pos = fCachedBreakPositions[fPositionInCache];
    622             utext_setNativeIndex(fText, pos);
    623             return pos;
    624         }
    625         else {
    626             reset();
    627         }
    628     }
    629 
    630     // if we're already sitting at the beginning of the text, return DONE
    631     if (fText == NULL || (startPos = current()) == 0) {
    632         fLastRuleStatusIndex  = 0;
    633         fLastStatusIndexValid = TRUE;
    634         return BreakIterator::DONE;
    635     }
    636 
    637     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    638         result = handlePrevious(fData->fReverseTable);
    639         if (fDictionaryCharCount > 0) {
    640             result = checkDictionary(result, startPos, TRUE);
    641         }
    642         return result;
    643     }
    644 
    645     // old rule syntax
    646     // set things up.  handlePrevious() will back us up to some valid
    647     // break position before the current position (we back our internal
    648     // iterator up one step to prevent handlePrevious() from returning
    649     // the current position), but not necessarily the last one before
    650     // where we started
    651 
    652     int32_t start = current();
    653 
    654     (void)UTEXT_PREVIOUS32(fText);
    655     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    656     if (lastResult == UBRK_DONE) {
    657         lastResult = 0;
    658         utext_setNativeIndex(fText, 0);
    659     }
    660     result = lastResult;
    661     int32_t lastTag       = 0;
    662     UBool   breakTagValid = FALSE;
    663 
    664     // iterate forward from the known break position until we pass our
    665     // starting point.  The last break position before the starting
    666     // point is our return value
    667 
    668     for (;;) {
    669         result         = next();
    670         if (result == BreakIterator::DONE || result >= start) {
    671             break;
    672         }
    673         lastResult     = result;
    674         lastTag        = fLastRuleStatusIndex;
    675         breakTagValid  = TRUE;
    676     }
    677 
    678     // fLastBreakTag wants to have the value for section of text preceding
    679     // the result position that we are to return (in lastResult.)  If
    680     // the backwards rules overshot and the above loop had to do two or more
    681     // next()s to move up to the desired return position, we will have a valid
    682     // tag value. But, if handlePrevious() took us to exactly the correct result position,
    683     // we wont have a tag value for that position, which is only set by handleNext().
    684 
    685     // Set the current iteration position to be the last break position
    686     // before where we started, and then return that value.
    687     utext_setNativeIndex(fText, lastResult);
    688     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    689     fLastStatusIndexValid = breakTagValid;
    690 
    691     // No need to check the dictionary; it will have been handled by
    692     // next()
    693 
    694     return lastResult;
    695 }
    696 
    697 /**
    698  * Sets the iterator to refer to the first boundary position following
    699  * the specified position.
    700  * @offset The position from which to begin searching for a break position.
    701  * @return The position of the first break after the current position.
    702  */
    703 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    704     // if the offset passed in is already past the end of the text,
    705     // just return DONE; if it's before the beginning, return the
    706     // text's starting offset
    707     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    708         last();
    709         return next();
    710     }
    711     else if (offset < 0) {
    712         return first();
    713     }
    714 
    715     // Move requested offset to a code point start. It might be on a trail surrogate,
    716     // or on a trail byte if the input is UTF-8.
    717     utext_setNativeIndex(fText, offset);
    718     offset = utext_getNativeIndex(fText);
    719 
    720     // if we have cached break positions and offset is in the range
    721     // covered by them, use them
    722     // TODO: could use binary search
    723     // TODO: what if offset is outside range, but break is not?
    724     if (fCachedBreakPositions != NULL) {
    725         if (offset >= fCachedBreakPositions[0]
    726                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    727             fPositionInCache = 0;
    728             // We are guaranteed not to leave the array due to range test above
    729             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    730                 ++fPositionInCache;
    731             }
    732             int32_t pos = fCachedBreakPositions[fPositionInCache];
    733             utext_setNativeIndex(fText, pos);
    734             return pos;
    735         }
    736         else {
    737             reset();
    738         }
    739     }
    740 
    741     // Set our internal iteration position (temporarily)
    742     // to the position passed in.  If this is the _beginning_ position,
    743     // then we can just use next() to get our return value
    744 
    745     int32_t result = 0;
    746 
    747     if (fData->fSafeRevTable != NULL) {
    748         // new rule syntax
    749         utext_setNativeIndex(fText, offset);
    750         // move forward one codepoint to prepare for moving back to a
    751         // safe point.
    752         // this handles offset being between a supplementary character
    753         // TODO: is this still needed, with move to code point boundary handled above?
    754         (void)UTEXT_NEXT32(fText);
    755         // handlePrevious will move most of the time to < 1 boundary away
    756         handlePrevious(fData->fSafeRevTable);
    757         int32_t result = next();
    758         while (result <= offset) {
    759             result = next();
    760         }
    761         return result;
    762     }
    763     if (fData->fSafeFwdTable != NULL) {
    764         // backup plan if forward safe table is not available
    765         utext_setNativeIndex(fText, offset);
    766         (void)UTEXT_PREVIOUS32(fText);
    767         // handle next will give result >= offset
    768         handleNext(fData->fSafeFwdTable);
    769         // previous will give result 0 or 1 boundary away from offset,
    770         // most of the time
    771         // we have to
    772         int32_t oldresult = previous();
    773         while (oldresult > offset) {
    774             int32_t result = previous();
    775             if (result <= offset) {
    776                 return oldresult;
    777             }
    778             oldresult = result;
    779         }
    780         int32_t result = next();
    781         if (result <= offset) {
    782             return next();
    783         }
    784         return result;
    785     }
    786     // otherwise, we have to sync up first.  Use handlePrevious() to back
    787     // up to a known break position before the specified position (if
    788     // we can determine that the specified position is a break position,
    789     // we don't back up at all).  This may or may not be the last break
    790     // position at or before our starting position.  Advance forward
    791     // from here until we've passed the starting position.  The position
    792     // we stop on will be the first break position after the specified one.
    793     // old rule syntax
    794 
    795     utext_setNativeIndex(fText, offset);
    796     if (offset==0 ||
    797         (offset==1  && utext_getNativeIndex(fText)==0)) {
    798         return next();
    799     }
    800     result = previous();
    801 
    802     while (result != BreakIterator::DONE && result <= offset) {
    803         result = next();
    804     }
    805 
    806     return result;
    807 }
    808 
    809 /**
    810  * Sets the iterator to refer to the last boundary position before the
    811  * specified position.
    812  * @offset The position to begin searching for a break from.
    813  * @return The position of the last boundary before the starting position.
    814  */
    815 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    816     // if the offset passed in is already past the end of the text,
    817     // just return DONE; if it's before the beginning, return the
    818     // text's starting offset
    819     if (fText == NULL || offset > utext_nativeLength(fText)) {
    820         return last();
    821     }
    822     else if (offset < 0) {
    823         return first();
    824     }
    825 
    826     // Move requested offset to a code point start. It might be on a trail surrogate,
    827     // or on a trail byte if the input is UTF-8.
    828     utext_setNativeIndex(fText, offset);
    829     offset = utext_getNativeIndex(fText);
    830 
    831     // if we have cached break positions and offset is in the range
    832     // covered by them, use them
    833     if (fCachedBreakPositions != NULL) {
    834         // TODO: binary search?
    835         // TODO: What if offset is outside range, but break is not?
    836         if (offset > fCachedBreakPositions[0]
    837                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    838             fPositionInCache = 0;
    839             while (fPositionInCache < fNumCachedBreakPositions
    840                    && offset > fCachedBreakPositions[fPositionInCache])
    841                 ++fPositionInCache;
    842             --fPositionInCache;
    843             // If we're at the beginning of the cache, need to reevaluate the
    844             // rule status
    845             if (fPositionInCache <= 0) {
    846                 fLastStatusIndexValid = FALSE;
    847             }
    848             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    849             return fCachedBreakPositions[fPositionInCache];
    850         }
    851         else {
    852             reset();
    853         }
    854     }
    855 
    856     // if we start by updating the current iteration position to the
    857     // position specified by the caller, we can just use previous()
    858     // to carry out this operation
    859 
    860     if (fData->fSafeFwdTable != NULL) {
    861         // new rule syntax
    862         utext_setNativeIndex(fText, offset);
    863         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    864         if (newOffset != offset) {
    865             // Will come here if specified offset was not a code point boundary AND
    866             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    867             //   indices to the containing code point.
    868             // For breakitereator::preceding only, these non-code-point indices need to be moved
    869             //   up to refer to the following codepoint.
    870             (void)UTEXT_NEXT32(fText);
    871             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    872         }
    873 
    874         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    875         //        rather than adjusting the position unconditionally?
    876         //        (Change would interact with safe rules.)
    877         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    878         //        affects only preceding(), seems cleaner, but is slightly different.
    879         (void)UTEXT_PREVIOUS32(fText);
    880         handleNext(fData->fSafeFwdTable);
    881         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    882         while (result >= offset) {
    883             result = previous();
    884         }
    885         return result;
    886     }
    887     if (fData->fSafeRevTable != NULL) {
    888         // backup plan if forward safe table is not available
    889         //  TODO:  check whether this path can be discarded
    890         //         It's probably OK to say that rules must supply both safe tables
    891         //            if they use safe tables at all.  We have certainly never described
    892         //            to anyone how to work with just one safe table.
    893         utext_setNativeIndex(fText, offset);
    894         (void)UTEXT_NEXT32(fText);
    895 
    896         // handle previous will give result <= offset
    897         handlePrevious(fData->fSafeRevTable);
    898 
    899         // next will give result 0 or 1 boundary away from offset,
    900         // most of the time
    901         // we have to
    902         int32_t oldresult = next();
    903         while (oldresult < offset) {
    904             int32_t result = next();
    905             if (result >= offset) {
    906                 return oldresult;
    907             }
    908             oldresult = result;
    909         }
    910         int32_t result = previous();
    911         if (result >= offset) {
    912             return previous();
    913         }
    914         return result;
    915     }
    916 
    917     // old rule syntax
    918     utext_setNativeIndex(fText, offset);
    919     return previous();
    920 }
    921 
    922 /**
    923  * Returns true if the specfied position is a boundary position.  As a side
    924  * effect, leaves the iterator pointing to the first boundary position at
    925  * or after "offset".
    926  * @param offset the offset to check.
    927  * @return True if "offset" is a boundary position.
    928  */
    929 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    930     // the beginning index of the iterator is always a boundary position by definition
    931     if (offset == 0) {
    932         first();       // For side effects on current position, tag values.
    933         return TRUE;
    934     }
    935 
    936     if (offset == (int32_t)utext_nativeLength(fText)) {
    937         last();       // For side effects on current position, tag values.
    938         return TRUE;
    939     }
    940 
    941     // out-of-range indexes are never boundary positions
    942     if (offset < 0) {
    943         first();       // For side effects on current position, tag values.
    944         return FALSE;
    945     }
    946 
    947     if (offset > utext_nativeLength(fText)) {
    948         last();        // For side effects on current position, tag values.
    949         return FALSE;
    950     }
    951 
    952     // otherwise, we can use following() on the position before the specified
    953     // one and return true if the position we get back is the one the user
    954     // specified
    955     utext_previous32From(fText, offset);
    956     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    957     UBool    result  = following(backOne) == offset;
    958     return result;
    959 }
    960 
    961 /**
    962  * Returns the current iteration position.
    963  * @return The current iteration position.
    964  */
    965 int32_t RuleBasedBreakIterator::current(void) const {
    966     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    967     return pos;
    968 }
    969 
    970 //=======================================================================
    971 // implementation
    972 //=======================================================================
    973 
    974 //
    975 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    976 //                 of user text.  A variable with this enum type keeps track of where we
    977 //                 are.  The state machine only fetches user input while in the RUN mode.
    978 //
    979 enum RBBIRunMode {
    980     RBBI_START,     // state machine processing is before first char of input
    981     RBBI_RUN,       // state machine processing is in the user text
    982     RBBI_END        // state machine processing is after end of user text.
    983 };
    984 
    985 
    986 //-----------------------------------------------------------------------------------
    987 //
    988 //  handleNext(stateTable)
    989 //     This method is the actual implementation of the rbbi next() method.
    990 //     This method initializes the state machine to state 1
    991 //     and advances through the text character by character until we reach the end
    992 //     of the text or the state machine transitions to state 0.  We update our return
    993 //     value every time the state machine passes through an accepting state.
    994 //
    995 //-----------------------------------------------------------------------------------
    996 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    997     int32_t             state;
    998     uint16_t            category        = 0;
    999     RBBIRunMode         mode;
   1000 
   1001     RBBIStateTableRow  *row;
   1002     UChar32             c;
   1003     int32_t             lookaheadStatus = 0;
   1004     int32_t             lookaheadTagIdx = 0;
   1005     int32_t             result          = 0;
   1006     int32_t             initialPosition = 0;
   1007     int32_t             lookaheadResult = 0;
   1008     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1009     const char         *tableData       = statetable->fTableData;
   1010     uint32_t            tableRowLen     = statetable->fRowLen;
   1011 
   1012     #ifdef RBBI_DEBUG
   1013         if (fTrace) {
   1014             RBBIDebugPuts("Handle Next   pos   char  state category");
   1015         }
   1016     #endif
   1017 
   1018     // No matter what, handleNext alway correctly sets the break tag value.
   1019     fLastStatusIndexValid = TRUE;
   1020     fLastRuleStatusIndex = 0;
   1021 
   1022     // if we're already at the end of the text, return DONE.
   1023     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1024     result          = initialPosition;
   1025     c               = UTEXT_NEXT32(fText);
   1026     if (fData == NULL || c==U_SENTINEL) {
   1027         return BreakIterator::DONE;
   1028     }
   1029 
   1030     //  Set the initial state for the state machine
   1031     state = START_STATE;
   1032     row = (RBBIStateTableRow *)
   1033             //(statetable->fTableData + (statetable->fRowLen * state));
   1034             (tableData + tableRowLen * state);
   1035 
   1036 
   1037     mode     = RBBI_RUN;
   1038     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1039         category = 2;
   1040         mode     = RBBI_START;
   1041     }
   1042 
   1043 
   1044     // loop until we reach the end of the text or transition to state 0
   1045     //
   1046     for (;;) {
   1047         if (c == U_SENTINEL) {
   1048             // Reached end of input string.
   1049             if (mode == RBBI_END) {
   1050                 // We have already run the loop one last time with the
   1051                 //   character set to the psueudo {eof} value.  Now it is time
   1052                 //   to unconditionally bail out.
   1053                 if (lookaheadResult > result) {
   1054                     // We ran off the end of the string with a pending look-ahead match.
   1055                     // Treat this as if the look-ahead condition had been met, and return
   1056                     //  the match at the / position from the look-ahead rule.
   1057                     result               = lookaheadResult;
   1058                     fLastRuleStatusIndex = lookaheadTagIdx;
   1059                     lookaheadStatus = 0;
   1060                 }
   1061                 break;
   1062             }
   1063             // Run the loop one last time with the fake end-of-input character category.
   1064             mode = RBBI_END;
   1065             category = 1;
   1066         }
   1067 
   1068         //
   1069         // Get the char category.  An incoming category of 1 or 2 means that
   1070         //      we are preset for doing the beginning or end of input, and
   1071         //      that we shouldn't get a category from an actual text input character.
   1072         //
   1073         if (mode == RBBI_RUN) {
   1074             // look up the current character's character category, which tells us
   1075             // which column in the state table to look at.
   1076             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1077             //        not the size of the character going in, which is a UChar32.
   1078             //
   1079             UTRIE_GET16(&fData->fTrie, c, category);
   1080 
   1081             // Check the dictionary bit in the character's category.
   1082             //    Counter is only used by dictionary based iterators (subclasses).
   1083             //    Chars that need to be handled by a dictionary have a flag bit set
   1084             //    in their category values.
   1085             //
   1086             if ((category & 0x4000) != 0)  {
   1087                 fDictionaryCharCount++;
   1088                 //  And off the dictionary flag bit.
   1089                 category &= ~0x4000;
   1090             }
   1091         }
   1092 
   1093        #ifdef RBBI_DEBUG
   1094             if (fTrace) {
   1095                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1096                 if (0x20<=c && c<0x7f) {
   1097                     RBBIDebugPrintf("\"%c\"  ", c);
   1098                 } else {
   1099                     RBBIDebugPrintf("%5x  ", c);
   1100                 }
   1101                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1102             }
   1103         #endif
   1104 
   1105         // State Transition - move machine to its next state
   1106         //
   1107 
   1108         // Note: fNextState is defined as uint16_t[2], but we are casting
   1109         // a generated RBBI table to RBBIStateTableRow and some tables
   1110         // actually have more than 2 categories.
   1111         U_ASSERT(category<fData->fHeader->fCatCount);
   1112         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1113         row = (RBBIStateTableRow *)
   1114             // (statetable->fTableData + (statetable->fRowLen * state));
   1115             (tableData + tableRowLen * state);
   1116 
   1117 
   1118         if (row->fAccepting == -1) {
   1119             // Match found, common case.
   1120             if (mode != RBBI_START) {
   1121                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1122             }
   1123             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1124         }
   1125 
   1126         if (row->fLookAhead != 0) {
   1127             if (lookaheadStatus != 0
   1128                 && row->fAccepting == lookaheadStatus) {
   1129                 // Lookahead match is completed.
   1130                 result               = lookaheadResult;
   1131                 fLastRuleStatusIndex = lookaheadTagIdx;
   1132                 lookaheadStatus      = 0;
   1133                 // TODO:  make a standalone hard break in a rule work.
   1134                 if (lookAheadHardBreak) {
   1135                     UTEXT_SETNATIVEINDEX(fText, result);
   1136                     return result;
   1137                 }
   1138                 // Look-ahead completed, but other rules may match further.  Continue on
   1139                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1140                 goto continueOn;
   1141             }
   1142 
   1143             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1144             lookaheadResult = r;
   1145             lookaheadStatus = row->fLookAhead;
   1146             lookaheadTagIdx = row->fTagIdx;
   1147             goto continueOn;
   1148         }
   1149 
   1150 
   1151         if (row->fAccepting != 0) {
   1152             // Because this is an accepting state, any in-progress look-ahead match
   1153             //   is no longer relavant.  Clear out the pending lookahead status.
   1154             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1155         }
   1156 
   1157 continueOn:
   1158         if (state == STOP_STATE) {
   1159             // This is the normal exit from the lookup state machine.
   1160             // We have advanced through the string until it is certain that no
   1161             //   longer match is possible, no matter what characters follow.
   1162             break;
   1163         }
   1164 
   1165         // Advance to the next character.
   1166         // If this is a beginning-of-input loop iteration, don't advance
   1167         //    the input position.  The next iteration will be processing the
   1168         //    first real input character.
   1169         if (mode == RBBI_RUN) {
   1170             c = UTEXT_NEXT32(fText);
   1171         } else {
   1172             if (mode == RBBI_START) {
   1173                 mode = RBBI_RUN;
   1174             }
   1175         }
   1176 
   1177 
   1178     }
   1179 
   1180     // The state machine is done.  Check whether it found a match...
   1181 
   1182     // If the iterator failed to advance in the match engine, force it ahead by one.
   1183     //   (This really indicates a defect in the break rules.  They should always match
   1184     //    at least one character.)
   1185     if (result == initialPosition) {
   1186         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1187         UTEXT_NEXT32(fText);
   1188         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1189     }
   1190 
   1191     // Leave the iterator at our result position.
   1192     UTEXT_SETNATIVEINDEX(fText, result);
   1193     #ifdef RBBI_DEBUG
   1194         if (fTrace) {
   1195             RBBIDebugPrintf("result = %d\n\n", result);
   1196         }
   1197     #endif
   1198     return result;
   1199 }
   1200 
   1201 
   1202 
   1203 //-----------------------------------------------------------------------------------
   1204 //
   1205 //  handlePrevious()
   1206 //
   1207 //      Iterate backwards, according to the logic of the reverse rules.
   1208 //      This version handles the exact style backwards rules.
   1209 //
   1210 //      The logic of this function is very similar to handleNext(), above.
   1211 //
   1212 //-----------------------------------------------------------------------------------
   1213 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1214     int32_t             state;
   1215     uint16_t            category        = 0;
   1216     RBBIRunMode         mode;
   1217     RBBIStateTableRow  *row;
   1218     UChar32             c;
   1219     int32_t             lookaheadStatus = 0;
   1220     int32_t             result          = 0;
   1221     int32_t             initialPosition = 0;
   1222     int32_t             lookaheadResult = 0;
   1223     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1224 
   1225     #ifdef RBBI_DEBUG
   1226         if (fTrace) {
   1227             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1228         }
   1229     #endif
   1230 
   1231     // handlePrevious() never gets the rule status.
   1232     // Flag the status as invalid; if the user ever asks for status, we will need
   1233     // to back up, then re-find the break position using handleNext(), which does
   1234     // get the status value.
   1235     fLastStatusIndexValid = FALSE;
   1236     fLastRuleStatusIndex = 0;
   1237 
   1238     // if we're already at the start of the text, return DONE.
   1239     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1240         return BreakIterator::DONE;
   1241     }
   1242 
   1243     //  Set up the starting char.
   1244     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1245     result          = initialPosition;
   1246     c               = UTEXT_PREVIOUS32(fText);
   1247 
   1248     //  Set the initial state for the state machine
   1249     state = START_STATE;
   1250     row = (RBBIStateTableRow *)
   1251             (statetable->fTableData + (statetable->fRowLen * state));
   1252     category = 3;
   1253     mode     = RBBI_RUN;
   1254     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1255         category = 2;
   1256         mode     = RBBI_START;
   1257     }
   1258 
   1259 
   1260     // loop until we reach the start of the text or transition to state 0
   1261     //
   1262     for (;;) {
   1263         if (c == U_SENTINEL) {
   1264             // Reached end of input string.
   1265             if (mode == RBBI_END) {
   1266                 // We have already run the loop one last time with the
   1267                 //   character set to the psueudo {eof} value.  Now it is time
   1268                 //   to unconditionally bail out.
   1269                 if (lookaheadResult < result) {
   1270                     // We ran off the end of the string with a pending look-ahead match.
   1271                     // Treat this as if the look-ahead condition had been met, and return
   1272                     //  the match at the / position from the look-ahead rule.
   1273                     result               = lookaheadResult;
   1274                     lookaheadStatus = 0;
   1275                 } else if (result == initialPosition) {
   1276                     // Ran off start, no match found.
   1277                     // move one index one (towards the start, since we are doing a previous())
   1278                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1279                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1280                 }
   1281                 break;
   1282             }
   1283             // Run the loop one last time with the fake end-of-input character category.
   1284             mode = RBBI_END;
   1285             category = 1;
   1286         }
   1287 
   1288         //
   1289         // Get the char category.  An incoming category of 1 or 2 means that
   1290         //      we are preset for doing the beginning or end of input, and
   1291         //      that we shouldn't get a category from an actual text input character.
   1292         //
   1293         if (mode == RBBI_RUN) {
   1294             // look up the current character's character category, which tells us
   1295             // which column in the state table to look at.
   1296             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1297             //        not the size of the character going in, which is a UChar32.
   1298             //
   1299             UTRIE_GET16(&fData->fTrie, c, category);
   1300 
   1301             // Check the dictionary bit in the character's category.
   1302             //    Counter is only used by dictionary based iterators (subclasses).
   1303             //    Chars that need to be handled by a dictionary have a flag bit set
   1304             //    in their category values.
   1305             //
   1306             if ((category & 0x4000) != 0)  {
   1307                 fDictionaryCharCount++;
   1308                 //  And off the dictionary flag bit.
   1309                 category &= ~0x4000;
   1310             }
   1311         }
   1312 
   1313         #ifdef RBBI_DEBUG
   1314             if (fTrace) {
   1315                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1316                 if (0x20<=c && c<0x7f) {
   1317                     RBBIDebugPrintf("\"%c\"  ", c);
   1318                 } else {
   1319                     RBBIDebugPrintf("%5x  ", c);
   1320                 }
   1321                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1322             }
   1323         #endif
   1324 
   1325         // State Transition - move machine to its next state
   1326         //
   1327 
   1328         // Note: fNextState is defined as uint16_t[2], but we are casting
   1329         // a generated RBBI table to RBBIStateTableRow and some tables
   1330         // actually have more than 2 categories.
   1331         U_ASSERT(category<fData->fHeader->fCatCount);
   1332         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1333         row = (RBBIStateTableRow *)
   1334             (statetable->fTableData + (statetable->fRowLen * state));
   1335 
   1336         if (row->fAccepting == -1) {
   1337             // Match found, common case.
   1338             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1339         }
   1340 
   1341         if (row->fLookAhead != 0) {
   1342             if (lookaheadStatus != 0
   1343                 && row->fAccepting == lookaheadStatus) {
   1344                 // Lookahead match is completed.
   1345                 result               = lookaheadResult;
   1346                 lookaheadStatus      = 0;
   1347                 // TODO:  make a standalone hard break in a rule work.
   1348                 if (lookAheadHardBreak) {
   1349                     UTEXT_SETNATIVEINDEX(fText, result);
   1350                     return result;
   1351                 }
   1352                 // Look-ahead completed, but other rules may match further.  Continue on
   1353                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1354                 goto continueOn;
   1355             }
   1356 
   1357             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1358             lookaheadResult = r;
   1359             lookaheadStatus = row->fLookAhead;
   1360             goto continueOn;
   1361         }
   1362 
   1363 
   1364         if (row->fAccepting != 0) {
   1365             // Because this is an accepting state, any in-progress look-ahead match
   1366             //   is no longer relavant.  Clear out the pending lookahead status.
   1367             lookaheadStatus = 0;
   1368         }
   1369 
   1370 continueOn:
   1371         if (state == STOP_STATE) {
   1372             // This is the normal exit from the lookup state machine.
   1373             // We have advanced through the string until it is certain that no
   1374             //   longer match is possible, no matter what characters follow.
   1375             break;
   1376         }
   1377 
   1378         // Move (backwards) to the next character to process.
   1379         // If this is a beginning-of-input loop iteration, don't advance
   1380         //    the input position.  The next iteration will be processing the
   1381         //    first real input character.
   1382         if (mode == RBBI_RUN) {
   1383             c = UTEXT_PREVIOUS32(fText);
   1384         } else {
   1385             if (mode == RBBI_START) {
   1386                 mode = RBBI_RUN;
   1387             }
   1388         }
   1389     }
   1390 
   1391     // The state machine is done.  Check whether it found a match...
   1392 
   1393     // If the iterator failed to advance in the match engine, force it ahead by one.
   1394     //   (This really indicates a defect in the break rules.  They should always match
   1395     //    at least one character.)
   1396     if (result == initialPosition) {
   1397         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1398         UTEXT_PREVIOUS32(fText);
   1399         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1400     }
   1401 
   1402     // Leave the iterator at our result position.
   1403     UTEXT_SETNATIVEINDEX(fText, result);
   1404     #ifdef RBBI_DEBUG
   1405         if (fTrace) {
   1406             RBBIDebugPrintf("result = %d\n\n", result);
   1407         }
   1408     #endif
   1409     return result;
   1410 }
   1411 
   1412 
   1413 void
   1414 RuleBasedBreakIterator::reset()
   1415 {
   1416     if (fCachedBreakPositions) {
   1417         uprv_free(fCachedBreakPositions);
   1418     }
   1419     fCachedBreakPositions = NULL;
   1420     fNumCachedBreakPositions = 0;
   1421     fDictionaryCharCount = 0;
   1422     fPositionInCache = 0;
   1423 }
   1424 
   1425 
   1426 
   1427 //-------------------------------------------------------------------------------
   1428 //
   1429 //   getRuleStatus()   Return the break rule tag associated with the current
   1430 //                     iterator position.  If the iterator arrived at its current
   1431 //                     position by iterating forwards, the value will have been
   1432 //                     cached by the handleNext() function.
   1433 //
   1434 //                     If no cached status value is available, the status is
   1435 //                     found by doing a previous() followed by a next(), which
   1436 //                     leaves the iterator where it started, and computes the
   1437 //                     status while doing the next().
   1438 //
   1439 //-------------------------------------------------------------------------------
   1440 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1441     if (fLastStatusIndexValid == FALSE) {
   1442         //  No cached status is available.
   1443         if (fText == NULL || current() == 0) {
   1444             //  At start of text, or there is no text.  Status is always zero.
   1445             fLastRuleStatusIndex = 0;
   1446             fLastStatusIndexValid = TRUE;
   1447         } else {
   1448             //  Not at start of text.  Find status the tedious way.
   1449             int32_t pa = current();
   1450             previous();
   1451             if (fNumCachedBreakPositions > 0) {
   1452                 reset();                // Blow off the dictionary cache
   1453             }
   1454             int32_t pb = next();
   1455             if (pa != pb) {
   1456                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1457                 //       unused local variables on gcc.  Logically, it isn't needed.
   1458                 U_ASSERT(pa == pb);
   1459             }
   1460         }
   1461     }
   1462     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1463 }
   1464 
   1465 
   1466 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1467     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1468     nonConstThis->makeRuleStatusValid();
   1469 
   1470     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1471     //                                                 (the number of status values.)
   1472     //   This function returns the last (largest) of the array of status values.
   1473     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1474     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1475 
   1476     return tagVal;
   1477 }
   1478 
   1479 
   1480 
   1481 
   1482 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1483              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1484 {
   1485     if (U_FAILURE(status)) {
   1486         return 0;
   1487     }
   1488 
   1489     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1490     nonConstThis->makeRuleStatusValid();
   1491     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1492     int32_t  numValsToCopy = numVals;
   1493     if (numVals > capacity) {
   1494         status = U_BUFFER_OVERFLOW_ERROR;
   1495         numValsToCopy = capacity;
   1496     }
   1497     int i;
   1498     for (i=0; i<numValsToCopy; i++) {
   1499         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1500     }
   1501     return numVals;
   1502 }
   1503 
   1504 
   1505 
   1506 //-------------------------------------------------------------------------------
   1507 //
   1508 //   getBinaryRules        Access to the compiled form of the rules,
   1509 //                         for use by build system tools that save the data
   1510 //                         for standard iterator types.
   1511 //
   1512 //-------------------------------------------------------------------------------
   1513 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1514     const uint8_t  *retPtr = NULL;
   1515     length = 0;
   1516 
   1517     if (fData != NULL) {
   1518         retPtr = (const uint8_t *)fData->fHeader;
   1519         length = fData->fHeader->fLength;
   1520     }
   1521     return retPtr;
   1522 }
   1523 
   1524 
   1525 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
   1526                                    int32_t &bufferSize,
   1527                                    UErrorCode &status)
   1528 {
   1529     if (U_FAILURE(status)){
   1530         return NULL;
   1531     }
   1532 
   1533     if (bufferSize == 0) {
   1534         bufferSize = 1;  // preflighting for deprecated functionality
   1535         return NULL;
   1536     }
   1537 
   1538     BreakIterator *clonedBI = clone();
   1539     if (clonedBI == NULL) {
   1540         status = U_MEMORY_ALLOCATION_ERROR;
   1541     } else {
   1542         status = U_SAFECLONE_ALLOCATED_WARNING;
   1543     }
   1544     return (RuleBasedBreakIterator *)clonedBI;
   1545 }
   1546 
   1547 
   1548 //-------------------------------------------------------------------------------
   1549 //
   1550 //  isDictionaryChar      Return true if the category lookup for this char
   1551 //                        indicates that it is in the set of dictionary lookup
   1552 //                        chars.
   1553 //
   1554 //                        This function is intended for use by dictionary based
   1555 //                        break iterators.
   1556 //
   1557 //-------------------------------------------------------------------------------
   1558 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1559     if (fData == NULL) {
   1560         return FALSE;
   1561     }
   1562     uint16_t category;
   1563     UTRIE_GET16(&fData->fTrie, c, category);
   1564     return (category & 0x4000) != 0;
   1565 }*/
   1566 
   1567 
   1568 //-------------------------------------------------------------------------------
   1569 //
   1570 //  checkDictionary       This function handles all processing of characters in
   1571 //                        the "dictionary" set. It will determine the appropriate
   1572 //                        course of action, and possibly set up a cache in the
   1573 //                        process.
   1574 //
   1575 //-------------------------------------------------------------------------------
   1576 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1577                             int32_t endPos,
   1578                             UBool reverse) {
   1579     // Reset the old break cache first.
   1580     reset();
   1581 
   1582     // note: code segment below assumes that dictionary chars are in the
   1583     // startPos-endPos range
   1584     // value returned should be next character in sequence
   1585     if ((endPos - startPos) <= 1) {
   1586         return (reverse ? startPos : endPos);
   1587     }
   1588 
   1589     // Starting from the starting point, scan towards the proposed result,
   1590     // looking for the first dictionary character (which may be the one
   1591     // we're on, if we're starting in the middle of a range).
   1592     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1593     if (reverse) {
   1594         UTEXT_PREVIOUS32(fText);
   1595     }
   1596 
   1597     int32_t rangeStart = startPos;
   1598     int32_t rangeEnd = endPos;
   1599 
   1600     uint16_t    category;
   1601     int32_t     current;
   1602     UErrorCode  status = U_ZERO_ERROR;
   1603     UStack      breaks(status);
   1604     int32_t     foundBreakCount = 0;
   1605     UChar32     c = utext_current32(fText);
   1606 
   1607     UTRIE_GET16(&fData->fTrie, c, category);
   1608 
   1609     // Is the character we're starting on a dictionary character? If so, we
   1610     // need to back up to include the entire run; otherwise the results of
   1611     // the break algorithm will differ depending on where we start. Since
   1612     // the result is cached and there is typically a non-dictionary break
   1613     // within a small number of words, there should be little performance impact.
   1614     if (category & 0x4000) {
   1615         if (reverse) {
   1616             do {
   1617                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1618                 c = utext_current32(fText);
   1619                 UTRIE_GET16(&fData->fTrie, c, category);
   1620             } while (c != U_SENTINEL && (category & 0x4000));
   1621             // Back up to the last dictionary character
   1622             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1623             if (c == U_SENTINEL) {
   1624                 // c = fText->last32();
   1625                 //   TODO:  why was this if needed?
   1626                 c = UTEXT_PREVIOUS32(fText);
   1627             }
   1628             else {
   1629                 c = UTEXT_PREVIOUS32(fText);
   1630             }
   1631         }
   1632         else {
   1633             do {
   1634                 c = UTEXT_PREVIOUS32(fText);
   1635                 UTRIE_GET16(&fData->fTrie, c, category);
   1636             }
   1637             while (c != U_SENTINEL && (category & 0x4000));
   1638             // Back up to the last dictionary character
   1639             if (c == U_SENTINEL) {
   1640                 // c = fText->first32();
   1641                 c = utext_current32(fText);
   1642             }
   1643             else {
   1644                 utext_next32(fText);
   1645                 c = utext_current32(fText);
   1646             }
   1647             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1648         }
   1649         UTRIE_GET16(&fData->fTrie, c, category);
   1650     }
   1651 
   1652     // Loop through the text, looking for ranges of dictionary characters.
   1653     // For each span, find the appropriate break engine, and ask it to find
   1654     // any breaks within the span.
   1655     // Note: we always do this in the forward direction, so that the break
   1656     // cache is built in the right order.
   1657     if (reverse) {
   1658         utext_setNativeIndex(fText, rangeStart);
   1659         c = utext_current32(fText);
   1660         UTRIE_GET16(&fData->fTrie, c, category);
   1661     }
   1662     while(U_SUCCESS(status)) {
   1663         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1664             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1665             c = utext_current32(fText);
   1666             UTRIE_GET16(&fData->fTrie, c, category);
   1667         }
   1668         if (current >= rangeEnd) {
   1669             break;
   1670         }
   1671 
   1672         // We now have a dictionary character. Get the appropriate language object
   1673         // to deal with it.
   1674         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1675 
   1676         // Ask the language object if there are any breaks. It will leave the text
   1677         // pointer on the other side of its range, ready to search for the next one.
   1678         if (lbe != NULL) {
   1679             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1680         }
   1681 
   1682         // Reload the loop variables for the next go-round
   1683         c = utext_current32(fText);
   1684         UTRIE_GET16(&fData->fTrie, c, category);
   1685     }
   1686 
   1687     // If we found breaks, build a new break cache. The first and last entries must
   1688     // be the original starting and ending position.
   1689     if (foundBreakCount > 0) {
   1690         U_ASSERT(foundBreakCount == breaks.size());
   1691         int32_t totalBreaks = foundBreakCount;
   1692         if (startPos < breaks.elementAti(0)) {
   1693             totalBreaks += 1;
   1694         }
   1695         if (endPos > breaks.peeki()) {
   1696             totalBreaks += 1;
   1697         }
   1698         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1699         if (fCachedBreakPositions != NULL) {
   1700             int32_t out = 0;
   1701             fNumCachedBreakPositions = totalBreaks;
   1702             if (startPos < breaks.elementAti(0)) {
   1703                 fCachedBreakPositions[out++] = startPos;
   1704             }
   1705             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1706                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1707             }
   1708             if (endPos > fCachedBreakPositions[out-1]) {
   1709                 fCachedBreakPositions[out] = endPos;
   1710             }
   1711             // If there are breaks, then by definition, we are replacing the original
   1712             // proposed break by one of the breaks we found. Use following() and
   1713             // preceding() to do the work. They should never recurse in this case.
   1714             if (reverse) {
   1715                 return preceding(endPos);
   1716             }
   1717             else {
   1718                 return following(startPos);
   1719             }
   1720         }
   1721         // If the allocation failed, just fall through to the "no breaks found" case.
   1722     }
   1723 
   1724     // If we get here, there were no language-based breaks. Set the text pointer
   1725     // to the original proposed break.
   1726     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1727     return (reverse ? startPos : endPos);
   1728 }
   1729 
   1730 U_NAMESPACE_END
   1731 
   1732 
   1733 static icu::UStack *gLanguageBreakFactories = NULL;
   1734 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
   1735 
   1736 /**
   1737  * Release all static memory held by breakiterator.
   1738  */
   1739 U_CDECL_BEGIN
   1740 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1741     if (gLanguageBreakFactories) {
   1742         delete gLanguageBreakFactories;
   1743         gLanguageBreakFactories = NULL;
   1744     }
   1745     gLanguageBreakFactoriesInitOnce.reset();
   1746     return TRUE;
   1747 }
   1748 U_CDECL_END
   1749 
   1750 U_CDECL_BEGIN
   1751 static void U_CALLCONV _deleteFactory(void *obj) {
   1752     delete (icu::LanguageBreakFactory *) obj;
   1753 }
   1754 U_CDECL_END
   1755 U_NAMESPACE_BEGIN
   1756 
   1757 static void U_CALLCONV initLanguageFactories() {
   1758     UErrorCode status = U_ZERO_ERROR;
   1759     U_ASSERT(gLanguageBreakFactories == NULL);
   1760     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
   1761     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
   1762         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1763         gLanguageBreakFactories->push(builtIn, status);
   1764 #ifdef U_LOCAL_SERVICE_HOOK
   1765         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1766         if (extra != NULL) {
   1767             gLanguageBreakFactories->push(extra, status);
   1768         }
   1769 #endif
   1770     }
   1771     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1772 }
   1773 
   1774 
   1775 static const LanguageBreakEngine*
   1776 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1777 {
   1778     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
   1779     if (gLanguageBreakFactories == NULL) {
   1780         return NULL;
   1781     }
   1782 
   1783     int32_t i = gLanguageBreakFactories->size();
   1784     const LanguageBreakEngine *lbe = NULL;
   1785     while (--i >= 0) {
   1786         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1787         lbe = factory->getEngineFor(c, breakType);
   1788         if (lbe != NULL) {
   1789             break;
   1790         }
   1791     }
   1792     return lbe;
   1793 }
   1794 
   1795 
   1796 //-------------------------------------------------------------------------------
   1797 //
   1798 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1799 //                          the character c.
   1800 //
   1801 //-------------------------------------------------------------------------------
   1802 const LanguageBreakEngine *
   1803 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1804     const LanguageBreakEngine *lbe = NULL;
   1805     UErrorCode status = U_ZERO_ERROR;
   1806 
   1807     if (fLanguageBreakEngines == NULL) {
   1808         fLanguageBreakEngines = new UStack(status);
   1809         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1810             delete fLanguageBreakEngines;
   1811             fLanguageBreakEngines = 0;
   1812             return NULL;
   1813         }
   1814     }
   1815 
   1816     int32_t i = fLanguageBreakEngines->size();
   1817     while (--i >= 0) {
   1818         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1819         if (lbe->handles(c, fBreakType)) {
   1820             return lbe;
   1821         }
   1822     }
   1823 
   1824     // No existing dictionary took the character. See if a factory wants to
   1825     // give us a new LanguageBreakEngine for this character.
   1826     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1827 
   1828     // If we got one, use it and push it on our stack.
   1829     if (lbe != NULL) {
   1830         fLanguageBreakEngines->push((void *)lbe, status);
   1831         // Even if we can't remember it, we can keep looking it up, so
   1832         // return it even if the push fails.
   1833         return lbe;
   1834     }
   1835 
   1836     // No engine is forthcoming for this character. Add it to the
   1837     // reject set. Create the reject break engine if needed.
   1838     if (fUnhandledBreakEngine == NULL) {
   1839         fUnhandledBreakEngine = new UnhandledEngine(status);
   1840         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1841             status = U_MEMORY_ALLOCATION_ERROR;
   1842         }
   1843         // Put it last so that scripts for which we have an engine get tried
   1844         // first.
   1845         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1846         // If we can't insert it, or creation failed, get rid of it
   1847         if (U_FAILURE(status)) {
   1848             delete fUnhandledBreakEngine;
   1849             fUnhandledBreakEngine = 0;
   1850             return NULL;
   1851         }
   1852     }
   1853 
   1854     // Tell the reject engine about the character; at its discretion, it may
   1855     // add more than just the one character.
   1856     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1857 
   1858     return fUnhandledBreakEngine;
   1859 }
   1860 
   1861 
   1862 
   1863 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1864     return fBreakType;
   1865 }*/
   1866 
   1867 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1868     fBreakType = type;
   1869     reset();
   1870 }
   1871 
   1872 U_NAMESPACE_END
   1873 
   1874 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1875