Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2014 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "utypeinfo.h"  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 
     90 //
     91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
     92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
     93 //
     94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
     95                        uint32_t       ruleLength,
     96                        UErrorCode     &status) {
     97     init();
     98     if (U_FAILURE(status)) {
     99         return;
    100     }
    101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
    102         status = U_ILLEGAL_ARGUMENT_ERROR;
    103         return;
    104     }
    105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
    106     if (data->fLength > ruleLength) {
    107         status = U_ILLEGAL_ARGUMENT_ERROR;
    108         return;
    109     }
    110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
    111     if (U_FAILURE(status)) {return;}
    112     if(fData == 0) {
    113         status = U_MEMORY_ALLOCATION_ERROR;
    114         return;
    115     }
    116 }
    117 
    118 
    119 //-------------------------------------------------------------------------------
    120 //
    121 //   Constructor   from a UDataMemory handle to precompiled break rules
    122 //                 stored in an ICU data file.
    123 //
    124 //-------------------------------------------------------------------------------
    125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
    126 {
    127     init();
    128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
    129     if (U_FAILURE(status)) {return;}
    130     if(fData == 0) {
    131         status = U_MEMORY_ALLOCATION_ERROR;
    132         return;
    133     }
    134 }
    135 
    136 
    137 
    138 //-------------------------------------------------------------------------------
    139 //
    140 //   Constructor       from a set of rules supplied as a string.
    141 //
    142 //-------------------------------------------------------------------------------
    143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    144                                                 UParseError          &parseError,
    145                                                 UErrorCode           &status)
    146 {
    147     init();
    148     if (U_FAILURE(status)) {return;}
    149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    152     //        creates and returns a complete RBBI.  From here, in a constructor, we
    153     //        can't just return the object created by the builder factory, hence
    154     //        the assignment of the factory created object to "this".
    155     if (U_SUCCESS(status)) {
    156         *this = *bi;
    157         delete bi;
    158     }
    159 }
    160 
    161 
    162 //-------------------------------------------------------------------------------
    163 //
    164 // Default Constructor.      Create an empty shell that can be set up later.
    165 //                           Used when creating a RuleBasedBreakIterator from a set
    166 //                           of rules.
    167 //-------------------------------------------------------------------------------
    168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    169     init();
    170 }
    171 
    172 
    173 //-------------------------------------------------------------------------------
    174 //
    175 //   Copy constructor.  Will produce a break iterator with the same behavior,
    176 //                      and which iterates over the same text, as the one passed in.
    177 //
    178 //-------------------------------------------------------------------------------
    179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    180 : BreakIterator(other)
    181 {
    182     this->init();
    183     *this = other;
    184 }
    185 
    186 
    187 /**
    188  * Destructor
    189  */
    190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    192         // fCharIter was adopted from the outside.
    193         delete fCharIter;
    194     }
    195     fCharIter = NULL;
    196     delete fSCharIter;
    197     fCharIter = NULL;
    198     delete fDCharIter;
    199     fDCharIter = NULL;
    200 
    201     utext_close(fText);
    202 
    203     if (fData != NULL) {
    204         fData->removeReference();
    205         fData = NULL;
    206     }
    207     if (fCachedBreakPositions) {
    208         uprv_free(fCachedBreakPositions);
    209         fCachedBreakPositions = NULL;
    210     }
    211     if (fLanguageBreakEngines) {
    212         delete fLanguageBreakEngines;
    213         fLanguageBreakEngines = NULL;
    214     }
    215     if (fUnhandledBreakEngine) {
    216         delete fUnhandledBreakEngine;
    217         fUnhandledBreakEngine = NULL;
    218     }
    219 }
    220 
    221 /**
    222  * Assignment operator.  Sets this iterator to have the same behavior,
    223  * and iterate over the same text, as the one passed in.
    224  */
    225 RuleBasedBreakIterator&
    226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    227     if (this == &that) {
    228         return *this;
    229     }
    230     reset();    // Delete break cache information
    231     fBreakType = that.fBreakType;
    232     if (fLanguageBreakEngines != NULL) {
    233         delete fLanguageBreakEngines;
    234         fLanguageBreakEngines = NULL;   // Just rebuild for now
    235     }
    236     // TODO: clone fLanguageBreakEngines from "that"
    237     UErrorCode status = U_ZERO_ERROR;
    238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    239 
    240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    241         delete fCharIter;
    242     }
    243     fCharIter = NULL;
    244 
    245     if (that.fCharIter != NULL ) {
    246         // This is a little bit tricky - it will intially appear that
    247         //  this->fCharIter is adopted, even if that->fCharIter was
    248         //  not adopted.  That's ok.
    249         fCharIter = that.fCharIter->clone();
    250     }
    251 
    252     if (fData != NULL) {
    253         fData->removeReference();
    254         fData = NULL;
    255     }
    256     if (that.fData != NULL) {
    257         fData = that.fData->addReference();
    258     }
    259 
    260     return *this;
    261 }
    262 
    263 
    264 
    265 //-----------------------------------------------------------------------------
    266 //
    267 //    init()      Shared initialization routine.   Used by all the constructors.
    268 //                Initializes all fields, leaving the object in a consistent state.
    269 //
    270 //-----------------------------------------------------------------------------
    271 void RuleBasedBreakIterator::init() {
    272     UErrorCode  status    = U_ZERO_ERROR;
    273     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    274     fCharIter             = NULL;
    275     fSCharIter            = NULL;
    276     fDCharIter            = NULL;
    277     fData                 = NULL;
    278     fLastRuleStatusIndex  = 0;
    279     fLastStatusIndexValid = TRUE;
    280     fDictionaryCharCount  = 0;
    281     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    282                                         //   dictionary behavior for Break Iterators that are
    283                                         //   built from rules.  Even better would be the ability to
    284                                         //   declare the type in the rules.
    285 
    286     fCachedBreakPositions    = NULL;
    287     fLanguageBreakEngines    = NULL;
    288     fUnhandledBreakEngine    = NULL;
    289     fNumCachedBreakPositions = 0;
    290     fPositionInCache         = 0;
    291 
    292 #ifdef RBBI_DEBUG
    293     static UBool debugInitDone = FALSE;
    294     if (debugInitDone == FALSE) {
    295         char *debugEnv = getenv("U_RBBIDEBUG");
    296         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    297             fTrace = TRUE;
    298         }
    299         debugInitDone = TRUE;
    300     }
    301 #endif
    302 }
    303 
    304 
    305 
    306 //-----------------------------------------------------------------------------
    307 //
    308 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    309 //            behavior, and iterating over the same text, as this one.
    310 //            Virtual function: does the right thing with subclasses.
    311 //
    312 //-----------------------------------------------------------------------------
    313 BreakIterator*
    314 RuleBasedBreakIterator::clone(void) const {
    315     return new RuleBasedBreakIterator(*this);
    316 }
    317 
    318 /**
    319  * Equality operator.  Returns TRUE if both BreakIterators are of the
    320  * same class, have the same behavior, and iterate over the same text.
    321  */
    322 UBool
    323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    324     if (typeid(*this) != typeid(that)) {
    325         return FALSE;
    326     }
    327 
    328     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    329 
    330     if (!utext_equals(fText, that2.fText)) {
    331         // The two break iterators are operating on different text,
    332         //   or have a different interation position.
    333         return FALSE;
    334     };
    335 
    336     // TODO:  need a check for when in a dictionary region at different offsets.
    337 
    338     if (that2.fData == fData ||
    339         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    340             // The two break iterators are using the same rules.
    341             return TRUE;
    342         }
    343     return FALSE;
    344 }
    345 
    346 /**
    347  * Compute a hash code for this BreakIterator
    348  * @return A hash code
    349  */
    350 int32_t
    351 RuleBasedBreakIterator::hashCode(void) const {
    352     int32_t   hash = 0;
    353     if (fData != NULL) {
    354         hash = fData->hashCode();
    355     }
    356     return hash;
    357 }
    358 
    359 
    360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    361     if (U_FAILURE(status)) {
    362         return;
    363     }
    364     reset();
    365     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    366 
    367     // Set up a dummy CharacterIterator to be returned if anyone
    368     //   calls getText().  With input from UText, there is no reasonable
    369     //   way to return a characterIterator over the actual input text.
    370     //   Return one over an empty string instead - this is the closest
    371     //   we can come to signaling a failure.
    372     //   (GetText() is obsolete, this failure is sort of OK)
    373     if (fDCharIter == NULL) {
    374         static const UChar c = 0;
    375         fDCharIter = new UCharCharacterIterator(&c, 0);
    376         if (fDCharIter == NULL) {
    377             status = U_MEMORY_ALLOCATION_ERROR;
    378             return;
    379         }
    380     }
    381 
    382     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    383         // existing fCharIter was adopted from the outside.  Delete it now.
    384         delete fCharIter;
    385     }
    386     fCharIter = fDCharIter;
    387 
    388     this->first();
    389 }
    390 
    391 
    392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    393     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    394     return result;
    395 }
    396 
    397 
    398 
    399 /**
    400  * Returns the description used to create this iterator
    401  */
    402 const UnicodeString&
    403 RuleBasedBreakIterator::getRules() const {
    404     if (fData != NULL) {
    405         return fData->getRuleSourceString();
    406     } else {
    407         static const UnicodeString *s;
    408         if (s == NULL) {
    409             // TODO:  something more elegant here.
    410             //        perhaps API should return the string by value.
    411             //        Note:  thread unsafe init & leak are semi-ok, better than
    412             //               what was before.  Sould be cleaned up, though.
    413             s = new UnicodeString;
    414         }
    415         return *s;
    416     }
    417 }
    418 
    419 //=======================================================================
    420 // BreakIterator overrides
    421 //=======================================================================
    422 
    423 /**
    424  * Return a CharacterIterator over the text being analyzed.
    425  */
    426 CharacterIterator&
    427 RuleBasedBreakIterator::getText() const {
    428     return *fCharIter;
    429 }
    430 
    431 /**
    432  * Set the iterator to analyze a new piece of text.  This function resets
    433  * the current iteration position to the beginning of the text.
    434  * @param newText An iterator over the text to analyze.
    435  */
    436 void
    437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    438     // If we are holding a CharacterIterator adopted from a
    439     //   previous call to this function, delete it now.
    440     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    441         delete fCharIter;
    442     }
    443 
    444     fCharIter = newText;
    445     UErrorCode status = U_ZERO_ERROR;
    446     reset();
    447     if (newText==NULL || newText->startIndex() != 0) {
    448         // startIndex !=0 wants to be an error, but there's no way to report it.
    449         // Make the iterator text be an empty string.
    450         fText = utext_openUChars(fText, NULL, 0, &status);
    451     } else {
    452         fText = utext_openCharacterIterator(fText, newText, &status);
    453     }
    454     this->first();
    455 }
    456 
    457 /**
    458  * Set the iterator to analyze a new piece of text.  This function resets
    459  * the current iteration position to the beginning of the text.
    460  * @param newText An iterator over the text to analyze.
    461  */
    462 void
    463 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    464     UErrorCode status = U_ZERO_ERROR;
    465     reset();
    466     fText = utext_openConstUnicodeString(fText, &newText, &status);
    467 
    468     // Set up a character iterator on the string.
    469     //   Needed in case someone calls getText().
    470     //  Can not, unfortunately, do this lazily on the (probably never)
    471     //  call to getText(), because getText is const.
    472     if (fSCharIter == NULL) {
    473         fSCharIter = new StringCharacterIterator(newText);
    474     } else {
    475         fSCharIter->setText(newText);
    476     }
    477 
    478     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    479         // old fCharIter was adopted from the outside.  Delete it.
    480         delete fCharIter;
    481     }
    482     fCharIter = fSCharIter;
    483 
    484     this->first();
    485 }
    486 
    487 
    488 /**
    489  *  Provide a new UText for the input text.  Must reference text with contents identical
    490  *  to the original.
    491  *  Intended for use with text data originating in Java (garbage collected) environments
    492  *  where the data may be moved in memory at arbitrary times.
    493  */
    494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    495     if (U_FAILURE(status)) {
    496         return *this;
    497     }
    498     if (input == NULL) {
    499         status = U_ILLEGAL_ARGUMENT_ERROR;
    500         return *this;
    501     }
    502     int64_t pos = utext_getNativeIndex(fText);
    503     //  Shallow read-only clone of the new UText into the existing input UText
    504     fText = utext_clone(fText, input, FALSE, TRUE, &status);
    505     if (U_FAILURE(status)) {
    506         return *this;
    507     }
    508     utext_setNativeIndex(fText, pos);
    509     if (utext_getNativeIndex(fText) != pos) {
    510         // Sanity check.  The new input utext is supposed to have the exact same
    511         // contents as the old.  If we can't set to the same position, it doesn't.
    512         // The contents underlying the old utext might be invalid at this point,
    513         // so it's not safe to check directly.
    514         status = U_ILLEGAL_ARGUMENT_ERROR;
    515     }
    516     return *this;
    517 }
    518 
    519 
    520 /**
    521  * Sets the current iteration position to the beginning of the text.
    522  * @return The offset of the beginning of the text.
    523  */
    524 int32_t RuleBasedBreakIterator::first(void) {
    525     reset();
    526     fLastRuleStatusIndex  = 0;
    527     fLastStatusIndexValid = TRUE;
    528     //if (fText == NULL)
    529     //    return BreakIterator::DONE;
    530 
    531     utext_setNativeIndex(fText, 0);
    532     return 0;
    533 }
    534 
    535 /**
    536  * Sets the current iteration position to the end of the text.
    537  * @return The text's past-the-end offset.
    538  */
    539 int32_t RuleBasedBreakIterator::last(void) {
    540     reset();
    541     if (fText == NULL) {
    542         fLastRuleStatusIndex  = 0;
    543         fLastStatusIndexValid = TRUE;
    544         return BreakIterator::DONE;
    545     }
    546 
    547     fLastStatusIndexValid = FALSE;
    548     int32_t pos = (int32_t)utext_nativeLength(fText);
    549     utext_setNativeIndex(fText, pos);
    550     return pos;
    551 }
    552 
    553 /**
    554  * Advances the iterator either forward or backward the specified number of steps.
    555  * Negative values move backward, and positive values move forward.  This is
    556  * equivalent to repeatedly calling next() or previous().
    557  * @param n The number of steps to move.  The sign indicates the direction
    558  * (negative is backwards, and positive is forwards).
    559  * @return The character offset of the boundary position n boundaries away from
    560  * the current one.
    561  */
    562 int32_t RuleBasedBreakIterator::next(int32_t n) {
    563     int32_t result = current();
    564     while (n > 0) {
    565         result = next();
    566         --n;
    567     }
    568     while (n < 0) {
    569         result = previous();
    570         ++n;
    571     }
    572     return result;
    573 }
    574 
    575 /**
    576  * Advances the iterator to the next boundary position.
    577  * @return The position of the first boundary after this one.
    578  */
    579 int32_t RuleBasedBreakIterator::next(void) {
    580     // if we have cached break positions and we're still in the range
    581     // covered by them, just move one step forward in the cache
    582     if (fCachedBreakPositions != NULL) {
    583         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    584             ++fPositionInCache;
    585             int32_t pos = fCachedBreakPositions[fPositionInCache];
    586             utext_setNativeIndex(fText, pos);
    587             return pos;
    588         }
    589         else {
    590             reset();
    591         }
    592     }
    593 
    594     int32_t startPos = current();
    595     fDictionaryCharCount = 0;
    596     int32_t result = handleNext(fData->fForwardTable);
    597     if (fDictionaryCharCount > 0) {
    598         result = checkDictionary(startPos, result, FALSE);
    599     }
    600     return result;
    601 }
    602 
    603 /**
    604  * Advances the iterator backwards, to the last boundary preceding this one.
    605  * @return The position of the last boundary position preceding this one.
    606  */
    607 int32_t RuleBasedBreakIterator::previous(void) {
    608     int32_t result;
    609     int32_t startPos;
    610 
    611     // if we have cached break positions and we're still in the range
    612     // covered by them, just move one step backward in the cache
    613     if (fCachedBreakPositions != NULL) {
    614         if (fPositionInCache > 0) {
    615             --fPositionInCache;
    616             // If we're at the beginning of the cache, need to reevaluate the
    617             // rule status
    618             if (fPositionInCache <= 0) {
    619                 fLastStatusIndexValid = FALSE;
    620             }
    621             int32_t pos = fCachedBreakPositions[fPositionInCache];
    622             utext_setNativeIndex(fText, pos);
    623             return pos;
    624         }
    625         else {
    626             reset();
    627         }
    628     }
    629 
    630     // if we're already sitting at the beginning of the text, return DONE
    631     if (fText == NULL || (startPos = current()) == 0) {
    632         fLastRuleStatusIndex  = 0;
    633         fLastStatusIndexValid = TRUE;
    634         return BreakIterator::DONE;
    635     }
    636 
    637     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    638         result = handlePrevious(fData->fReverseTable);
    639         if (fDictionaryCharCount > 0) {
    640             result = checkDictionary(result, startPos, TRUE);
    641         }
    642         return result;
    643     }
    644 
    645     // old rule syntax
    646     // set things up.  handlePrevious() will back us up to some valid
    647     // break position before the current position (we back our internal
    648     // iterator up one step to prevent handlePrevious() from returning
    649     // the current position), but not necessarily the last one before
    650     // where we started
    651 
    652     int32_t start = current();
    653 
    654     (void)UTEXT_PREVIOUS32(fText);
    655     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    656     if (lastResult == UBRK_DONE) {
    657         lastResult = 0;
    658         utext_setNativeIndex(fText, 0);
    659     }
    660     result = lastResult;
    661     int32_t lastTag       = 0;
    662     UBool   breakTagValid = FALSE;
    663 
    664     // iterate forward from the known break position until we pass our
    665     // starting point.  The last break position before the starting
    666     // point is our return value
    667 
    668     for (;;) {
    669         result         = next();
    670         if (result == BreakIterator::DONE || result >= start) {
    671             break;
    672         }
    673         lastResult     = result;
    674         lastTag        = fLastRuleStatusIndex;
    675         breakTagValid  = TRUE;
    676     }
    677 
    678     // fLastBreakTag wants to have the value for section of text preceding
    679     // the result position that we are to return (in lastResult.)  If
    680     // the backwards rules overshot and the above loop had to do two or more
    681     // next()s to move up to the desired return position, we will have a valid
    682     // tag value. But, if handlePrevious() took us to exactly the correct result position,
    683     // we wont have a tag value for that position, which is only set by handleNext().
    684 
    685     // Set the current iteration position to be the last break position
    686     // before where we started, and then return that value.
    687     utext_setNativeIndex(fText, lastResult);
    688     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    689     fLastStatusIndexValid = breakTagValid;
    690 
    691     // No need to check the dictionary; it will have been handled by
    692     // next()
    693 
    694     return lastResult;
    695 }
    696 
    697 /**
    698  * Sets the iterator to refer to the first boundary position following
    699  * the specified position.
    700  * @offset The position from which to begin searching for a break position.
    701  * @return The position of the first break after the current position.
    702  */
    703 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    704     // if we have cached break positions and offset is in the range
    705     // covered by them, use them
    706     // TODO: could use binary search
    707     // TODO: what if offset is outside range, but break is not?
    708     if (fCachedBreakPositions != NULL) {
    709         if (offset >= fCachedBreakPositions[0]
    710                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    711             fPositionInCache = 0;
    712             // We are guaranteed not to leave the array due to range test above
    713             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    714                 ++fPositionInCache;
    715             }
    716             int32_t pos = fCachedBreakPositions[fPositionInCache];
    717             utext_setNativeIndex(fText, pos);
    718             return pos;
    719         }
    720         else {
    721             reset();
    722         }
    723     }
    724 
    725     // if the offset passed in is already past the end of the text,
    726     // just return DONE; if it's before the beginning, return the
    727     // text's starting offset
    728     fLastRuleStatusIndex  = 0;
    729     fLastStatusIndexValid = TRUE;
    730     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    731         last();
    732         return next();
    733     }
    734     else if (offset < 0) {
    735         return first();
    736     }
    737 
    738     // otherwise, set our internal iteration position (temporarily)
    739     // to the position passed in.  If this is the _beginning_ position,
    740     // then we can just use next() to get our return value
    741 
    742     int32_t result = 0;
    743 
    744     if (fData->fSafeRevTable != NULL) {
    745         // new rule syntax
    746         utext_setNativeIndex(fText, offset);
    747         // move forward one codepoint to prepare for moving back to a
    748         // safe point.
    749         // this handles offset being between a supplementary character
    750         (void)UTEXT_NEXT32(fText);
    751         // handlePrevious will move most of the time to < 1 boundary away
    752         handlePrevious(fData->fSafeRevTable);
    753         int32_t result = next();
    754         while (result <= offset) {
    755             result = next();
    756         }
    757         return result;
    758     }
    759     if (fData->fSafeFwdTable != NULL) {
    760         // backup plan if forward safe table is not available
    761         utext_setNativeIndex(fText, offset);
    762         (void)UTEXT_PREVIOUS32(fText);
    763         // handle next will give result >= offset
    764         handleNext(fData->fSafeFwdTable);
    765         // previous will give result 0 or 1 boundary away from offset,
    766         // most of the time
    767         // we have to
    768         int32_t oldresult = previous();
    769         while (oldresult > offset) {
    770             int32_t result = previous();
    771             if (result <= offset) {
    772                 return oldresult;
    773             }
    774             oldresult = result;
    775         }
    776         int32_t result = next();
    777         if (result <= offset) {
    778             return next();
    779         }
    780         return result;
    781     }
    782     // otherwise, we have to sync up first.  Use handlePrevious() to back
    783     // up to a known break position before the specified position (if
    784     // we can determine that the specified position is a break position,
    785     // we don't back up at all).  This may or may not be the last break
    786     // position at or before our starting position.  Advance forward
    787     // from here until we've passed the starting position.  The position
    788     // we stop on will be the first break position after the specified one.
    789     // old rule syntax
    790 
    791     utext_setNativeIndex(fText, offset);
    792     if (offset==0 ||
    793         (offset==1  && utext_getNativeIndex(fText)==0)) {
    794         return next();
    795     }
    796     result = previous();
    797 
    798     while (result != BreakIterator::DONE && result <= offset) {
    799         result = next();
    800     }
    801 
    802     return result;
    803 }
    804 
    805 /**
    806  * Sets the iterator to refer to the last boundary position before the
    807  * specified position.
    808  * @offset The position to begin searching for a break from.
    809  * @return The position of the last boundary before the starting position.
    810  */
    811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    812     // if we have cached break positions and offset is in the range
    813     // covered by them, use them
    814     if (fCachedBreakPositions != NULL) {
    815         // TODO: binary search?
    816         // TODO: What if offset is outside range, but break is not?
    817         if (offset > fCachedBreakPositions[0]
    818                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    819             fPositionInCache = 0;
    820             while (fPositionInCache < fNumCachedBreakPositions
    821                    && offset > fCachedBreakPositions[fPositionInCache])
    822                 ++fPositionInCache;
    823             --fPositionInCache;
    824             // If we're at the beginning of the cache, need to reevaluate the
    825             // rule status
    826             if (fPositionInCache <= 0) {
    827                 fLastStatusIndexValid = FALSE;
    828             }
    829             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    830             return fCachedBreakPositions[fPositionInCache];
    831         }
    832         else {
    833             reset();
    834         }
    835     }
    836 
    837     // if the offset passed in is already past the end of the text,
    838     // just return DONE; if it's before the beginning, return the
    839     // text's starting offset
    840     if (fText == NULL || offset > utext_nativeLength(fText)) {
    841         // return BreakIterator::DONE;
    842         return last();
    843     }
    844     else if (offset < 0) {
    845         return first();
    846     }
    847 
    848     // if we start by updating the current iteration position to the
    849     // position specified by the caller, we can just use previous()
    850     // to carry out this operation
    851 
    852     if (fData->fSafeFwdTable != NULL) {
    853         // new rule syntax
    854         utext_setNativeIndex(fText, offset);
    855         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    856         if (newOffset != offset) {
    857             // Will come here if specified offset was not a code point boundary AND
    858             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    859             //   indices to the containing code point.
    860             // For breakitereator::preceding only, these non-code-point indices need to be moved
    861             //   up to refer to the following codepoint.
    862             (void)UTEXT_NEXT32(fText);
    863             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    864         }
    865 
    866         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    867         //        rather than adjusting the position unconditionally?
    868         //        (Change would interact with safe rules.)
    869         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    870         //        affects only preceding(), seems cleaner, but is slightly different.
    871         (void)UTEXT_PREVIOUS32(fText);
    872         handleNext(fData->fSafeFwdTable);
    873         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    874         while (result >= offset) {
    875             result = previous();
    876         }
    877         return result;
    878     }
    879     if (fData->fSafeRevTable != NULL) {
    880         // backup plan if forward safe table is not available
    881         //  TODO:  check whether this path can be discarded
    882         //         It's probably OK to say that rules must supply both safe tables
    883         //            if they use safe tables at all.  We have certainly never described
    884         //            to anyone how to work with just one safe table.
    885         utext_setNativeIndex(fText, offset);
    886         (void)UTEXT_NEXT32(fText);
    887 
    888         // handle previous will give result <= offset
    889         handlePrevious(fData->fSafeRevTable);
    890 
    891         // next will give result 0 or 1 boundary away from offset,
    892         // most of the time
    893         // we have to
    894         int32_t oldresult = next();
    895         while (oldresult < offset) {
    896             int32_t result = next();
    897             if (result >= offset) {
    898                 return oldresult;
    899             }
    900             oldresult = result;
    901         }
    902         int32_t result = previous();
    903         if (result >= offset) {
    904             return previous();
    905         }
    906         return result;
    907     }
    908 
    909     // old rule syntax
    910     utext_setNativeIndex(fText, offset);
    911     return previous();
    912 }
    913 
    914 /**
    915  * Returns true if the specfied position is a boundary position.  As a side
    916  * effect, leaves the iterator pointing to the first boundary position at
    917  * or after "offset".
    918  * @param offset the offset to check.
    919  * @return True if "offset" is a boundary position.
    920  */
    921 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    922     // the beginning index of the iterator is always a boundary position by definition
    923     if (offset == 0) {
    924         first();       // For side effects on current position, tag values.
    925         return TRUE;
    926     }
    927 
    928     if (offset == (int32_t)utext_nativeLength(fText)) {
    929         last();       // For side effects on current position, tag values.
    930         return TRUE;
    931     }
    932 
    933     // out-of-range indexes are never boundary positions
    934     if (offset < 0) {
    935         first();       // For side effects on current position, tag values.
    936         return FALSE;
    937     }
    938 
    939     if (offset > utext_nativeLength(fText)) {
    940         last();        // For side effects on current position, tag values.
    941         return FALSE;
    942     }
    943 
    944     // otherwise, we can use following() on the position before the specified
    945     // one and return true if the position we get back is the one the user
    946     // specified
    947     utext_previous32From(fText, offset);
    948     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    949     UBool    result  = following(backOne) == offset;
    950     return result;
    951 }
    952 
    953 /**
    954  * Returns the current iteration position.
    955  * @return The current iteration position.
    956  */
    957 int32_t RuleBasedBreakIterator::current(void) const {
    958     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    959     return pos;
    960 }
    961 
    962 //=======================================================================
    963 // implementation
    964 //=======================================================================
    965 
    966 //
    967 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    968 //                 of user text.  A variable with this enum type keeps track of where we
    969 //                 are.  The state machine only fetches user input while in the RUN mode.
    970 //
    971 enum RBBIRunMode {
    972     RBBI_START,     // state machine processing is before first char of input
    973     RBBI_RUN,       // state machine processing is in the user text
    974     RBBI_END        // state machine processing is after end of user text.
    975 };
    976 
    977 
    978 //-----------------------------------------------------------------------------------
    979 //
    980 //  handleNext(stateTable)
    981 //     This method is the actual implementation of the rbbi next() method.
    982 //     This method initializes the state machine to state 1
    983 //     and advances through the text character by character until we reach the end
    984 //     of the text or the state machine transitions to state 0.  We update our return
    985 //     value every time the state machine passes through an accepting state.
    986 //
    987 //-----------------------------------------------------------------------------------
    988 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    989     int32_t             state;
    990     uint16_t            category        = 0;
    991     RBBIRunMode         mode;
    992 
    993     RBBIStateTableRow  *row;
    994     UChar32             c;
    995     int32_t             lookaheadStatus = 0;
    996     int32_t             lookaheadTagIdx = 0;
    997     int32_t             result          = 0;
    998     int32_t             initialPosition = 0;
    999     int32_t             lookaheadResult = 0;
   1000     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1001     const char         *tableData       = statetable->fTableData;
   1002     uint32_t            tableRowLen     = statetable->fRowLen;
   1003 
   1004     #ifdef RBBI_DEBUG
   1005         if (fTrace) {
   1006             RBBIDebugPuts("Handle Next   pos   char  state category");
   1007         }
   1008     #endif
   1009 
   1010     // No matter what, handleNext alway correctly sets the break tag value.
   1011     fLastStatusIndexValid = TRUE;
   1012     fLastRuleStatusIndex = 0;
   1013 
   1014     // if we're already at the end of the text, return DONE.
   1015     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1016     result          = initialPosition;
   1017     c               = UTEXT_NEXT32(fText);
   1018     if (fData == NULL || c==U_SENTINEL) {
   1019         return BreakIterator::DONE;
   1020     }
   1021 
   1022     //  Set the initial state for the state machine
   1023     state = START_STATE;
   1024     row = (RBBIStateTableRow *)
   1025             //(statetable->fTableData + (statetable->fRowLen * state));
   1026             (tableData + tableRowLen * state);
   1027 
   1028 
   1029     mode     = RBBI_RUN;
   1030     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1031         category = 2;
   1032         mode     = RBBI_START;
   1033     }
   1034 
   1035 
   1036     // loop until we reach the end of the text or transition to state 0
   1037     //
   1038     for (;;) {
   1039         if (c == U_SENTINEL) {
   1040             // Reached end of input string.
   1041             if (mode == RBBI_END) {
   1042                 // We have already run the loop one last time with the
   1043                 //   character set to the psueudo {eof} value.  Now it is time
   1044                 //   to unconditionally bail out.
   1045                 if (lookaheadResult > result) {
   1046                     // We ran off the end of the string with a pending look-ahead match.
   1047                     // Treat this as if the look-ahead condition had been met, and return
   1048                     //  the match at the / position from the look-ahead rule.
   1049                     result               = lookaheadResult;
   1050                     fLastRuleStatusIndex = lookaheadTagIdx;
   1051                     lookaheadStatus = 0;
   1052                 }
   1053                 break;
   1054             }
   1055             // Run the loop one last time with the fake end-of-input character category.
   1056             mode = RBBI_END;
   1057             category = 1;
   1058         }
   1059 
   1060         //
   1061         // Get the char category.  An incoming category of 1 or 2 means that
   1062         //      we are preset for doing the beginning or end of input, and
   1063         //      that we shouldn't get a category from an actual text input character.
   1064         //
   1065         if (mode == RBBI_RUN) {
   1066             // look up the current character's character category, which tells us
   1067             // which column in the state table to look at.
   1068             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1069             //        not the size of the character going in, which is a UChar32.
   1070             //
   1071             UTRIE_GET16(&fData->fTrie, c, category);
   1072 
   1073             // Check the dictionary bit in the character's category.
   1074             //    Counter is only used by dictionary based iterators (subclasses).
   1075             //    Chars that need to be handled by a dictionary have a flag bit set
   1076             //    in their category values.
   1077             //
   1078             if ((category & 0x4000) != 0)  {
   1079                 fDictionaryCharCount++;
   1080                 //  And off the dictionary flag bit.
   1081                 category &= ~0x4000;
   1082             }
   1083         }
   1084 
   1085        #ifdef RBBI_DEBUG
   1086             if (fTrace) {
   1087                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1088                 if (0x20<=c && c<0x7f) {
   1089                     RBBIDebugPrintf("\"%c\"  ", c);
   1090                 } else {
   1091                     RBBIDebugPrintf("%5x  ", c);
   1092                 }
   1093                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1094             }
   1095         #endif
   1096 
   1097         // State Transition - move machine to its next state
   1098         //
   1099 
   1100         // Note: fNextState is defined as uint16_t[2], but we are casting
   1101         // a generated RBBI table to RBBIStateTableRow and some tables
   1102         // actually have more than 2 categories.
   1103         U_ASSERT(category<fData->fHeader->fCatCount);
   1104         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1105         row = (RBBIStateTableRow *)
   1106             // (statetable->fTableData + (statetable->fRowLen * state));
   1107             (tableData + tableRowLen * state);
   1108 
   1109 
   1110         if (row->fAccepting == -1) {
   1111             // Match found, common case.
   1112             if (mode != RBBI_START) {
   1113                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1114             }
   1115             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1116         }
   1117 
   1118         if (row->fLookAhead != 0) {
   1119             if (lookaheadStatus != 0
   1120                 && row->fAccepting == lookaheadStatus) {
   1121                 // Lookahead match is completed.
   1122                 result               = lookaheadResult;
   1123                 fLastRuleStatusIndex = lookaheadTagIdx;
   1124                 lookaheadStatus      = 0;
   1125                 // TODO:  make a standalone hard break in a rule work.
   1126                 if (lookAheadHardBreak) {
   1127                     UTEXT_SETNATIVEINDEX(fText, result);
   1128                     return result;
   1129                 }
   1130                 // Look-ahead completed, but other rules may match further.  Continue on
   1131                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1132                 goto continueOn;
   1133             }
   1134 
   1135             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1136             lookaheadResult = r;
   1137             lookaheadStatus = row->fLookAhead;
   1138             lookaheadTagIdx = row->fTagIdx;
   1139             goto continueOn;
   1140         }
   1141 
   1142 
   1143         if (row->fAccepting != 0) {
   1144             // Because this is an accepting state, any in-progress look-ahead match
   1145             //   is no longer relavant.  Clear out the pending lookahead status.
   1146             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1147         }
   1148 
   1149 continueOn:
   1150         if (state == STOP_STATE) {
   1151             // This is the normal exit from the lookup state machine.
   1152             // We have advanced through the string until it is certain that no
   1153             //   longer match is possible, no matter what characters follow.
   1154             break;
   1155         }
   1156 
   1157         // Advance to the next character.
   1158         // If this is a beginning-of-input loop iteration, don't advance
   1159         //    the input position.  The next iteration will be processing the
   1160         //    first real input character.
   1161         if (mode == RBBI_RUN) {
   1162             c = UTEXT_NEXT32(fText);
   1163         } else {
   1164             if (mode == RBBI_START) {
   1165                 mode = RBBI_RUN;
   1166             }
   1167         }
   1168 
   1169 
   1170     }
   1171 
   1172     // The state machine is done.  Check whether it found a match...
   1173 
   1174     // If the iterator failed to advance in the match engine, force it ahead by one.
   1175     //   (This really indicates a defect in the break rules.  They should always match
   1176     //    at least one character.)
   1177     if (result == initialPosition) {
   1178         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1179         UTEXT_NEXT32(fText);
   1180         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1181     }
   1182 
   1183     // Leave the iterator at our result position.
   1184     UTEXT_SETNATIVEINDEX(fText, result);
   1185     #ifdef RBBI_DEBUG
   1186         if (fTrace) {
   1187             RBBIDebugPrintf("result = %d\n\n", result);
   1188         }
   1189     #endif
   1190     return result;
   1191 }
   1192 
   1193 
   1194 
   1195 //-----------------------------------------------------------------------------------
   1196 //
   1197 //  handlePrevious()
   1198 //
   1199 //      Iterate backwards, according to the logic of the reverse rules.
   1200 //      This version handles the exact style backwards rules.
   1201 //
   1202 //      The logic of this function is very similar to handleNext(), above.
   1203 //
   1204 //-----------------------------------------------------------------------------------
   1205 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1206     int32_t             state;
   1207     uint16_t            category        = 0;
   1208     RBBIRunMode         mode;
   1209     RBBIStateTableRow  *row;
   1210     UChar32             c;
   1211     int32_t             lookaheadStatus = 0;
   1212     int32_t             result          = 0;
   1213     int32_t             initialPosition = 0;
   1214     int32_t             lookaheadResult = 0;
   1215     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1216 
   1217     #ifdef RBBI_DEBUG
   1218         if (fTrace) {
   1219             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1220         }
   1221     #endif
   1222 
   1223     // handlePrevious() never gets the rule status.
   1224     // Flag the status as invalid; if the user ever asks for status, we will need
   1225     // to back up, then re-find the break position using handleNext(), which does
   1226     // get the status value.
   1227     fLastStatusIndexValid = FALSE;
   1228     fLastRuleStatusIndex = 0;
   1229 
   1230     // if we're already at the start of the text, return DONE.
   1231     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1232         return BreakIterator::DONE;
   1233     }
   1234 
   1235     //  Set up the starting char.
   1236     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1237     result          = initialPosition;
   1238     c               = UTEXT_PREVIOUS32(fText);
   1239 
   1240     //  Set the initial state for the state machine
   1241     state = START_STATE;
   1242     row = (RBBIStateTableRow *)
   1243             (statetable->fTableData + (statetable->fRowLen * state));
   1244     category = 3;
   1245     mode     = RBBI_RUN;
   1246     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1247         category = 2;
   1248         mode     = RBBI_START;
   1249     }
   1250 
   1251 
   1252     // loop until we reach the start of the text or transition to state 0
   1253     //
   1254     for (;;) {
   1255         if (c == U_SENTINEL) {
   1256             // Reached end of input string.
   1257             if (mode == RBBI_END) {
   1258                 // We have already run the loop one last time with the
   1259                 //   character set to the psueudo {eof} value.  Now it is time
   1260                 //   to unconditionally bail out.
   1261                 if (lookaheadResult < result) {
   1262                     // We ran off the end of the string with a pending look-ahead match.
   1263                     // Treat this as if the look-ahead condition had been met, and return
   1264                     //  the match at the / position from the look-ahead rule.
   1265                     result               = lookaheadResult;
   1266                     lookaheadStatus = 0;
   1267                 } else if (result == initialPosition) {
   1268                     // Ran off start, no match found.
   1269                     // move one index one (towards the start, since we are doing a previous())
   1270                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1271                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1272                 }
   1273                 break;
   1274             }
   1275             // Run the loop one last time with the fake end-of-input character category.
   1276             mode = RBBI_END;
   1277             category = 1;
   1278         }
   1279 
   1280         //
   1281         // Get the char category.  An incoming category of 1 or 2 means that
   1282         //      we are preset for doing the beginning or end of input, and
   1283         //      that we shouldn't get a category from an actual text input character.
   1284         //
   1285         if (mode == RBBI_RUN) {
   1286             // look up the current character's character category, which tells us
   1287             // which column in the state table to look at.
   1288             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1289             //        not the size of the character going in, which is a UChar32.
   1290             //
   1291             UTRIE_GET16(&fData->fTrie, c, category);
   1292 
   1293             // Check the dictionary bit in the character's category.
   1294             //    Counter is only used by dictionary based iterators (subclasses).
   1295             //    Chars that need to be handled by a dictionary have a flag bit set
   1296             //    in their category values.
   1297             //
   1298             if ((category & 0x4000) != 0)  {
   1299                 fDictionaryCharCount++;
   1300                 //  And off the dictionary flag bit.
   1301                 category &= ~0x4000;
   1302             }
   1303         }
   1304 
   1305         #ifdef RBBI_DEBUG
   1306             if (fTrace) {
   1307                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1308                 if (0x20<=c && c<0x7f) {
   1309                     RBBIDebugPrintf("\"%c\"  ", c);
   1310                 } else {
   1311                     RBBIDebugPrintf("%5x  ", c);
   1312                 }
   1313                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1314             }
   1315         #endif
   1316 
   1317         // State Transition - move machine to its next state
   1318         //
   1319 
   1320         // Note: fNextState is defined as uint16_t[2], but we are casting
   1321         // a generated RBBI table to RBBIStateTableRow and some tables
   1322         // actually have more than 2 categories.
   1323         U_ASSERT(category<fData->fHeader->fCatCount);
   1324         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1325         row = (RBBIStateTableRow *)
   1326             (statetable->fTableData + (statetable->fRowLen * state));
   1327 
   1328         if (row->fAccepting == -1) {
   1329             // Match found, common case.
   1330             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1331         }
   1332 
   1333         if (row->fLookAhead != 0) {
   1334             if (lookaheadStatus != 0
   1335                 && row->fAccepting == lookaheadStatus) {
   1336                 // Lookahead match is completed.
   1337                 result               = lookaheadResult;
   1338                 lookaheadStatus      = 0;
   1339                 // TODO:  make a standalone hard break in a rule work.
   1340                 if (lookAheadHardBreak) {
   1341                     UTEXT_SETNATIVEINDEX(fText, result);
   1342                     return result;
   1343                 }
   1344                 // Look-ahead completed, but other rules may match further.  Continue on
   1345                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1346                 goto continueOn;
   1347             }
   1348 
   1349             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1350             lookaheadResult = r;
   1351             lookaheadStatus = row->fLookAhead;
   1352             goto continueOn;
   1353         }
   1354 
   1355 
   1356         if (row->fAccepting != 0) {
   1357             // Because this is an accepting state, any in-progress look-ahead match
   1358             //   is no longer relavant.  Clear out the pending lookahead status.
   1359             lookaheadStatus = 0;
   1360         }
   1361 
   1362 continueOn:
   1363         if (state == STOP_STATE) {
   1364             // This is the normal exit from the lookup state machine.
   1365             // We have advanced through the string until it is certain that no
   1366             //   longer match is possible, no matter what characters follow.
   1367             break;
   1368         }
   1369 
   1370         // Move (backwards) to the next character to process.
   1371         // If this is a beginning-of-input loop iteration, don't advance
   1372         //    the input position.  The next iteration will be processing the
   1373         //    first real input character.
   1374         if (mode == RBBI_RUN) {
   1375             c = UTEXT_PREVIOUS32(fText);
   1376         } else {
   1377             if (mode == RBBI_START) {
   1378                 mode = RBBI_RUN;
   1379             }
   1380         }
   1381     }
   1382 
   1383     // The state machine is done.  Check whether it found a match...
   1384 
   1385     // If the iterator failed to advance in the match engine, force it ahead by one.
   1386     //   (This really indicates a defect in the break rules.  They should always match
   1387     //    at least one character.)
   1388     if (result == initialPosition) {
   1389         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1390         UTEXT_PREVIOUS32(fText);
   1391         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1392     }
   1393 
   1394     // Leave the iterator at our result position.
   1395     UTEXT_SETNATIVEINDEX(fText, result);
   1396     #ifdef RBBI_DEBUG
   1397         if (fTrace) {
   1398             RBBIDebugPrintf("result = %d\n\n", result);
   1399         }
   1400     #endif
   1401     return result;
   1402 }
   1403 
   1404 
   1405 void
   1406 RuleBasedBreakIterator::reset()
   1407 {
   1408     if (fCachedBreakPositions) {
   1409         uprv_free(fCachedBreakPositions);
   1410     }
   1411     fCachedBreakPositions = NULL;
   1412     fNumCachedBreakPositions = 0;
   1413     fDictionaryCharCount = 0;
   1414     fPositionInCache = 0;
   1415 }
   1416 
   1417 
   1418 
   1419 //-------------------------------------------------------------------------------
   1420 //
   1421 //   getRuleStatus()   Return the break rule tag associated with the current
   1422 //                     iterator position.  If the iterator arrived at its current
   1423 //                     position by iterating forwards, the value will have been
   1424 //                     cached by the handleNext() function.
   1425 //
   1426 //                     If no cached status value is available, the status is
   1427 //                     found by doing a previous() followed by a next(), which
   1428 //                     leaves the iterator where it started, and computes the
   1429 //                     status while doing the next().
   1430 //
   1431 //-------------------------------------------------------------------------------
   1432 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1433     if (fLastStatusIndexValid == FALSE) {
   1434         //  No cached status is available.
   1435         if (fText == NULL || current() == 0) {
   1436             //  At start of text, or there is no text.  Status is always zero.
   1437             fLastRuleStatusIndex = 0;
   1438             fLastStatusIndexValid = TRUE;
   1439         } else {
   1440             //  Not at start of text.  Find status the tedious way.
   1441             int32_t pa = current();
   1442             previous();
   1443             if (fNumCachedBreakPositions > 0) {
   1444                 reset();                // Blow off the dictionary cache
   1445             }
   1446             int32_t pb = next();
   1447             if (pa != pb) {
   1448                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1449                 //       unused local variables on gcc.  Logically, it isn't needed.
   1450                 U_ASSERT(pa == pb);
   1451             }
   1452         }
   1453     }
   1454     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1455 }
   1456 
   1457 
   1458 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1459     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1460     nonConstThis->makeRuleStatusValid();
   1461 
   1462     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1463     //                                                 (the number of status values.)
   1464     //   This function returns the last (largest) of the array of status values.
   1465     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1466     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1467 
   1468     return tagVal;
   1469 }
   1470 
   1471 
   1472 
   1473 
   1474 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1475              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1476 {
   1477     if (U_FAILURE(status)) {
   1478         return 0;
   1479     }
   1480 
   1481     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1482     nonConstThis->makeRuleStatusValid();
   1483     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1484     int32_t  numValsToCopy = numVals;
   1485     if (numVals > capacity) {
   1486         status = U_BUFFER_OVERFLOW_ERROR;
   1487         numValsToCopy = capacity;
   1488     }
   1489     int i;
   1490     for (i=0; i<numValsToCopy; i++) {
   1491         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1492     }
   1493     return numVals;
   1494 }
   1495 
   1496 
   1497 
   1498 //-------------------------------------------------------------------------------
   1499 //
   1500 //   getBinaryRules        Access to the compiled form of the rules,
   1501 //                         for use by build system tools that save the data
   1502 //                         for standard iterator types.
   1503 //
   1504 //-------------------------------------------------------------------------------
   1505 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1506     const uint8_t  *retPtr = NULL;
   1507     length = 0;
   1508 
   1509     if (fData != NULL) {
   1510         retPtr = (const uint8_t *)fData->fHeader;
   1511         length = fData->fHeader->fLength;
   1512     }
   1513     return retPtr;
   1514 }
   1515 
   1516 
   1517 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
   1518                                    int32_t &bufferSize,
   1519                                    UErrorCode &status)
   1520 {
   1521     if (U_FAILURE(status)){
   1522         return NULL;
   1523     }
   1524 
   1525     if (bufferSize == 0) {
   1526         bufferSize = 1;  // preflighting for deprecated functionality
   1527         return NULL;
   1528     }
   1529 
   1530     BreakIterator *clonedBI = clone();
   1531     if (clonedBI == NULL) {
   1532         status = U_MEMORY_ALLOCATION_ERROR;
   1533     } else {
   1534         status = U_SAFECLONE_ALLOCATED_WARNING;
   1535     }
   1536     return (RuleBasedBreakIterator *)clonedBI;
   1537 }
   1538 
   1539 
   1540 //-------------------------------------------------------------------------------
   1541 //
   1542 //  isDictionaryChar      Return true if the category lookup for this char
   1543 //                        indicates that it is in the set of dictionary lookup
   1544 //                        chars.
   1545 //
   1546 //                        This function is intended for use by dictionary based
   1547 //                        break iterators.
   1548 //
   1549 //-------------------------------------------------------------------------------
   1550 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1551     if (fData == NULL) {
   1552         return FALSE;
   1553     }
   1554     uint16_t category;
   1555     UTRIE_GET16(&fData->fTrie, c, category);
   1556     return (category & 0x4000) != 0;
   1557 }*/
   1558 
   1559 
   1560 //-------------------------------------------------------------------------------
   1561 //
   1562 //  checkDictionary       This function handles all processing of characters in
   1563 //                        the "dictionary" set. It will determine the appropriate
   1564 //                        course of action, and possibly set up a cache in the
   1565 //                        process.
   1566 //
   1567 //-------------------------------------------------------------------------------
   1568 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1569                             int32_t endPos,
   1570                             UBool reverse) {
   1571     // Reset the old break cache first.
   1572     reset();
   1573 
   1574     // note: code segment below assumes that dictionary chars are in the
   1575     // startPos-endPos range
   1576     // value returned should be next character in sequence
   1577     if ((endPos - startPos) <= 1) {
   1578         return (reverse ? startPos : endPos);
   1579     }
   1580 
   1581     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
   1582     //      because native indexes are different from UTF-16 indexes.
   1583     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
   1584     //      It wont give the right breaks, but it's better than a crash.
   1585     //
   1586     //      Check the type of the UText by checking its pFuncs field, which
   1587     //      is UText's function dispatch table.  It will be the same for all
   1588     //      UTF-8 UTexts and different for any other UText type.
   1589     //
   1590     //      We have no other type of UText available with non-UTF-16 native indexing.
   1591     //      This whole check will go away once the dictionary code is fixed.
   1592     static const void *utext_utf8Funcs;
   1593     if (utext_utf8Funcs == NULL) {
   1594         // Cache the UTF-8 UText function pointer value.
   1595         UErrorCode status = U_ZERO_ERROR;
   1596         UText tempUText = UTEXT_INITIALIZER;
   1597         utext_openUTF8(&tempUText, NULL, 0, &status);
   1598         utext_utf8Funcs = tempUText.pFuncs;
   1599         utext_close(&tempUText);
   1600     }
   1601     if (fText->pFuncs == utext_utf8Funcs) {
   1602         return (reverse ? startPos : endPos);
   1603     }
   1604 
   1605     // Starting from the starting point, scan towards the proposed result,
   1606     // looking for the first dictionary character (which may be the one
   1607     // we're on, if we're starting in the middle of a range).
   1608     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1609     if (reverse) {
   1610         UTEXT_PREVIOUS32(fText);
   1611     }
   1612 
   1613     int32_t rangeStart = startPos;
   1614     int32_t rangeEnd = endPos;
   1615 
   1616     uint16_t    category;
   1617     int32_t     current;
   1618     UErrorCode  status = U_ZERO_ERROR;
   1619     UStack      breaks(status);
   1620     int32_t     foundBreakCount = 0;
   1621     UChar32     c = utext_current32(fText);
   1622 
   1623     UTRIE_GET16(&fData->fTrie, c, category);
   1624 
   1625     // Is the character we're starting on a dictionary character? If so, we
   1626     // need to back up to include the entire run; otherwise the results of
   1627     // the break algorithm will differ depending on where we start. Since
   1628     // the result is cached and there is typically a non-dictionary break
   1629     // within a small number of words, there should be little performance impact.
   1630     if (category & 0x4000) {
   1631         if (reverse) {
   1632             do {
   1633                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1634                 c = utext_current32(fText);
   1635                 UTRIE_GET16(&fData->fTrie, c, category);
   1636             } while (c != U_SENTINEL && (category & 0x4000));
   1637             // Back up to the last dictionary character
   1638             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1639             if (c == U_SENTINEL) {
   1640                 // c = fText->last32();
   1641                 //   TODO:  why was this if needed?
   1642                 c = UTEXT_PREVIOUS32(fText);
   1643             }
   1644             else {
   1645                 c = UTEXT_PREVIOUS32(fText);
   1646             }
   1647         }
   1648         else {
   1649             do {
   1650                 c = UTEXT_PREVIOUS32(fText);
   1651                 UTRIE_GET16(&fData->fTrie, c, category);
   1652             }
   1653             while (c != U_SENTINEL && (category & 0x4000));
   1654             // Back up to the last dictionary character
   1655             if (c == U_SENTINEL) {
   1656                 // c = fText->first32();
   1657                 c = utext_current32(fText);
   1658             }
   1659             else {
   1660                 utext_next32(fText);
   1661                 c = utext_current32(fText);
   1662             }
   1663             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1664         }
   1665         UTRIE_GET16(&fData->fTrie, c, category);
   1666     }
   1667 
   1668     // Loop through the text, looking for ranges of dictionary characters.
   1669     // For each span, find the appropriate break engine, and ask it to find
   1670     // any breaks within the span.
   1671     // Note: we always do this in the forward direction, so that the break
   1672     // cache is built in the right order.
   1673     if (reverse) {
   1674         utext_setNativeIndex(fText, rangeStart);
   1675         c = utext_current32(fText);
   1676         UTRIE_GET16(&fData->fTrie, c, category);
   1677     }
   1678     while(U_SUCCESS(status)) {
   1679         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1680             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1681             c = utext_current32(fText);
   1682             UTRIE_GET16(&fData->fTrie, c, category);
   1683         }
   1684         if (current >= rangeEnd) {
   1685             break;
   1686         }
   1687 
   1688         // We now have a dictionary character. Get the appropriate language object
   1689         // to deal with it.
   1690         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1691 
   1692         // Ask the language object if there are any breaks. It will leave the text
   1693         // pointer on the other side of its range, ready to search for the next one.
   1694         if (lbe != NULL) {
   1695             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1696         }
   1697 
   1698         // Reload the loop variables for the next go-round
   1699         c = utext_current32(fText);
   1700         UTRIE_GET16(&fData->fTrie, c, category);
   1701     }
   1702 
   1703     // If we found breaks, build a new break cache. The first and last entries must
   1704     // be the original starting and ending position.
   1705     if (foundBreakCount > 0) {
   1706         U_ASSERT(foundBreakCount == breaks.size());
   1707         int32_t totalBreaks = foundBreakCount;
   1708         if (startPos < breaks.elementAti(0)) {
   1709             totalBreaks += 1;
   1710         }
   1711         if (endPos > breaks.peeki()) {
   1712             totalBreaks += 1;
   1713         }
   1714         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1715         if (fCachedBreakPositions != NULL) {
   1716             int32_t out = 0;
   1717             fNumCachedBreakPositions = totalBreaks;
   1718             if (startPos < breaks.elementAti(0)) {
   1719                 fCachedBreakPositions[out++] = startPos;
   1720             }
   1721             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1722                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1723             }
   1724             if (endPos > fCachedBreakPositions[out-1]) {
   1725                 fCachedBreakPositions[out] = endPos;
   1726             }
   1727             // If there are breaks, then by definition, we are replacing the original
   1728             // proposed break by one of the breaks we found. Use following() and
   1729             // preceding() to do the work. They should never recurse in this case.
   1730             if (reverse) {
   1731                 return preceding(endPos);
   1732             }
   1733             else {
   1734                 return following(startPos);
   1735             }
   1736         }
   1737         // If the allocation failed, just fall through to the "no breaks found" case.
   1738     }
   1739 
   1740     // If we get here, there were no language-based breaks. Set the text pointer
   1741     // to the original proposed break.
   1742     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1743     return (reverse ? startPos : endPos);
   1744 }
   1745 
   1746 // defined in ucln_cmn.h
   1747 
   1748 U_NAMESPACE_END
   1749 
   1750 
   1751 static icu::UStack *gLanguageBreakFactories = NULL;
   1752 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
   1753 
   1754 /**
   1755  * Release all static memory held by breakiterator.
   1756  */
   1757 U_CDECL_BEGIN
   1758 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1759     if (gLanguageBreakFactories) {
   1760         delete gLanguageBreakFactories;
   1761         gLanguageBreakFactories = NULL;
   1762     }
   1763     gLanguageBreakFactoriesInitOnce.reset();
   1764     return TRUE;
   1765 }
   1766 U_CDECL_END
   1767 
   1768 U_CDECL_BEGIN
   1769 static void U_CALLCONV _deleteFactory(void *obj) {
   1770     delete (icu::LanguageBreakFactory *) obj;
   1771 }
   1772 U_CDECL_END
   1773 U_NAMESPACE_BEGIN
   1774 
   1775 static void U_CALLCONV initLanguageFactories() {
   1776     UErrorCode status = U_ZERO_ERROR;
   1777     U_ASSERT(gLanguageBreakFactories == NULL);
   1778     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
   1779     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
   1780         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1781         gLanguageBreakFactories->push(builtIn, status);
   1782 #ifdef U_LOCAL_SERVICE_HOOK
   1783         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1784         if (extra != NULL) {
   1785             gLanguageBreakFactories->push(extra, status);
   1786         }
   1787 #endif
   1788     }
   1789     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1790 }
   1791 
   1792 
   1793 static const LanguageBreakEngine*
   1794 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1795 {
   1796     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
   1797     if (gLanguageBreakFactories == NULL) {
   1798         return NULL;
   1799     }
   1800 
   1801     int32_t i = gLanguageBreakFactories->size();
   1802     const LanguageBreakEngine *lbe = NULL;
   1803     while (--i >= 0) {
   1804         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1805         lbe = factory->getEngineFor(c, breakType);
   1806         if (lbe != NULL) {
   1807             break;
   1808         }
   1809     }
   1810     return lbe;
   1811 }
   1812 
   1813 
   1814 //-------------------------------------------------------------------------------
   1815 //
   1816 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1817 //                          the character c.
   1818 //
   1819 //-------------------------------------------------------------------------------
   1820 const LanguageBreakEngine *
   1821 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1822     const LanguageBreakEngine *lbe = NULL;
   1823     UErrorCode status = U_ZERO_ERROR;
   1824 
   1825     if (fLanguageBreakEngines == NULL) {
   1826         fLanguageBreakEngines = new UStack(status);
   1827         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1828             delete fLanguageBreakEngines;
   1829             fLanguageBreakEngines = 0;
   1830             return NULL;
   1831         }
   1832     }
   1833 
   1834     int32_t i = fLanguageBreakEngines->size();
   1835     while (--i >= 0) {
   1836         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1837         if (lbe->handles(c, fBreakType)) {
   1838             return lbe;
   1839         }
   1840     }
   1841 
   1842     // No existing dictionary took the character. See if a factory wants to
   1843     // give us a new LanguageBreakEngine for this character.
   1844     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1845 
   1846     // If we got one, use it and push it on our stack.
   1847     if (lbe != NULL) {
   1848         fLanguageBreakEngines->push((void *)lbe, status);
   1849         // Even if we can't remember it, we can keep looking it up, so
   1850         // return it even if the push fails.
   1851         return lbe;
   1852     }
   1853 
   1854     // No engine is forthcoming for this character. Add it to the
   1855     // reject set. Create the reject break engine if needed.
   1856     if (fUnhandledBreakEngine == NULL) {
   1857         fUnhandledBreakEngine = new UnhandledEngine(status);
   1858         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1859             status = U_MEMORY_ALLOCATION_ERROR;
   1860         }
   1861         // Put it last so that scripts for which we have an engine get tried
   1862         // first.
   1863         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1864         // If we can't insert it, or creation failed, get rid of it
   1865         if (U_FAILURE(status)) {
   1866             delete fUnhandledBreakEngine;
   1867             fUnhandledBreakEngine = 0;
   1868             return NULL;
   1869         }
   1870     }
   1871 
   1872     // Tell the reject engine about the character; at its discretion, it may
   1873     // add more than just the one character.
   1874     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1875 
   1876     return fUnhandledBreakEngine;
   1877 }
   1878 
   1879 
   1880 
   1881 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1882     return fBreakType;
   1883 }*/
   1884 
   1885 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1886     fBreakType = type;
   1887     reset();
   1888 }
   1889 
   1890 U_NAMESPACE_END
   1891 
   1892 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1893