Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2008 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include "unicode/rbbi.h"
     18 #include "unicode/schriter.h"
     19 #include "unicode/uchriter.h"
     20 #include "unicode/udata.h"
     21 #include "unicode/uclean.h"
     22 #include "rbbidata.h"
     23 #include "rbbirb.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "umutex.h"
     27 #include "ucln_cmn.h"
     28 #include "brkeng.h"
     29 
     30 #include "uassert.h"
     31 #include "uvector.h"
     32 
     33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     34 #if U_LOCAL_SERVICE_HOOK
     35 #include "localsvc.h"
     36 #endif
     37 
     38 #ifdef RBBI_DEBUG
     39 static UBool fTrace = FALSE;
     40 #endif
     41 
     42 U_NAMESPACE_BEGIN
     43 
     44 // The state number of the starting state
     45 #define START_STATE 1
     46 
     47 // The state-transition value indicating "stop"
     48 #define STOP_STATE  0
     49 
     50 
     51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     52 
     53 
     54 //=======================================================================
     55 // constructors
     56 //=======================================================================
     57 
     58 /**
     59  * Constructs a RuleBasedBreakIterator that uses the already-created
     60  * tables object that is passed in as a parameter.
     61  */
     62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     63 {
     64     init();
     65     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     66     if (U_FAILURE(status)) {return;}
     67     if(fData == 0) {
     68         status = U_MEMORY_ALLOCATION_ERROR;
     69         return;
     70     }
     71 }
     72 
     73 /**
     74  * Same as above but does not adopt memory
     75  */
     76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     77 {
     78     init();
     79     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     80     if (U_FAILURE(status)) {return;}
     81     if(fData == 0) {
     82         status = U_MEMORY_ALLOCATION_ERROR;
     83         return;
     84     }
     85 }
     86 
     87 //-------------------------------------------------------------------------------
     88 //
     89 //   Constructor   from a UDataMemory handle to precompiled break rules
     90 //                 stored in an ICU data file.
     91 //
     92 //-------------------------------------------------------------------------------
     93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
     94 {
     95     init();
     96     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     97     if (U_FAILURE(status)) {return;}
     98     if(fData == 0) {
     99         status = U_MEMORY_ALLOCATION_ERROR;
    100         return;
    101     }
    102 }
    103 
    104 
    105 
    106 //-------------------------------------------------------------------------------
    107 //
    108 //   Constructor       from a set of rules supplied as a string.
    109 //
    110 //-------------------------------------------------------------------------------
    111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    112                                                 UParseError          &parseError,
    113                                                 UErrorCode           &status)
    114 {
    115     init();
    116     if (U_FAILURE(status)) {return;}
    117     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    118         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    119     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    120     //        creates and returns a complete RBBI.  From here, in a constructor, we
    121     //        can't just return the object created by the builder factory, hence
    122     //        the assignment of the factory created object to "this".
    123     if (U_SUCCESS(status)) {
    124         *this = *bi;
    125         delete bi;
    126     }
    127 }
    128 
    129 
    130 //-------------------------------------------------------------------------------
    131 //
    132 // Default Constructor.      Create an empty shell that can be set up later.
    133 //                           Used when creating a RuleBasedBreakIterator from a set
    134 //                           of rules.
    135 //-------------------------------------------------------------------------------
    136 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    137     init();
    138 }
    139 
    140 
    141 //-------------------------------------------------------------------------------
    142 //
    143 //   Copy constructor.  Will produce a break iterator with the same behavior,
    144 //                      and which iterates over the same text, as the one passed in.
    145 //
    146 //-------------------------------------------------------------------------------
    147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    148 : BreakIterator(other)
    149 {
    150     this->init();
    151     *this = other;
    152 }
    153 
    154 
    155 /**
    156  * Destructor
    157  */
    158 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    159     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    160         // fCharIter was adopted from the outside.
    161         delete fCharIter;
    162     }
    163     fCharIter = NULL;
    164     delete fSCharIter;
    165     fCharIter = NULL;
    166     delete fDCharIter;
    167     fDCharIter = NULL;
    168 
    169     utext_close(fText);
    170 
    171     if (fData != NULL) {
    172         fData->removeReference();
    173         fData = NULL;
    174     }
    175     if (fCachedBreakPositions) {
    176         uprv_free(fCachedBreakPositions);
    177         fCachedBreakPositions = NULL;
    178     }
    179     if (fLanguageBreakEngines) {
    180         delete fLanguageBreakEngines;
    181         fLanguageBreakEngines = NULL;
    182     }
    183     if (fUnhandledBreakEngine) {
    184         delete fUnhandledBreakEngine;
    185         fUnhandledBreakEngine = NULL;
    186     }
    187 }
    188 
    189 /**
    190  * Assignment operator.  Sets this iterator to have the same behavior,
    191  * and iterate over the same text, as the one passed in.
    192  */
    193 RuleBasedBreakIterator&
    194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    195     if (this == &that) {
    196         return *this;
    197     }
    198     reset();    // Delete break cache information
    199     fBreakType = that.fBreakType;
    200     if (fLanguageBreakEngines != NULL) {
    201         delete fLanguageBreakEngines;
    202         fLanguageBreakEngines = NULL;   // Just rebuild for now
    203     }
    204     // TODO: clone fLanguageBreakEngines from "that"
    205     UErrorCode status = U_ZERO_ERROR;
    206     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    207 
    208     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    209         delete fCharIter;
    210     }
    211     fCharIter = NULL;
    212 
    213     if (that.fCharIter != NULL ) {
    214         // This is a little bit tricky - it will intially appear that
    215         //  this->fCharIter is adopted, even if that->fCharIter was
    216         //  not adopted.  That's ok.
    217         fCharIter = that.fCharIter->clone();
    218     }
    219 
    220     if (fData != NULL) {
    221         fData->removeReference();
    222         fData = NULL;
    223     }
    224     if (that.fData != NULL) {
    225         fData = that.fData->addReference();
    226     }
    227 
    228     return *this;
    229 }
    230 
    231 
    232 
    233 //-----------------------------------------------------------------------------
    234 //
    235 //    init()      Shared initialization routine.   Used by all the constructors.
    236 //                Initializes all fields, leaving the object in a consistent state.
    237 //
    238 //-----------------------------------------------------------------------------
    239 void RuleBasedBreakIterator::init() {
    240     UErrorCode  status    = U_ZERO_ERROR;
    241     fBufferClone          = FALSE;
    242     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    243     fCharIter             = NULL;
    244     fSCharIter            = NULL;
    245     fDCharIter            = NULL;
    246     fData                 = NULL;
    247     fLastRuleStatusIndex  = 0;
    248     fLastStatusIndexValid = TRUE;
    249     fDictionaryCharCount  = 0;
    250     fBreakType            = -1;
    251 
    252     fCachedBreakPositions    = NULL;
    253     fLanguageBreakEngines    = NULL;
    254     fUnhandledBreakEngine    = NULL;
    255     fNumCachedBreakPositions = 0;
    256     fPositionInCache         = 0;
    257 
    258 #ifdef RBBI_DEBUG
    259     static UBool debugInitDone = FALSE;
    260     if (debugInitDone == FALSE) {
    261         char *debugEnv = getenv("U_RBBIDEBUG");
    262         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    263             fTrace = TRUE;
    264         }
    265         debugInitDone = TRUE;
    266     }
    267 #endif
    268 }
    269 
    270 
    271 
    272 //-----------------------------------------------------------------------------
    273 //
    274 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    275 //            behavior, and iterating over the same text, as this one.
    276 //            Virtual function: does the right thing with subclasses.
    277 //
    278 //-----------------------------------------------------------------------------
    279 BreakIterator*
    280 RuleBasedBreakIterator::clone(void) const {
    281     return new RuleBasedBreakIterator(*this);
    282 }
    283 
    284 /**
    285  * Equality operator.  Returns TRUE if both BreakIterators are of the
    286  * same class, have the same behavior, and iterate over the same text.
    287  */
    288 UBool
    289 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    290     if (that.getDynamicClassID() != getDynamicClassID()) {
    291         return FALSE;
    292     }
    293 
    294     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    295 
    296     if (!utext_equals(fText, that2.fText)) {
    297         // The two break iterators are operating on different text,
    298         //   or have a different interation position.
    299         return FALSE;
    300     };
    301 
    302     // TODO:  need a check for when in a dictionary region at different offsets.
    303 
    304     if (that2.fData == fData ||
    305         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    306             // The two break iterators are using the same rules.
    307             return TRUE;
    308         }
    309     return FALSE;
    310 }
    311 
    312 /**
    313  * Compute a hash code for this BreakIterator
    314  * @return A hash code
    315  */
    316 int32_t
    317 RuleBasedBreakIterator::hashCode(void) const {
    318     int32_t   hash = 0;
    319     if (fData != NULL) {
    320         hash = fData->hashCode();
    321     }
    322     return hash;
    323 }
    324 
    325 
    326 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    327     if (U_FAILURE(status)) {
    328         return;
    329     }
    330     reset();
    331     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    332 
    333     // Set up a dummy CharacterIterator to be returned if anyone
    334     //   calls getText().  With input from UText, there is no reasonable
    335     //   way to return a characterIterator over the actual input text.
    336     //   Return one over an empty string instead - this is the closest
    337     //   we can come to signaling a failure.
    338     //   (GetText() is obsolete, this failure is sort of OK)
    339     if (fDCharIter == NULL) {
    340         static const UChar c = 0;
    341         fDCharIter = new UCharCharacterIterator(&c, 0);
    342         if (fDCharIter == NULL) {
    343             status = U_MEMORY_ALLOCATION_ERROR;
    344             return;
    345         }
    346     }
    347 
    348     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    349         // existing fCharIter was adopted from the outside.  Delete it now.
    350         delete fCharIter;
    351     }
    352     fCharIter = fDCharIter;
    353 
    354     this->first();
    355 }
    356 
    357 
    358 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    359     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    360     return result;
    361 }
    362 
    363 
    364 
    365 /**
    366  * Returns the description used to create this iterator
    367  */
    368 const UnicodeString&
    369 RuleBasedBreakIterator::getRules() const {
    370     if (fData != NULL) {
    371         return fData->getRuleSourceString();
    372     } else {
    373         static const UnicodeString *s;
    374         if (s == NULL) {
    375             // TODO:  something more elegant here.
    376             //        perhaps API should return the string by value.
    377             //        Note:  thread unsafe init & leak are semi-ok, better than
    378             //               what was before.  Sould be cleaned up, though.
    379             s = new UnicodeString;
    380         }
    381         return *s;
    382     }
    383 }
    384 
    385 //=======================================================================
    386 // BreakIterator overrides
    387 //=======================================================================
    388 
    389 /**
    390  * Return a CharacterIterator over the text being analyzed.
    391  */
    392 CharacterIterator&
    393 RuleBasedBreakIterator::getText() const {
    394     return *fCharIter;
    395 }
    396 
    397 /**
    398  * Set the iterator to analyze a new piece of text.  This function resets
    399  * the current iteration position to the beginning of the text.
    400  * @param newText An iterator over the text to analyze.
    401  */
    402 void
    403 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    404     // If we are holding a CharacterIterator adopted from a
    405     //   previous call to this function, delete it now.
    406     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    407         delete fCharIter;
    408     }
    409 
    410     fCharIter = newText;
    411     UErrorCode status = U_ZERO_ERROR;
    412     reset();
    413     if (newText==NULL || newText->startIndex() != 0) {
    414         // startIndex !=0 wants to be an error, but there's no way to report it.
    415         // Make the iterator text be an empty string.
    416         fText = utext_openUChars(fText, NULL, 0, &status);
    417     } else {
    418         fText = utext_openCharacterIterator(fText, newText, &status);
    419     }
    420     this->first();
    421 }
    422 
    423 /**
    424  * Set the iterator to analyze a new piece of text.  This function resets
    425  * the current iteration position to the beginning of the text.
    426  * @param newText An iterator over the text to analyze.
    427  */
    428 void
    429 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    430     UErrorCode status = U_ZERO_ERROR;
    431     reset();
    432     fText = utext_openConstUnicodeString(fText, &newText, &status);
    433 
    434     // Set up a character iterator on the string.
    435     //   Needed in case someone calls getText().
    436     //  Can not, unfortunately, do this lazily on the (probably never)
    437     //  call to getText(), because getText is const.
    438     if (fSCharIter == NULL) {
    439         fSCharIter = new StringCharacterIterator(newText);
    440     } else {
    441         fSCharIter->setText(newText);
    442     }
    443 
    444     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    445         // old fCharIter was adopted from the outside.  Delete it.
    446         delete fCharIter;
    447     }
    448     fCharIter = fSCharIter;
    449 
    450     this->first();
    451 }
    452 
    453 
    454 
    455 /**
    456  * Sets the current iteration position to the beginning of the text.
    457  * @return The offset of the beginning of the text.
    458  */
    459 int32_t RuleBasedBreakIterator::first(void) {
    460     reset();
    461     fLastRuleStatusIndex  = 0;
    462     fLastStatusIndexValid = TRUE;
    463     //if (fText == NULL)
    464     //    return BreakIterator::DONE;
    465 
    466     utext_setNativeIndex(fText, 0);
    467     return 0;
    468 }
    469 
    470 /**
    471  * Sets the current iteration position to the end of the text.
    472  * @return The text's past-the-end offset.
    473  */
    474 int32_t RuleBasedBreakIterator::last(void) {
    475     reset();
    476     if (fText == NULL) {
    477         fLastRuleStatusIndex  = 0;
    478         fLastStatusIndexValid = TRUE;
    479         return BreakIterator::DONE;
    480     }
    481 
    482     fLastStatusIndexValid = FALSE;
    483     int32_t pos = (int32_t)utext_nativeLength(fText);
    484     utext_setNativeIndex(fText, pos);
    485     return pos;
    486 }
    487 
    488 /**
    489  * Advances the iterator either forward or backward the specified number of steps.
    490  * Negative values move backward, and positive values move forward.  This is
    491  * equivalent to repeatedly calling next() or previous().
    492  * @param n The number of steps to move.  The sign indicates the direction
    493  * (negative is backwards, and positive is forwards).
    494  * @return The character offset of the boundary position n boundaries away from
    495  * the current one.
    496  */
    497 int32_t RuleBasedBreakIterator::next(int32_t n) {
    498     int32_t result = current();
    499     while (n > 0) {
    500         result = next();
    501         --n;
    502     }
    503     while (n < 0) {
    504         result = previous();
    505         ++n;
    506     }
    507     return result;
    508 }
    509 
    510 /**
    511  * Advances the iterator to the next boundary position.
    512  * @return The position of the first boundary after this one.
    513  */
    514 int32_t RuleBasedBreakIterator::next(void) {
    515     // if we have cached break positions and we're still in the range
    516     // covered by them, just move one step forward in the cache
    517     if (fCachedBreakPositions != NULL) {
    518         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    519             ++fPositionInCache;
    520             int32_t pos = fCachedBreakPositions[fPositionInCache];
    521             utext_setNativeIndex(fText, pos);
    522             return pos;
    523         }
    524         else {
    525             reset();
    526         }
    527     }
    528 
    529     int32_t startPos = current();
    530     int32_t result = handleNext(fData->fForwardTable);
    531     if (fDictionaryCharCount > 0) {
    532         result = checkDictionary(startPos, result, FALSE);
    533     }
    534     return result;
    535 }
    536 
    537 /**
    538  * Advances the iterator backwards, to the last boundary preceding this one.
    539  * @return The position of the last boundary position preceding this one.
    540  */
    541 int32_t RuleBasedBreakIterator::previous(void) {
    542     int32_t result;
    543     int32_t startPos;
    544 
    545     // if we have cached break positions and we're still in the range
    546     // covered by them, just move one step backward in the cache
    547     if (fCachedBreakPositions != NULL) {
    548         if (fPositionInCache > 0) {
    549             --fPositionInCache;
    550             // If we're at the beginning of the cache, need to reevaluate the
    551             // rule status
    552             if (fPositionInCache <= 0) {
    553                 fLastStatusIndexValid = FALSE;
    554             }
    555             int32_t pos = fCachedBreakPositions[fPositionInCache];
    556             utext_setNativeIndex(fText, pos);
    557             return pos;
    558         }
    559         else {
    560             reset();
    561         }
    562     }
    563 
    564     // if we're already sitting at the beginning of the text, return DONE
    565     if (fText == NULL || (startPos = current()) == 0) {
    566         fLastRuleStatusIndex  = 0;
    567         fLastStatusIndexValid = TRUE;
    568         return BreakIterator::DONE;
    569     }
    570 
    571     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    572         result = handlePrevious(fData->fReverseTable);
    573         if (fDictionaryCharCount > 0) {
    574             result = checkDictionary(result, startPos, TRUE);
    575         }
    576         return result;
    577     }
    578 
    579     // old rule syntax
    580     // set things up.  handlePrevious() will back us up to some valid
    581     // break position before the current position (we back our internal
    582     // iterator up one step to prevent handlePrevious() from returning
    583     // the current position), but not necessarily the last one before
    584 
    585     // where we started
    586 
    587     int32_t start = current();
    588 
    589     UTEXT_PREVIOUS32(fText);
    590     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    591     if (lastResult == UBRK_DONE) {
    592         lastResult = 0;
    593         utext_setNativeIndex(fText, 0);
    594     }
    595     result = lastResult;
    596     int32_t lastTag       = 0;
    597     UBool   breakTagValid = FALSE;
    598 
    599     // iterate forward from the known break position until we pass our
    600     // starting point.  The last break position before the starting
    601     // point is our return value
    602 
    603     for (;;) {
    604         result         = next();
    605         if (result == BreakIterator::DONE || result >= start) {
    606             break;
    607         }
    608         lastResult     = result;
    609         lastTag        = fLastRuleStatusIndex;
    610         breakTagValid  = TRUE;
    611     }
    612 
    613     // fLastBreakTag wants to have the value for section of text preceding
    614     // the result position that we are to return (in lastResult.)  If
    615     // the backwards rules overshot and the above loop had to do two or more
    616     // next()s to move up to the desired return position, we will have a valid
    617     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    618     // we wont have a tag value for that position, which is only set by handleNext().
    619 
    620     // set the current iteration position to be the last break position
    621     // before where we started, and then return that value
    622     utext_setNativeIndex(fText, lastResult);
    623     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    624     fLastStatusIndexValid = breakTagValid;
    625 
    626     // No need to check the dictionary; it will have been handled by
    627     // next()
    628 
    629     return lastResult;
    630 }
    631 
    632 /**
    633  * Sets the iterator to refer to the first boundary position following
    634  * the specified position.
    635  * @offset The position from which to begin searching for a break position.
    636  * @return The position of the first break after the current position.
    637  */
    638 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    639     // if we have cached break positions and offset is in the range
    640     // covered by them, use them
    641     // TODO: could use binary search
    642     // TODO: what if offset is outside range, but break is not?
    643     if (fCachedBreakPositions != NULL) {
    644         if (offset >= fCachedBreakPositions[0]
    645                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    646             fPositionInCache = 0;
    647             // We are guaranteed not to leave the array due to range test above
    648             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    649                 ++fPositionInCache;
    650             }
    651             int32_t pos = fCachedBreakPositions[fPositionInCache];
    652             utext_setNativeIndex(fText, pos);
    653             return pos;
    654         }
    655         else {
    656             reset();
    657         }
    658     }
    659 
    660     // if the offset passed in is already past the end of the text,
    661     // just return DONE; if it's before the beginning, return the
    662     // text's starting offset
    663     fLastRuleStatusIndex  = 0;
    664     fLastStatusIndexValid = TRUE;
    665     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    666         last();
    667         return next();
    668     }
    669     else if (offset < 0) {
    670         return first();
    671     }
    672 
    673     // otherwise, set our internal iteration position (temporarily)
    674     // to the position passed in.  If this is the _beginning_ position,
    675     // then we can just use next() to get our return value
    676 
    677     int32_t result = 0;
    678 
    679     if (fData->fSafeRevTable != NULL) {
    680         // new rule syntax
    681         utext_setNativeIndex(fText, offset);
    682         // move forward one codepoint to prepare for moving back to a
    683         // safe point.
    684         // this handles offset being between a supplementary character
    685         UTEXT_NEXT32(fText);
    686         // handlePrevious will move most of the time to < 1 boundary away
    687         handlePrevious(fData->fSafeRevTable);
    688         int32_t result = next();
    689         while (result <= offset) {
    690             result = next();
    691         }
    692         return result;
    693     }
    694     if (fData->fSafeFwdTable != NULL) {
    695         // backup plan if forward safe table is not available
    696         utext_setNativeIndex(fText, offset);
    697         UTEXT_PREVIOUS32(fText);
    698         // handle next will give result >= offset
    699         handleNext(fData->fSafeFwdTable);
    700         // previous will give result 0 or 1 boundary away from offset,
    701         // most of the time
    702         // we have to
    703         int32_t oldresult = previous();
    704         while (oldresult > offset) {
    705             int32_t result = previous();
    706             if (result <= offset) {
    707                 return oldresult;
    708             }
    709             oldresult = result;
    710         }
    711         int32_t result = next();
    712         if (result <= offset) {
    713             return next();
    714         }
    715         return result;
    716     }
    717     // otherwise, we have to sync up first.  Use handlePrevious() to back
    718     // up to a known break position before the specified position (if
    719     // we can determine that the specified position is a break position,
    720     // we don't back up at all).  This may or may not be the last break
    721     // position at or before our starting position.  Advance forward
    722     // from here until we've passed the starting position.  The position
    723     // we stop on will be the first break position after the specified one.
    724     // old rule syntax
    725 
    726     utext_setNativeIndex(fText, offset);
    727     if (offset==0 ||
    728         offset==1  && utext_getNativeIndex(fText)==0) {
    729         return next();
    730     }
    731     result = previous();
    732 
    733     while (result != BreakIterator::DONE && result <= offset) {
    734         result = next();
    735     }
    736 
    737     return result;
    738 }
    739 
    740 /**
    741  * Sets the iterator to refer to the last boundary position before the
    742  * specified position.
    743  * @offset The position to begin searching for a break from.
    744  * @return The position of the last boundary before the starting position.
    745  */
    746 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    747     // if we have cached break positions and offset is in the range
    748     // covered by them, use them
    749     if (fCachedBreakPositions != NULL) {
    750         // TODO: binary search?
    751         // TODO: What if offset is outside range, but break is not?
    752         if (offset > fCachedBreakPositions[0]
    753                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    754             fPositionInCache = 0;
    755             while (fPositionInCache < fNumCachedBreakPositions
    756                    && offset > fCachedBreakPositions[fPositionInCache])
    757                 ++fPositionInCache;
    758             --fPositionInCache;
    759             // If we're at the beginning of the cache, need to reevaluate the
    760             // rule status
    761             if (fPositionInCache <= 0) {
    762                 fLastStatusIndexValid = FALSE;
    763             }
    764             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    765             return fCachedBreakPositions[fPositionInCache];
    766         }
    767         else {
    768             reset();
    769         }
    770     }
    771 
    772     // if the offset passed in is already past the end of the text,
    773     // just return DONE; if it's before the beginning, return the
    774     // text's starting offset
    775     if (fText == NULL || offset > utext_nativeLength(fText)) {
    776         // return BreakIterator::DONE;
    777         return last();
    778     }
    779     else if (offset < 0) {
    780         return first();
    781     }
    782 
    783     // if we start by updating the current iteration position to the
    784     // position specified by the caller, we can just use previous()
    785     // to carry out this operation
    786 
    787     if (fData->fSafeFwdTable != NULL) {
    788         // new rule syntax
    789         utext_setNativeIndex(fText, offset);
    790         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    791         if (newOffset != offset) {
    792             // Will come here if specified offset was not a code point boundary AND
    793             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    794             //   indices to the containing code point.
    795             // For breakitereator::preceding only, these non-code-point indices need to be moved
    796             //   up to refer to the following codepoint.
    797             UTEXT_NEXT32(fText);
    798             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    799         }
    800 
    801         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    802         //        rather than adjusting the position unconditionally?
    803         //        (Change would interact with safe rules.)
    804         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    805         //        affects only preceding(), seems cleaner, but is slightly different.
    806         UTEXT_PREVIOUS32(fText);
    807         handleNext(fData->fSafeFwdTable);
    808         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    809         while (result >= offset) {
    810             result = previous();
    811         }
    812         return result;
    813     }
    814     if (fData->fSafeRevTable != NULL) {
    815         // backup plan if forward safe table is not available
    816         //  TODO:  check whether this path can be discarded
    817         //         It's probably OK to say that rules must supply both safe tables
    818         //            if they use safe tables at all.  We have certainly never described
    819         //            to anyone how to work with just one safe table.
    820         utext_setNativeIndex(fText, offset);
    821         UTEXT_NEXT32(fText);
    822 
    823         // handle previous will give result <= offset
    824         handlePrevious(fData->fSafeRevTable);
    825 
    826         // next will give result 0 or 1 boundary away from offset,
    827         // most of the time
    828         // we have to
    829         int32_t oldresult = next();
    830         while (oldresult < offset) {
    831             int32_t result = next();
    832             if (result >= offset) {
    833                 return oldresult;
    834             }
    835             oldresult = result;
    836         }
    837         int32_t result = previous();
    838         if (result >= offset) {
    839             return previous();
    840         }
    841         return result;
    842     }
    843 
    844     // old rule syntax
    845     utext_setNativeIndex(fText, offset);
    846     return previous();
    847 }
    848 
    849 /**
    850  * Returns true if the specfied position is a boundary position.  As a side
    851  * effect, leaves the iterator pointing to the first boundary position at
    852  * or after "offset".
    853  * @param offset the offset to check.
    854  * @return True if "offset" is a boundary position.
    855  */
    856 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    857     // the beginning index of the iterator is always a boundary position by definition
    858     if (offset == 0) {
    859         first();       // For side effects on current position, tag values.
    860         return TRUE;
    861     }
    862 
    863     if (offset == (int32_t)utext_nativeLength(fText)) {
    864         last();       // For side effects on current position, tag values.
    865         return TRUE;
    866     }
    867 
    868     // out-of-range indexes are never boundary positions
    869     if (offset < 0) {
    870         first();       // For side effects on current position, tag values.
    871         return FALSE;
    872     }
    873 
    874     if (offset > utext_nativeLength(fText)) {
    875         last();        // For side effects on current position, tag values.
    876         return FALSE;
    877     }
    878 
    879     // otherwise, we can use following() on the position before the specified
    880     // one and return true if the position we get back is the one the user
    881     // specified
    882     utext_previous32From(fText, offset);
    883     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    884     UBool    result  = following(backOne) == offset;
    885     return result;
    886 }
    887 
    888 /**
    889  * Returns the current iteration position.
    890  * @return The current iteration position.
    891  */
    892 int32_t RuleBasedBreakIterator::current(void) const {
    893     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    894     return pos;
    895 }
    896 
    897 //=======================================================================
    898 // implementation
    899 //=======================================================================
    900 
    901 //
    902 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    903 //                 of user text.  A variable with this enum type keeps track of where we
    904 //                 are.  The state machine only fetches user input while in the RUN mode.
    905 //
    906 enum RBBIRunMode {
    907     RBBI_START,     // state machine processing is before first char of input
    908     RBBI_RUN,       // state machine processing is in the user text
    909     RBBI_END        // state machine processing is after end of user text.
    910 };
    911 
    912 
    913 //-----------------------------------------------------------------------------------
    914 //
    915 //  handleNext(stateTable)
    916 //     This method is the actual implementation of the rbbi next() method.
    917 //     This method initializes the state machine to state 1
    918 //     and advances through the text character by character until we reach the end
    919 //     of the text or the state machine transitions to state 0.  We update our return
    920 //     value every time the state machine passes through an accepting state.
    921 //
    922 //-----------------------------------------------------------------------------------
    923 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    924     int32_t             state;
    925     int16_t             category        = 0;
    926     RBBIRunMode         mode;
    927 
    928     RBBIStateTableRow  *row;
    929     UChar32             c;
    930     int32_t             lookaheadStatus = 0;
    931     int32_t             lookaheadTagIdx = 0;
    932     int32_t             result          = 0;
    933     int32_t             initialPosition = 0;
    934     int32_t             lookaheadResult = 0;
    935     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    936     const char         *tableData       = statetable->fTableData;
    937     uint32_t            tableRowLen     = statetable->fRowLen;
    938 
    939     #ifdef RBBI_DEBUG
    940         if (fTrace) {
    941             RBBIDebugPuts("Handle Next   pos   char  state category");
    942         }
    943     #endif
    944 
    945     // No matter what, handleNext alway correctly sets the break tag value.
    946     fLastStatusIndexValid = TRUE;
    947     fLastRuleStatusIndex = 0;
    948 
    949     // if we're already at the end of the text, return DONE.
    950     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    951     result          = initialPosition;
    952     c               = UTEXT_NEXT32(fText);
    953     if (fData == NULL || c==U_SENTINEL) {
    954         return BreakIterator::DONE;
    955     }
    956 
    957     //  Set the initial state for the state machine
    958     state = START_STATE;
    959     row = (RBBIStateTableRow *)
    960             //(statetable->fTableData + (statetable->fRowLen * state));
    961             (tableData + tableRowLen * state);
    962 
    963 
    964     mode     = RBBI_RUN;
    965     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    966         category = 2;
    967         mode     = RBBI_START;
    968     }
    969 
    970 
    971     // loop until we reach the end of the text or transition to state 0
    972     //
    973     for (;;) {
    974         if (c == U_SENTINEL) {
    975             // Reached end of input string.
    976             if (mode == RBBI_END) {
    977                 // We have already run the loop one last time with the
    978                 //   character set to the psueudo {eof} value.  Now it is time
    979                 //   to unconditionally bail out.
    980                 if (lookaheadResult > result) {
    981                     // We ran off the end of the string with a pending look-ahead match.
    982                     // Treat this as if the look-ahead condition had been met, and return
    983                     //  the match at the / position from the look-ahead rule.
    984                     result               = lookaheadResult;
    985                     fLastRuleStatusIndex = lookaheadTagIdx;
    986                     lookaheadStatus = 0;
    987                 }
    988                 break;
    989             }
    990             // Run the loop one last time with the fake end-of-input character category.
    991             mode = RBBI_END;
    992             category = 1;
    993         }
    994 
    995         //
    996         // Get the char category.  An incoming category of 1 or 2 means that
    997         //      we are preset for doing the beginning or end of input, and
    998         //      that we shouldn't get a category from an actual text input character.
    999         //
   1000         if (mode == RBBI_RUN) {
   1001             // look up the current character's character category, which tells us
   1002             // which column in the state table to look at.
   1003             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1004             //        not the size of the character going in, which is a UChar32.
   1005             //
   1006             UTRIE_GET16(&fData->fTrie, c, category);
   1007 
   1008             // Check the dictionary bit in the character's category.
   1009             //    Counter is only used by dictionary based iterators (subclasses).
   1010             //    Chars that need to be handled by a dictionary have a flag bit set
   1011             //    in their category values.
   1012             //
   1013             if ((category & 0x4000) != 0)  {
   1014                 fDictionaryCharCount++;
   1015                 //  And off the dictionary flag bit.
   1016                 category &= ~0x4000;
   1017             }
   1018         }
   1019 
   1020         #ifdef RBBI_DEBUG
   1021             if (fTrace) {
   1022                 RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
   1023                 if (0x20<=c && c<0x7f) {
   1024                     RBBIDebugPrintf("\"%c\"  ", c);
   1025                 } else {
   1026                     RBBIDebugPrintf("%5x  ", c);
   1027                 }
   1028                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1029             }
   1030         #endif
   1031 
   1032         // State Transition - move machine to its next state
   1033         //
   1034         state = row->fNextState[category];
   1035         row = (RBBIStateTableRow *)
   1036             // (statetable->fTableData + (statetable->fRowLen * state));
   1037             (tableData + tableRowLen * state);
   1038 
   1039 
   1040         if (row->fAccepting == -1) {
   1041             // Match found, common case.
   1042             if (mode != RBBI_START) {
   1043                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1044             }
   1045             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1046         }
   1047 
   1048         if (row->fLookAhead != 0) {
   1049             if (lookaheadStatus != 0
   1050                 && row->fAccepting == lookaheadStatus) {
   1051                 // Lookahead match is completed.
   1052                 result               = lookaheadResult;
   1053                 fLastRuleStatusIndex = lookaheadTagIdx;
   1054                 lookaheadStatus      = 0;
   1055                 // TODO:  make a standalone hard break in a rule work.
   1056                 if (lookAheadHardBreak) {
   1057                     UTEXT_SETNATIVEINDEX(fText, result);
   1058                     return result;
   1059                 }
   1060                 // Look-ahead completed, but other rules may match further.  Continue on
   1061                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1062                 goto continueOn;
   1063             }
   1064 
   1065             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1066             lookaheadResult = r;
   1067             lookaheadStatus = row->fLookAhead;
   1068             lookaheadTagIdx = row->fTagIdx;
   1069             goto continueOn;
   1070         }
   1071 
   1072 
   1073         if (row->fAccepting != 0) {
   1074             // Because this is an accepting state, any in-progress look-ahead match
   1075             //   is no longer relavant.  Clear out the pending lookahead status.
   1076             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1077         }
   1078 
   1079 continueOn:
   1080         if (state == STOP_STATE) {
   1081             // This is the normal exit from the lookup state machine.
   1082             // We have advanced through the string until it is certain that no
   1083             //   longer match is possible, no matter what characters follow.
   1084             break;
   1085         }
   1086 
   1087         // Advance to the next character.
   1088         // If this is a beginning-of-input loop iteration, don't advance
   1089         //    the input position.  The next iteration will be processing the
   1090         //    first real input character.
   1091         if (mode == RBBI_RUN) {
   1092             c = UTEXT_NEXT32(fText);
   1093         } else {
   1094             if (mode == RBBI_START) {
   1095                 mode = RBBI_RUN;
   1096             }
   1097         }
   1098 
   1099 
   1100     }
   1101 
   1102     // The state machine is done.  Check whether it found a match...
   1103 
   1104     // If the iterator failed to advance in the match engine, force it ahead by one.
   1105     //   (This really indicates a defect in the break rules.  They should always match
   1106     //    at least one character.)
   1107     if (result == initialPosition) {
   1108         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1109         UTEXT_NEXT32(fText);
   1110         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1111     }
   1112 
   1113     // Leave the iterator at our result position.
   1114     UTEXT_SETNATIVEINDEX(fText, result);
   1115     #ifdef RBBI_DEBUG
   1116         if (fTrace) {
   1117             RBBIDebugPrintf("result = %d\n\n", result);
   1118         }
   1119     #endif
   1120     return result;
   1121 }
   1122 
   1123 
   1124 
   1125 //-----------------------------------------------------------------------------------
   1126 //
   1127 //  handlePrevious()
   1128 //
   1129 //      Iterate backwards, according to the logic of the reverse rules.
   1130 //      This version handles the exact style backwards rules.
   1131 //
   1132 //      The logic of this function is very similar to handleNext(), above.
   1133 //
   1134 //-----------------------------------------------------------------------------------
   1135 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1136     int32_t             state;
   1137     int16_t             category        = 0;
   1138     RBBIRunMode         mode;
   1139     RBBIStateTableRow  *row;
   1140     UChar32             c;
   1141     int32_t             lookaheadStatus = 0;
   1142     int32_t             result          = 0;
   1143     int32_t             initialPosition = 0;
   1144     int32_t             lookaheadResult = 0;
   1145     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1146 
   1147     #ifdef RBBI_DEBUG
   1148         if (fTrace) {
   1149             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1150         }
   1151     #endif
   1152 
   1153     // handlePrevious() never gets the rule status.
   1154     // Flag the status as invalid; if the user ever asks for status, we will need
   1155     // to back up, then re-find the break position using handleNext(), which does
   1156     // get the status value.
   1157     fLastStatusIndexValid = FALSE;
   1158     fLastRuleStatusIndex = 0;
   1159 
   1160     // if we're already at the start of the text, return DONE.
   1161     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1162         return BreakIterator::DONE;
   1163     }
   1164 
   1165     //  Set up the starting char.
   1166     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1167     result          = initialPosition;
   1168     c               = UTEXT_PREVIOUS32(fText);
   1169 
   1170     //  Set the initial state for the state machine
   1171     state = START_STATE;
   1172     row = (RBBIStateTableRow *)
   1173             (statetable->fTableData + (statetable->fRowLen * state));
   1174     category = 3;
   1175     mode     = RBBI_RUN;
   1176     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1177         category = 2;
   1178         mode     = RBBI_START;
   1179     }
   1180 
   1181 
   1182     // loop until we reach the start of the text or transition to state 0
   1183     //
   1184     for (;;) {
   1185         if (c == U_SENTINEL) {
   1186             // Reached end of input string.
   1187             if (mode == RBBI_END ||
   1188                 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
   1189                 // We have already run the loop one last time with the
   1190                 //   character set to the psueudo {eof} value.  Now it is time
   1191                 //   to unconditionally bail out.
   1192                 //  (Or we have an old format binary rule file that does not support {eof}.)
   1193                 if (lookaheadResult < result) {
   1194                     // We ran off the end of the string with a pending look-ahead match.
   1195                     // Treat this as if the look-ahead condition had been met, and return
   1196                     //  the match at the / position from the look-ahead rule.
   1197                     result               = lookaheadResult;
   1198                     lookaheadStatus = 0;
   1199                 } else if (result == initialPosition) {
   1200                     // Ran off start, no match found.
   1201                     // move one index one (towards the start, since we are doing a previous())
   1202                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1203                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1204                 }
   1205                 break;
   1206             }
   1207             // Run the loop one last time with the fake end-of-input character category.
   1208             mode = RBBI_END;
   1209             category = 1;
   1210         }
   1211 
   1212         //
   1213         // Get the char category.  An incoming category of 1 or 2 means that
   1214         //      we are preset for doing the beginning or end of input, and
   1215         //      that we shouldn't get a category from an actual text input character.
   1216         //
   1217         if (mode == RBBI_RUN) {
   1218             // look up the current character's character category, which tells us
   1219             // which column in the state table to look at.
   1220             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1221             //        not the size of the character going in, which is a UChar32.
   1222             //
   1223             UTRIE_GET16(&fData->fTrie, c, category);
   1224 
   1225             // Check the dictionary bit in the character's category.
   1226             //    Counter is only used by dictionary based iterators (subclasses).
   1227             //    Chars that need to be handled by a dictionary have a flag bit set
   1228             //    in their category values.
   1229             //
   1230             if ((category & 0x4000) != 0)  {
   1231                 fDictionaryCharCount++;
   1232                 //  And off the dictionary flag bit.
   1233                 category &= ~0x4000;
   1234             }
   1235         }
   1236 
   1237         #ifdef RBBI_DEBUG
   1238             if (fTrace) {
   1239                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1240                 if (0x20<=c && c<0x7f) {
   1241                     RBBIDebugPrintf("\"%c\"  ", c);
   1242                 } else {
   1243                     RBBIDebugPrintf("%5x  ", c);
   1244                 }
   1245                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1246             }
   1247         #endif
   1248 
   1249         // State Transition - move machine to its next state
   1250         //
   1251         state = row->fNextState[category];
   1252         row = (RBBIStateTableRow *)
   1253             (statetable->fTableData + (statetable->fRowLen * state));
   1254 
   1255         if (row->fAccepting == -1) {
   1256             // Match found, common case.
   1257             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1258         }
   1259 
   1260         if (row->fLookAhead != 0) {
   1261             if (lookaheadStatus != 0
   1262                 && row->fAccepting == lookaheadStatus) {
   1263                 // Lookahead match is completed.
   1264                 result               = lookaheadResult;
   1265                 lookaheadStatus      = 0;
   1266                 // TODO:  make a standalone hard break in a rule work.
   1267                 if (lookAheadHardBreak) {
   1268                     UTEXT_SETNATIVEINDEX(fText, result);
   1269                     return result;
   1270                 }
   1271                 // Look-ahead completed, but other rules may match further.  Continue on
   1272                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1273                 goto continueOn;
   1274             }
   1275 
   1276             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1277             lookaheadResult = r;
   1278             lookaheadStatus = row->fLookAhead;
   1279             goto continueOn;
   1280         }
   1281 
   1282 
   1283         if (row->fAccepting != 0) {
   1284             // Because this is an accepting state, any in-progress look-ahead match
   1285             //   is no longer relavant.  Clear out the pending lookahead status.
   1286             lookaheadStatus = 0;
   1287         }
   1288 
   1289 continueOn:
   1290         if (state == STOP_STATE) {
   1291             // This is the normal exit from the lookup state machine.
   1292             // We have advanced through the string until it is certain that no
   1293             //   longer match is possible, no matter what characters follow.
   1294             break;
   1295         }
   1296 
   1297         // Move (backwards) to the next character to process.
   1298         // If this is a beginning-of-input loop iteration, don't advance
   1299         //    the input position.  The next iteration will be processing the
   1300         //    first real input character.
   1301         if (mode == RBBI_RUN) {
   1302             c = UTEXT_PREVIOUS32(fText);
   1303         } else {
   1304             if (mode == RBBI_START) {
   1305                 mode = RBBI_RUN;
   1306             }
   1307         }
   1308     }
   1309 
   1310     // The state machine is done.  Check whether it found a match...
   1311 
   1312     // If the iterator failed to advance in the match engine, force it ahead by one.
   1313     //   (This really indicates a defect in the break rules.  They should always match
   1314     //    at least one character.)
   1315     if (result == initialPosition) {
   1316         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1317         UTEXT_PREVIOUS32(fText);
   1318         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1319     }
   1320 
   1321     // Leave the iterator at our result position.
   1322     UTEXT_SETNATIVEINDEX(fText, result);
   1323     #ifdef RBBI_DEBUG
   1324         if (fTrace) {
   1325             RBBIDebugPrintf("result = %d\n\n", result);
   1326         }
   1327     #endif
   1328     return result;
   1329 }
   1330 
   1331 
   1332 void
   1333 RuleBasedBreakIterator::reset()
   1334 {
   1335     if (fCachedBreakPositions) {
   1336         uprv_free(fCachedBreakPositions);
   1337     }
   1338     fCachedBreakPositions = NULL;
   1339     fNumCachedBreakPositions = 0;
   1340     fDictionaryCharCount = 0;
   1341     fPositionInCache = 0;
   1342 }
   1343 
   1344 
   1345 
   1346 //-------------------------------------------------------------------------------
   1347 //
   1348 //   getRuleStatus()   Return the break rule tag associated with the current
   1349 //                     iterator position.  If the iterator arrived at its current
   1350 //                     position by iterating forwards, the value will have been
   1351 //                     cached by the handleNext() function.
   1352 //
   1353 //                     If no cached status value is available, the status is
   1354 //                     found by doing a previous() followed by a next(), which
   1355 //                     leaves the iterator where it started, and computes the
   1356 //                     status while doing the next().
   1357 //
   1358 //-------------------------------------------------------------------------------
   1359 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1360     if (fLastStatusIndexValid == FALSE) {
   1361         //  No cached status is available.
   1362         if (fText == NULL || current() == 0) {
   1363             //  At start of text, or there is no text.  Status is always zero.
   1364             fLastRuleStatusIndex = 0;
   1365             fLastStatusIndexValid = TRUE;
   1366         } else {
   1367             //  Not at start of text.  Find status the tedious way.
   1368             int32_t pa = current();
   1369             previous();
   1370             if (fNumCachedBreakPositions > 0) {
   1371                 reset();                // Blow off the dictionary cache
   1372             }
   1373             int32_t pb = next();
   1374             if (pa != pb) {
   1375                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1376                 //       unused local variables on gcc.  Logically, it isn't needed.
   1377                 U_ASSERT(pa == pb);
   1378             }
   1379         }
   1380     }
   1381     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1382 }
   1383 
   1384 
   1385 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1386     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1387     nonConstThis->makeRuleStatusValid();
   1388 
   1389     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1390     //                                                 (the number of status values.)
   1391     //   This function returns the last (largest) of the array of status values.
   1392     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1393     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1394 
   1395     return tagVal;
   1396 }
   1397 
   1398 
   1399 
   1400 
   1401 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1402              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1403 {
   1404     if (U_FAILURE(status)) {
   1405         return 0;
   1406     }
   1407 
   1408     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1409     nonConstThis->makeRuleStatusValid();
   1410     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1411     int32_t  numValsToCopy = numVals;
   1412     if (numVals > capacity) {
   1413         status = U_BUFFER_OVERFLOW_ERROR;
   1414         numValsToCopy = capacity;
   1415     }
   1416     int i;
   1417     for (i=0; i<numValsToCopy; i++) {
   1418         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1419     }
   1420     return numVals;
   1421 }
   1422 
   1423 
   1424 
   1425 //-------------------------------------------------------------------------------
   1426 //
   1427 //   getBinaryRules        Access to the compiled form of the rules,
   1428 //                         for use by build system tools that save the data
   1429 //                         for standard iterator types.
   1430 //
   1431 //-------------------------------------------------------------------------------
   1432 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1433     const uint8_t  *retPtr = NULL;
   1434     length = 0;
   1435 
   1436     if (fData != NULL) {
   1437         retPtr = (const uint8_t *)fData->fHeader;
   1438         length = fData->fHeader->fLength;
   1439     }
   1440     return retPtr;
   1441 }
   1442 
   1443 
   1444 
   1445 
   1446 //-------------------------------------------------------------------------------
   1447 //
   1448 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1449 //                    Saving one heap allocation isn't worth the trouble.
   1450 //                    Cloning shouldn't be done in tight loops, and
   1451 //                    making the clone copy involves other heap operations anyway.
   1452 //                    And the application code for correctly dealing with buffer
   1453 //                    size problems and the eventual object destruction is ugly.
   1454 //
   1455 //-------------------------------------------------------------------------------
   1456 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1457                                    int32_t &bufferSize,
   1458                                    UErrorCode &status)
   1459 {
   1460     if (U_FAILURE(status)){
   1461         return NULL;
   1462     }
   1463 
   1464     //
   1465     //  If user buffer size is zero this is a preflight operation to
   1466     //    obtain the needed buffer size, allowing for worst case misalignment.
   1467     //
   1468     if (bufferSize == 0) {
   1469         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1470         return NULL;
   1471     }
   1472 
   1473 
   1474     //
   1475     //  Check the alignment and size of the user supplied buffer.
   1476     //  Allocate heap memory if the user supplied memory is insufficient.
   1477     //
   1478     char    *buf   = (char *)stackBuffer;
   1479     uint32_t s      = bufferSize;
   1480 
   1481     if (stackBuffer == NULL) {
   1482         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1483     }
   1484     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1485         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1486         s   -= offsetUp;
   1487         buf += offsetUp;
   1488     }
   1489     if (s < sizeof(RuleBasedBreakIterator)) {
   1490         // Not enough room in the caller-supplied buffer.
   1491         // Do a plain-vanilla heap based clone and return that, along with
   1492         //   a warning that the clone was allocated.
   1493         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1494         if (clonedBI == 0) {
   1495             status = U_MEMORY_ALLOCATION_ERROR;
   1496         } else {
   1497             status = U_SAFECLONE_ALLOCATED_WARNING;
   1498         }
   1499         return clonedBI;
   1500     }
   1501 
   1502     //
   1503     //  Clone the source BI into the caller-supplied buffer.
   1504     //    TODO:  using an overloaded operator new to directly initialize the
   1505     //           copy in the user's buffer would be better, but it doesn't seem
   1506     //           to get along with namespaces.  Investigate why.
   1507     //
   1508     //           The memcpy is only safe with an empty (default constructed)
   1509     //           break iterator.  Use on others can screw up reference counts
   1510     //           to data.  memcpy-ing objects is not really a good idea...
   1511     //
   1512     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
   1513     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
   1514     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
   1515     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
   1516     *clone = *this;               // clone = the real BI we want.
   1517     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1518 
   1519     return clone;
   1520 }
   1521 
   1522 
   1523 //-------------------------------------------------------------------------------
   1524 //
   1525 //  isDictionaryChar      Return true if the category lookup for this char
   1526 //                        indicates that it is in the set of dictionary lookup
   1527 //                        chars.
   1528 //
   1529 //                        This function is intended for use by dictionary based
   1530 //                        break iterators.
   1531 //
   1532 //-------------------------------------------------------------------------------
   1533 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1534     if (fData == NULL) {
   1535         return FALSE;
   1536     }
   1537     uint16_t category;
   1538     UTRIE_GET16(&fData->fTrie, c, category);
   1539     return (category & 0x4000) != 0;
   1540 }*/
   1541 
   1542 
   1543 //-------------------------------------------------------------------------------
   1544 //
   1545 //  checkDictionary       This function handles all processing of characters in
   1546 //                        the "dictionary" set. It will determine the appropriate
   1547 //                        course of action, and possibly set up a cache in the
   1548 //                        process.
   1549 //
   1550 //-------------------------------------------------------------------------------
   1551 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1552                             int32_t endPos,
   1553                             UBool reverse) {
   1554     // Reset the old break cache first.
   1555     uint32_t dictionaryCount = fDictionaryCharCount;
   1556     reset();
   1557 
   1558     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
   1559         return (reverse ? startPos : endPos);
   1560     }
   1561 
   1562     // Starting from the starting point, scan towards the proposed result,
   1563     // looking for the first dictionary character (which may be the one
   1564     // we're on, if we're starting in the middle of a range).
   1565     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1566     if (reverse) {
   1567         UTEXT_PREVIOUS32(fText);
   1568     }
   1569 
   1570     int32_t rangeStart = startPos;
   1571     int32_t rangeEnd = endPos;
   1572 
   1573     uint16_t    category;
   1574     int32_t     current;
   1575     UErrorCode  status = U_ZERO_ERROR;
   1576     UStack      breaks(status);
   1577     int32_t     foundBreakCount = 0;
   1578     UChar32     c = utext_current32(fText);
   1579 
   1580     UTRIE_GET16(&fData->fTrie, c, category);
   1581 
   1582     // Is the character we're starting on a dictionary character? If so, we
   1583     // need to back up to include the entire run; otherwise the results of
   1584     // the break algorithm will differ depending on where we start. Since
   1585     // the result is cached and there is typically a non-dictionary break
   1586     // within a small number of words, there should be little performance impact.
   1587     if (category & 0x4000) {
   1588         if (reverse) {
   1589             do {
   1590                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1591                 c = utext_current32(fText);
   1592                 UTRIE_GET16(&fData->fTrie, c, category);
   1593             } while (c != U_SENTINEL && (category & 0x4000));
   1594             // Back up to the last dictionary character
   1595             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1596             if (c == U_SENTINEL) {
   1597                 // c = fText->last32();
   1598                 //   TODO:  why was this if needed?
   1599                 c = UTEXT_PREVIOUS32(fText);
   1600             }
   1601             else {
   1602                 c = UTEXT_PREVIOUS32(fText);
   1603             }
   1604         }
   1605         else {
   1606             do {
   1607                 c = UTEXT_PREVIOUS32(fText);
   1608                 UTRIE_GET16(&fData->fTrie, c, category);
   1609             }
   1610             while (c != U_SENTINEL && (category & 0x4000));
   1611             // Back up to the last dictionary character
   1612             if (c == U_SENTINEL) {
   1613                 // c = fText->first32();
   1614                 c = utext_current32(fText);
   1615             }
   1616             else {
   1617                 utext_next32(fText);
   1618                 c = utext_current32(fText);
   1619             }
   1620             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1621         }
   1622         UTRIE_GET16(&fData->fTrie, c, category);
   1623     }
   1624 
   1625     // Loop through the text, looking for ranges of dictionary characters.
   1626     // For each span, find the appropriate break engine, and ask it to find
   1627     // any breaks within the span.
   1628     // Note: we always do this in the forward direction, so that the break
   1629     // cache is built in the right order.
   1630     if (reverse) {
   1631         utext_setNativeIndex(fText, rangeStart);
   1632         c = utext_current32(fText);
   1633         UTRIE_GET16(&fData->fTrie, c, category);
   1634     }
   1635     while(U_SUCCESS(status)) {
   1636         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1637             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1638             c = utext_current32(fText);
   1639             UTRIE_GET16(&fData->fTrie, c, category);
   1640         }
   1641         if (current >= rangeEnd) {
   1642             break;
   1643         }
   1644 
   1645         // We now have a dictionary character. Get the appropriate language object
   1646         // to deal with it.
   1647         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1648 
   1649         // Ask the language object if there are any breaks. It will leave the text
   1650         // pointer on the other side of its range, ready to search for the next one.
   1651         if (lbe != NULL) {
   1652             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1653         }
   1654 
   1655         // Reload the loop variables for the next go-round
   1656         c = utext_current32(fText);
   1657         UTRIE_GET16(&fData->fTrie, c, category);
   1658     }
   1659 
   1660     // If we found breaks, build a new break cache. The first and last entries must
   1661     // be the original starting and ending position.
   1662     if (foundBreakCount > 0) {
   1663         int32_t totalBreaks = foundBreakCount;
   1664         if (startPos < breaks.elementAti(0)) {
   1665             totalBreaks += 1;
   1666         }
   1667         if (endPos > breaks.peeki()) {
   1668             totalBreaks += 1;
   1669         }
   1670         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1671         if (fCachedBreakPositions != NULL) {
   1672             int32_t out = 0;
   1673             fNumCachedBreakPositions = totalBreaks;
   1674             if (startPos < breaks.elementAti(0)) {
   1675                 fCachedBreakPositions[out++] = startPos;
   1676             }
   1677             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1678                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1679             }
   1680             if (endPos > fCachedBreakPositions[out-1]) {
   1681                 fCachedBreakPositions[out] = endPos;
   1682             }
   1683             // If there are breaks, then by definition, we are replacing the original
   1684             // proposed break by one of the breaks we found. Use following() and
   1685             // preceding() to do the work. They should never recurse in this case.
   1686             if (reverse) {
   1687                 return preceding(endPos - 1);
   1688             }
   1689             else {
   1690                 return following(startPos);
   1691             }
   1692         }
   1693         // If the allocation failed, just fall through to the "no breaks found" case.
   1694     }
   1695 
   1696     // If we get here, there were no language-based breaks. Set the text pointer
   1697     // to the original proposed break.
   1698     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1699     return (reverse ? startPos : endPos);
   1700 }
   1701 
   1702 U_NAMESPACE_END
   1703 
   1704 // defined in ucln_cmn.h
   1705 
   1706 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1707 
   1708 /**
   1709  * Release all static memory held by breakiterator.
   1710  */
   1711 U_CDECL_BEGIN
   1712 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1713     if (gLanguageBreakFactories) {
   1714         delete gLanguageBreakFactories;
   1715         gLanguageBreakFactories = NULL;
   1716     }
   1717     return TRUE;
   1718 }
   1719 U_CDECL_END
   1720 
   1721 U_CDECL_BEGIN
   1722 static void U_CALLCONV _deleteFactory(void *obj) {
   1723     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1724 }
   1725 U_CDECL_END
   1726 U_NAMESPACE_BEGIN
   1727 
   1728 static const LanguageBreakEngine*
   1729 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1730 {
   1731     UBool       needsInit;
   1732     UErrorCode  status = U_ZERO_ERROR;
   1733     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1734 
   1735     if (needsInit) {
   1736         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1737         if (factories != NULL && U_SUCCESS(status)) {
   1738             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1739             factories->push(builtIn, status);
   1740 #ifdef U_LOCAL_SERVICE_HOOK
   1741             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1742             if (extra != NULL) {
   1743                 factories->push(extra, status);
   1744             }
   1745 #endif
   1746         }
   1747         umtx_lock(NULL);
   1748         if (gLanguageBreakFactories == NULL) {
   1749             gLanguageBreakFactories = factories;
   1750             factories = NULL;
   1751             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1752         }
   1753         umtx_unlock(NULL);
   1754         delete factories;
   1755     }
   1756 
   1757     if (gLanguageBreakFactories == NULL) {
   1758         return NULL;
   1759     }
   1760 
   1761     int32_t i = gLanguageBreakFactories->size();
   1762     const LanguageBreakEngine *lbe = NULL;
   1763     while (--i >= 0) {
   1764         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1765         lbe = factory->getEngineFor(c, breakType);
   1766         if (lbe != NULL) {
   1767             break;
   1768         }
   1769     }
   1770     return lbe;
   1771 }
   1772 
   1773 
   1774 //-------------------------------------------------------------------------------
   1775 //
   1776 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1777 //                          the characer c.
   1778 //
   1779 //-------------------------------------------------------------------------------
   1780 const LanguageBreakEngine *
   1781 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1782     const LanguageBreakEngine *lbe = NULL;
   1783     UErrorCode status = U_ZERO_ERROR;
   1784 
   1785     if (fLanguageBreakEngines == NULL) {
   1786         fLanguageBreakEngines = new UStack(status);
   1787         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1788             delete fLanguageBreakEngines;
   1789             fLanguageBreakEngines = 0;
   1790             return NULL;
   1791         }
   1792     }
   1793 
   1794     int32_t i = fLanguageBreakEngines->size();
   1795     while (--i >= 0) {
   1796         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1797         if (lbe->handles(c, fBreakType)) {
   1798             return lbe;
   1799         }
   1800     }
   1801 
   1802     // No existing dictionary took the character. See if a factory wants to
   1803     // give us a new LanguageBreakEngine for this character.
   1804     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1805 
   1806     // If we got one, use it and push it on our stack.
   1807     if (lbe != NULL) {
   1808         fLanguageBreakEngines->push((void *)lbe, status);
   1809         // Even if we can't remember it, we can keep looking it up, so
   1810         // return it even if the push fails.
   1811         return lbe;
   1812     }
   1813 
   1814     // No engine is forthcoming for this character. Add it to the
   1815     // reject set. Create the reject break engine if needed.
   1816     if (fUnhandledBreakEngine == NULL) {
   1817         fUnhandledBreakEngine = new UnhandledEngine(status);
   1818         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1819             status = U_MEMORY_ALLOCATION_ERROR;
   1820         }
   1821         // Put it last so that scripts for which we have an engine get tried
   1822         // first.
   1823         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1824         // If we can't insert it, or creation failed, get rid of it
   1825         if (U_FAILURE(status)) {
   1826             delete fUnhandledBreakEngine;
   1827             fUnhandledBreakEngine = 0;
   1828             return NULL;
   1829         }
   1830     }
   1831 
   1832     // Tell the reject engine about the character; at its discretion, it may
   1833     // add more than just the one character.
   1834     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1835 
   1836     return fUnhandledBreakEngine;
   1837 }
   1838 
   1839 
   1840 
   1841 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1842     return fBreakType;
   1843 }*/
   1844 
   1845 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1846     fBreakType = type;
   1847     reset();
   1848 }
   1849 
   1850 U_NAMESPACE_END
   1851 
   1852 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1853