Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2008 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include "unicode/rbbi.h"
     18 #include "unicode/schriter.h"
     19 #include "unicode/uchriter.h"
     20 #include "unicode/udata.h"
     21 #include "unicode/uclean.h"
     22 #include "rbbidata.h"
     23 #include "rbbirb.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "umutex.h"
     27 #include "ucln_cmn.h"
     28 #include "brkeng.h"
     29 
     30 #include "uassert.h"
     31 #include "uvector.h"
     32 #include <stdio.h>
     33 
     34 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     35 #if U_LOCAL_SERVICE_HOOK
     36 #include "localsvc.h"
     37 #endif
     38 
     39 #ifdef RBBI_DEBUG
     40 static UBool fTrace = FALSE;
     41 #endif
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 // The state number of the starting state
     46 #define START_STATE 1
     47 
     48 // The state-transition value indicating "stop"
     49 #define STOP_STATE  0
     50 
     51 
     52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     53 
     54 
     55 //=======================================================================
     56 // constructors
     57 //=======================================================================
     58 
     59 /**
     60  * Constructs a RuleBasedBreakIterator that uses the already-created
     61  * tables object that is passed in as a parameter.
     62  */
     63 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     64 {
     65     init();
     66     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     67     if (U_FAILURE(status)) {return;}
     68     if(fData == 0) {
     69         status = U_MEMORY_ALLOCATION_ERROR;
     70         return;
     71     }
     72 }
     73 
     74 /**
     75  * Same as above but does not adopt memory
     76  */
     77 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     78 {
     79     init();
     80     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     81     if (U_FAILURE(status)) {return;}
     82     if(fData == 0) {
     83         status = U_MEMORY_ALLOCATION_ERROR;
     84         return;
     85     }
     86 }
     87 
     88 //-------------------------------------------------------------------------------
     89 //
     90 //   Constructor   from a UDataMemory handle to precompiled break rules
     91 //                 stored in an ICU data file.
     92 //
     93 //-------------------------------------------------------------------------------
     94 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
     95 {
     96     init();
     97     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     98     if (U_FAILURE(status)) {return;}
     99     if(fData == 0) {
    100         status = U_MEMORY_ALLOCATION_ERROR;
    101         return;
    102     }
    103 }
    104 
    105 
    106 
    107 //-------------------------------------------------------------------------------
    108 //
    109 //   Constructor       from a set of rules supplied as a string.
    110 //
    111 //-------------------------------------------------------------------------------
    112 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    113                                                 UParseError          &parseError,
    114                                                 UErrorCode           &status)
    115 {
    116     init();
    117     if (U_FAILURE(status)) {return;}
    118     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    119         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    120     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    121     //        creates and returns a complete RBBI.  From here, in a constructor, we
    122     //        can't just return the object created by the builder factory, hence
    123     //        the assignment of the factory created object to "this".
    124     if (U_SUCCESS(status)) {
    125         *this = *bi;
    126         delete bi;
    127     }
    128 }
    129 
    130 
    131 //-------------------------------------------------------------------------------
    132 //
    133 // Default Constructor.      Create an empty shell that can be set up later.
    134 //                           Used when creating a RuleBasedBreakIterator from a set
    135 //                           of rules.
    136 //-------------------------------------------------------------------------------
    137 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    138     init();
    139 }
    140 
    141 
    142 //-------------------------------------------------------------------------------
    143 //
    144 //   Copy constructor.  Will produce a break iterator with the same behavior,
    145 //                      and which iterates over the same text, as the one passed in.
    146 //
    147 //-------------------------------------------------------------------------------
    148 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    149 : BreakIterator(other)
    150 {
    151     this->init();
    152     *this = other;
    153 }
    154 
    155 
    156 /**
    157  * Destructor
    158  */
    159 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    160     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    161         // fCharIter was adopted from the outside.
    162         delete fCharIter;
    163     }
    164     fCharIter = NULL;
    165     delete fSCharIter;
    166     fCharIter = NULL;
    167     delete fDCharIter;
    168     fDCharIter = NULL;
    169 
    170     utext_close(fText);
    171 
    172     if (fData != NULL) {
    173         fData->removeReference();
    174         fData = NULL;
    175     }
    176     if (fCachedBreakPositions) {
    177         uprv_free(fCachedBreakPositions);
    178         fCachedBreakPositions = NULL;
    179     }
    180     if (fLanguageBreakEngines) {
    181         delete fLanguageBreakEngines;
    182         fLanguageBreakEngines = NULL;
    183     }
    184     if (fUnhandledBreakEngine) {
    185         delete fUnhandledBreakEngine;
    186         fUnhandledBreakEngine = NULL;
    187     }
    188 }
    189 
    190 /**
    191  * Assignment operator.  Sets this iterator to have the same behavior,
    192  * and iterate over the same text, as the one passed in.
    193  */
    194 RuleBasedBreakIterator&
    195 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    196     if (this == &that) {
    197         return *this;
    198     }
    199     reset();    // Delete break cache information
    200     fBreakType = that.fBreakType;
    201     if (fLanguageBreakEngines != NULL) {
    202         delete fLanguageBreakEngines;
    203         fLanguageBreakEngines = NULL;   // Just rebuild for now
    204     }
    205     // TODO: clone fLanguageBreakEngines from "that"
    206     UErrorCode status = U_ZERO_ERROR;
    207     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    208 
    209     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    210         delete fCharIter;
    211     }
    212     fCharIter = NULL;
    213 
    214     if (that.fCharIter != NULL ) {
    215         // This is a little bit tricky - it will intially appear that
    216         //  this->fCharIter is adopted, even if that->fCharIter was
    217         //  not adopted.  That's ok.
    218         fCharIter = that.fCharIter->clone();
    219     }
    220 
    221     if (fData != NULL) {
    222         fData->removeReference();
    223         fData = NULL;
    224     }
    225     if (that.fData != NULL) {
    226         fData = that.fData->addReference();
    227     }
    228 
    229     return *this;
    230 }
    231 
    232 
    233 
    234 //-----------------------------------------------------------------------------
    235 //
    236 //    init()      Shared initialization routine.   Used by all the constructors.
    237 //                Initializes all fields, leaving the object in a consistent state.
    238 //
    239 //-----------------------------------------------------------------------------
    240 void RuleBasedBreakIterator::init() {
    241     UErrorCode  status    = U_ZERO_ERROR;
    242     fBufferClone          = FALSE;
    243     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    244     fCharIter             = NULL;
    245     fSCharIter            = NULL;
    246     fDCharIter            = NULL;
    247     fData                 = NULL;
    248     fLastRuleStatusIndex  = 0;
    249     fLastStatusIndexValid = TRUE;
    250     fDictionaryCharCount  = 0;
    251     fBreakType            = -1;
    252 
    253     fCachedBreakPositions    = NULL;
    254     fLanguageBreakEngines    = NULL;
    255     fUnhandledBreakEngine    = NULL;
    256     fNumCachedBreakPositions = 0;
    257     fPositionInCache         = 0;
    258 
    259 #ifdef RBBI_DEBUG
    260     static UBool debugInitDone = FALSE;
    261     if (debugInitDone == FALSE) {
    262         char *debugEnv = getenv("U_RBBIDEBUG");
    263         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    264             fTrace = TRUE;
    265         }
    266         debugInitDone = TRUE;
    267     }
    268 #endif
    269 }
    270 
    271 
    272 
    273 //-----------------------------------------------------------------------------
    274 //
    275 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    276 //            behavior, and iterating over the same text, as this one.
    277 //            Virtual function: does the right thing with subclasses.
    278 //
    279 //-----------------------------------------------------------------------------
    280 BreakIterator*
    281 RuleBasedBreakIterator::clone(void) const {
    282     return new RuleBasedBreakIterator(*this);
    283 }
    284 
    285 /**
    286  * Equality operator.  Returns TRUE if both BreakIterators are of the
    287  * same class, have the same behavior, and iterate over the same text.
    288  */
    289 UBool
    290 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    291     if (that.getDynamicClassID() != getDynamicClassID()) {
    292         return FALSE;
    293     }
    294 
    295     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    296 
    297     if (!utext_equals(fText, that2.fText)) {
    298         // The two break iterators are operating on different text,
    299         //   or have a different interation position.
    300         return FALSE;
    301     };
    302 
    303     // TODO:  need a check for when in a dictionary region at different offsets.
    304 
    305     if (that2.fData == fData ||
    306         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    307             // The two break iterators are using the same rules.
    308             return TRUE;
    309         }
    310     return FALSE;
    311 }
    312 
    313 /**
    314  * Compute a hash code for this BreakIterator
    315  * @return A hash code
    316  */
    317 int32_t
    318 RuleBasedBreakIterator::hashCode(void) const {
    319     int32_t   hash = 0;
    320     if (fData != NULL) {
    321         hash = fData->hashCode();
    322     }
    323     return hash;
    324 }
    325 
    326 
    327 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    328     if (U_FAILURE(status)) {
    329         return;
    330     }
    331     reset();
    332     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    333 
    334     // Set up a dummy CharacterIterator to be returned if anyone
    335     //   calls getText().  With input from UText, there is no reasonable
    336     //   way to return a characterIterator over the actual input text.
    337     //   Return one over an empty string instead - this is the closest
    338     //   we can come to signaling a failure.
    339     //   (GetText() is obsolete, this failure is sort of OK)
    340     if (fDCharIter == NULL) {
    341         static const UChar c = 0;
    342         fDCharIter = new UCharCharacterIterator(&c, 0);
    343         if (fDCharIter == NULL) {
    344             status = U_MEMORY_ALLOCATION_ERROR;
    345             return;
    346         }
    347     }
    348 
    349     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    350         // existing fCharIter was adopted from the outside.  Delete it now.
    351         delete fCharIter;
    352     }
    353     fCharIter = fDCharIter;
    354 
    355     this->first();
    356 }
    357 
    358 
    359 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    360     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    361     return result;
    362 }
    363 
    364 
    365 
    366 /**
    367  * Returns the description used to create this iterator
    368  */
    369 const UnicodeString&
    370 RuleBasedBreakIterator::getRules() const {
    371     if (fData != NULL) {
    372         return fData->getRuleSourceString();
    373     } else {
    374         static const UnicodeString *s;
    375         if (s == NULL) {
    376             // TODO:  something more elegant here.
    377             //        perhaps API should return the string by value.
    378             //        Note:  thread unsafe init & leak are semi-ok, better than
    379             //               what was before.  Sould be cleaned up, though.
    380             s = new UnicodeString;
    381         }
    382         return *s;
    383     }
    384 }
    385 
    386 //=======================================================================
    387 // BreakIterator overrides
    388 //=======================================================================
    389 
    390 /**
    391  * Return a CharacterIterator over the text being analyzed.
    392  */
    393 CharacterIterator&
    394 RuleBasedBreakIterator::getText() const {
    395     return *fCharIter;
    396 }
    397 
    398 /**
    399  * Set the iterator to analyze a new piece of text.  This function resets
    400  * the current iteration position to the beginning of the text.
    401  * @param newText An iterator over the text to analyze.
    402  */
    403 void
    404 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    405     // If we are holding a CharacterIterator adopted from a
    406     //   previous call to this function, delete it now.
    407     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    408         delete fCharIter;
    409     }
    410 
    411     fCharIter = newText;
    412     UErrorCode status = U_ZERO_ERROR;
    413     reset();
    414     if (newText==NULL || newText->startIndex() != 0) {
    415         // startIndex !=0 wants to be an error, but there's no way to report it.
    416         // Make the iterator text be an empty string.
    417         fText = utext_openUChars(fText, NULL, 0, &status);
    418     } else {
    419         fText = utext_openCharacterIterator(fText, newText, &status);
    420     }
    421     this->first();
    422 }
    423 
    424 /**
    425  * Set the iterator to analyze a new piece of text.  This function resets
    426  * the current iteration position to the beginning of the text.
    427  * @param newText An iterator over the text to analyze.
    428  */
    429 void
    430 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    431     UErrorCode status = U_ZERO_ERROR;
    432     reset();
    433     fText = utext_openConstUnicodeString(fText, &newText, &status);
    434 
    435     // Set up a character iterator on the string.
    436     //   Needed in case someone calls getText().
    437     //  Can not, unfortunately, do this lazily on the (probably never)
    438     //  call to getText(), because getText is const.
    439     if (fSCharIter == NULL) {
    440         fSCharIter = new StringCharacterIterator(newText);
    441     } else {
    442         fSCharIter->setText(newText);
    443     }
    444 
    445     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    446         // old fCharIter was adopted from the outside.  Delete it.
    447         delete fCharIter;
    448     }
    449     fCharIter = fSCharIter;
    450 
    451     this->first();
    452 }
    453 
    454 
    455 
    456 /**
    457  * Sets the current iteration position to the beginning of the text.
    458  * @return The offset of the beginning of the text.
    459  */
    460 int32_t RuleBasedBreakIterator::first(void) {
    461     reset();
    462     fLastRuleStatusIndex  = 0;
    463     fLastStatusIndexValid = TRUE;
    464     //if (fText == NULL)
    465     //    return BreakIterator::DONE;
    466 
    467     utext_setNativeIndex(fText, 0);
    468     return 0;
    469 }
    470 
    471 /**
    472  * Sets the current iteration position to the end of the text.
    473  * @return The text's past-the-end offset.
    474  */
    475 int32_t RuleBasedBreakIterator::last(void) {
    476     reset();
    477     if (fText == NULL) {
    478         fLastRuleStatusIndex  = 0;
    479         fLastStatusIndexValid = TRUE;
    480         return BreakIterator::DONE;
    481     }
    482 
    483     fLastStatusIndexValid = FALSE;
    484     int32_t pos = (int32_t)utext_nativeLength(fText);
    485     utext_setNativeIndex(fText, pos);
    486     return pos;
    487 }
    488 
    489 /**
    490  * Advances the iterator either forward or backward the specified number of steps.
    491  * Negative values move backward, and positive values move forward.  This is
    492  * equivalent to repeatedly calling next() or previous().
    493  * @param n The number of steps to move.  The sign indicates the direction
    494  * (negative is backwards, and positive is forwards).
    495  * @return The character offset of the boundary position n boundaries away from
    496  * the current one.
    497  */
    498 int32_t RuleBasedBreakIterator::next(int32_t n) {
    499     int32_t result = current();
    500     while (n > 0) {
    501         result = next();
    502         --n;
    503     }
    504     while (n < 0) {
    505         result = previous();
    506         ++n;
    507     }
    508     return result;
    509 }
    510 
    511 /**
    512  * Advances the iterator to the next boundary position.
    513  * @return The position of the first boundary after this one.
    514  */
    515 int32_t RuleBasedBreakIterator::next(void) {
    516     // if we have cached break positions and we're still in the range
    517     // covered by them, just move one step forward in the cache
    518     if (fCachedBreakPositions != NULL) {
    519         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    520             ++fPositionInCache;
    521             int32_t pos = fCachedBreakPositions[fPositionInCache];
    522             utext_setNativeIndex(fText, pos);
    523             return pos;
    524         }
    525         else {
    526             reset();
    527         }
    528     }
    529 
    530     int32_t startPos = current();
    531     int32_t result = handleNext(fData->fForwardTable);
    532     if (fDictionaryCharCount > 0) {
    533         result = checkDictionary(startPos, result, FALSE);
    534     }
    535     return result;
    536 }
    537 
    538 /**
    539  * Advances the iterator backwards, to the last boundary preceding this one.
    540  * @return The position of the last boundary position preceding this one.
    541  */
    542 int32_t RuleBasedBreakIterator::previous(void) {
    543     int32_t result;
    544     int32_t startPos;
    545 
    546     // if we have cached break positions and we're still in the range
    547     // covered by them, just move one step backward in the cache
    548     if (fCachedBreakPositions != NULL) {
    549         if (fPositionInCache > 0) {
    550             --fPositionInCache;
    551             // If we're at the beginning of the cache, need to reevaluate the
    552             // rule status
    553             if (fPositionInCache <= 0) {
    554                 fLastStatusIndexValid = FALSE;
    555             }
    556             int32_t pos = fCachedBreakPositions[fPositionInCache];
    557             utext_setNativeIndex(fText, pos);
    558             return pos;
    559         }
    560         else {
    561             reset();
    562         }
    563     }
    564 
    565     // if we're already sitting at the beginning of the text, return DONE
    566     if (fText == NULL || (startPos = current()) == 0) {
    567         fLastRuleStatusIndex  = 0;
    568         fLastStatusIndexValid = TRUE;
    569         return BreakIterator::DONE;
    570     }
    571 
    572     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    573         result = handlePrevious(fData->fReverseTable);
    574         if (fDictionaryCharCount > 0) {
    575             result = checkDictionary(result, startPos, TRUE);
    576         }
    577         return result;
    578     }
    579 
    580     // old rule syntax
    581     // set things up.  handlePrevious() will back us up to some valid
    582     // break position before the current position (we back our internal
    583     // iterator up one step to prevent handlePrevious() from returning
    584     // the current position), but not necessarily the last one before
    585 
    586     // where we started
    587 
    588     int32_t start = current();
    589 
    590     UTEXT_PREVIOUS32(fText);
    591     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    592     if (lastResult == UBRK_DONE) {
    593         lastResult = 0;
    594         utext_setNativeIndex(fText, 0);
    595     }
    596     result = lastResult;
    597     int32_t lastTag       = 0;
    598     UBool   breakTagValid = FALSE;
    599 
    600     // iterate forward from the known break position until we pass our
    601     // starting point.  The last break position before the starting
    602     // point is our return value
    603 
    604     for (;;) {
    605         result         = next();
    606         if (result == BreakIterator::DONE || result >= start) {
    607             break;
    608         }
    609         lastResult     = result;
    610         lastTag        = fLastRuleStatusIndex;
    611         breakTagValid  = TRUE;
    612     }
    613 
    614     // fLastBreakTag wants to have the value for section of text preceding
    615     // the result position that we are to return (in lastResult.)  If
    616     // the backwards rules overshot and the above loop had to do two or more
    617     // next()s to move up to the desired return position, we will have a valid
    618     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    619     // we wont have a tag value for that position, which is only set by handleNext().
    620 
    621     // set the current iteration position to be the last break position
    622     // before where we started, and then return that value
    623     utext_setNativeIndex(fText, lastResult);
    624     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    625     fLastStatusIndexValid = breakTagValid;
    626 
    627     // No need to check the dictionary; it will have been handled by
    628     // next()
    629 
    630     return lastResult;
    631 }
    632 
    633 /**
    634  * Sets the iterator to refer to the first boundary position following
    635  * the specified position.
    636  * @offset The position from which to begin searching for a break position.
    637  * @return The position of the first break after the current position.
    638  */
    639 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    640     // if we have cached break positions and offset is in the range
    641     // covered by them, use them
    642     // TODO: could use binary search
    643     // TODO: what if offset is outside range, but break is not?
    644     if (fCachedBreakPositions != NULL) {
    645         if (offset >= fCachedBreakPositions[0]
    646                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    647             fPositionInCache = 0;
    648             // We are guaranteed not to leave the array due to range test above
    649             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    650                 ++fPositionInCache;
    651             }
    652             int32_t pos = fCachedBreakPositions[fPositionInCache];
    653             utext_setNativeIndex(fText, pos);
    654             return pos;
    655         }
    656         else {
    657             reset();
    658         }
    659     }
    660 
    661     // if the offset passed in is already past the end of the text,
    662     // just return DONE; if it's before the beginning, return the
    663     // text's starting offset
    664     fLastRuleStatusIndex  = 0;
    665     fLastStatusIndexValid = TRUE;
    666     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    667         last();
    668         return next();
    669     }
    670     else if (offset < 0) {
    671         return first();
    672     }
    673 
    674     // otherwise, set our internal iteration position (temporarily)
    675     // to the position passed in.  If this is the _beginning_ position,
    676     // then we can just use next() to get our return value
    677 
    678     int32_t result = 0;
    679 
    680     if (fData->fSafeRevTable != NULL) {
    681         // new rule syntax
    682         utext_setNativeIndex(fText, offset);
    683         // move forward one codepoint to prepare for moving back to a
    684         // safe point.
    685         // this handles offset being between a supplementary character
    686         UTEXT_NEXT32(fText);
    687         // handlePrevious will move most of the time to < 1 boundary away
    688         handlePrevious(fData->fSafeRevTable);
    689         int32_t result = next();
    690         while (result <= offset) {
    691             result = next();
    692         }
    693         return result;
    694     }
    695     if (fData->fSafeFwdTable != NULL) {
    696         // backup plan if forward safe table is not available
    697         utext_setNativeIndex(fText, offset);
    698         UTEXT_PREVIOUS32(fText);
    699         // handle next will give result >= offset
    700         handleNext(fData->fSafeFwdTable);
    701         // previous will give result 0 or 1 boundary away from offset,
    702         // most of the time
    703         // we have to
    704         int32_t oldresult = previous();
    705         while (oldresult > offset) {
    706             int32_t result = previous();
    707             if (result <= offset) {
    708                 return oldresult;
    709             }
    710             oldresult = result;
    711         }
    712         int32_t result = next();
    713         if (result <= offset) {
    714             return next();
    715         }
    716         return result;
    717     }
    718     // otherwise, we have to sync up first.  Use handlePrevious() to back
    719     // up to a known break position before the specified position (if
    720     // we can determine that the specified position is a break position,
    721     // we don't back up at all).  This may or may not be the last break
    722     // position at or before our starting position.  Advance forward
    723     // from here until we've passed the starting position.  The position
    724     // we stop on will be the first break position after the specified one.
    725     // old rule syntax
    726 
    727     utext_setNativeIndex(fText, offset);
    728     if (offset==0 ||
    729         offset==1  && utext_getNativeIndex(fText)==0) {
    730         return next();
    731     }
    732     result = previous();
    733 
    734     while (result != BreakIterator::DONE && result <= offset) {
    735         result = next();
    736     }
    737 
    738     return result;
    739 }
    740 
    741 /**
    742  * Sets the iterator to refer to the last boundary position before the
    743  * specified position.
    744  * @offset The position to begin searching for a break from.
    745  * @return The position of the last boundary before the starting position.
    746  */
    747 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    748     // if we have cached break positions and offset is in the range
    749     // covered by them, use them
    750     if (fCachedBreakPositions != NULL) {
    751         // TODO: binary search?
    752         // TODO: What if offset is outside range, but break is not?
    753         if (offset > fCachedBreakPositions[0]
    754                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    755             fPositionInCache = 0;
    756             while (fPositionInCache < fNumCachedBreakPositions
    757                    && offset > fCachedBreakPositions[fPositionInCache])
    758                 ++fPositionInCache;
    759             --fPositionInCache;
    760             // If we're at the beginning of the cache, need to reevaluate the
    761             // rule status
    762             if (fPositionInCache <= 0) {
    763                 fLastStatusIndexValid = FALSE;
    764             }
    765             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    766             return fCachedBreakPositions[fPositionInCache];
    767         }
    768         else {
    769             reset();
    770         }
    771     }
    772 
    773     // if the offset passed in is already past the end of the text,
    774     // just return DONE; if it's before the beginning, return the
    775     // text's starting offset
    776     if (fText == NULL || offset > utext_nativeLength(fText)) {
    777         // return BreakIterator::DONE;
    778         return last();
    779     }
    780     else if (offset < 0) {
    781         return first();
    782     }
    783 
    784     // if we start by updating the current iteration position to the
    785     // position specified by the caller, we can just use previous()
    786     // to carry out this operation
    787 
    788     if (fData->fSafeFwdTable != NULL) {
    789         // new rule syntax
    790         utext_setNativeIndex(fText, offset);
    791         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    792         if (newOffset != offset) {
    793             // Will come here if specified offset was not a code point boundary AND
    794             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    795             //   indices to the containing code point.
    796             // For breakitereator::preceding only, these non-code-point indices need to be moved
    797             //   up to refer to the following codepoint.
    798             UTEXT_NEXT32(fText);
    799             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    800         }
    801 
    802         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    803         //        rather than adjusting the position unconditionally?
    804         //        (Change would interact with safe rules.)
    805         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    806         //        affects only preceding(), seems cleaner, but is slightly different.
    807         UTEXT_PREVIOUS32(fText);
    808         handleNext(fData->fSafeFwdTable);
    809         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    810         while (result >= offset) {
    811             result = previous();
    812         }
    813         return result;
    814     }
    815     if (fData->fSafeRevTable != NULL) {
    816         // backup plan if forward safe table is not available
    817         //  TODO:  check whether this path can be discarded
    818         //         It's probably OK to say that rules must supply both safe tables
    819         //            if they use safe tables at all.  We have certainly never described
    820         //            to anyone how to work with just one safe table.
    821         utext_setNativeIndex(fText, offset);
    822         UTEXT_NEXT32(fText);
    823 
    824         // handle previous will give result <= offset
    825         handlePrevious(fData->fSafeRevTable);
    826 
    827         // next will give result 0 or 1 boundary away from offset,
    828         // most of the time
    829         // we have to
    830         int32_t oldresult = next();
    831         while (oldresult < offset) {
    832             int32_t result = next();
    833             if (result >= offset) {
    834                 return oldresult;
    835             }
    836             oldresult = result;
    837         }
    838         int32_t result = previous();
    839         if (result >= offset) {
    840             return previous();
    841         }
    842         return result;
    843     }
    844 
    845     // old rule syntax
    846     utext_setNativeIndex(fText, offset);
    847     return previous();
    848 }
    849 
    850 /**
    851  * Returns true if the specfied position is a boundary position.  As a side
    852  * effect, leaves the iterator pointing to the first boundary position at
    853  * or after "offset".
    854  * @param offset the offset to check.
    855  * @return True if "offset" is a boundary position.
    856  */
    857 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    858     // the beginning index of the iterator is always a boundary position by definition
    859     if (offset == 0) {
    860         first();       // For side effects on current position, tag values.
    861         return TRUE;
    862     }
    863 
    864     if (offset == (int32_t)utext_nativeLength(fText)) {
    865         last();       // For side effects on current position, tag values.
    866         return TRUE;
    867     }
    868 
    869     // out-of-range indexes are never boundary positions
    870     if (offset < 0) {
    871         first();       // For side effects on current position, tag values.
    872         return FALSE;
    873     }
    874 
    875     if (offset > utext_nativeLength(fText)) {
    876         last();        // For side effects on current position, tag values.
    877         return FALSE;
    878     }
    879 
    880     // otherwise, we can use following() on the position before the specified
    881     // one and return true if the position we get back is the one the user
    882     // specified
    883     utext_previous32From(fText, offset);
    884     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    885     UBool    result  = following(backOne) == offset;
    886     return result;
    887 }
    888 
    889 /**
    890  * Returns the current iteration position.
    891  * @return The current iteration position.
    892  */
    893 int32_t RuleBasedBreakIterator::current(void) const {
    894     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    895     return pos;
    896 }
    897 
    898 //=======================================================================
    899 // implementation
    900 //=======================================================================
    901 
    902 //
    903 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    904 //                 of user text.  A variable with this enum type keeps track of where we
    905 //                 are.  The state machine only fetches user input while in the RUN mode.
    906 //
    907 enum RBBIRunMode {
    908     RBBI_START,     // state machine processing is before first char of input
    909     RBBI_RUN,       // state machine processing is in the user text
    910     RBBI_END        // state machine processing is after end of user text.
    911 };
    912 
    913 
    914 //-----------------------------------------------------------------------------------
    915 //
    916 //  handleNext(stateTable)
    917 //     This method is the actual implementation of the rbbi next() method.
    918 //     This method initializes the state machine to state 1
    919 //     and advances through the text character by character until we reach the end
    920 //     of the text or the state machine transitions to state 0.  We update our return
    921 //     value every time the state machine passes through an accepting state.
    922 //
    923 //-----------------------------------------------------------------------------------
    924 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    925     int32_t             state;
    926     int16_t             category        = 0;
    927     RBBIRunMode         mode;
    928 
    929     RBBIStateTableRow  *row;
    930     UChar32             c;
    931     int32_t             lookaheadStatus = 0;
    932     int32_t             lookaheadTagIdx = 0;
    933     int32_t             result          = 0;
    934     int32_t             initialPosition = 0;
    935     int32_t             lookaheadResult = 0;
    936     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    937     const char         *tableData       = statetable->fTableData;
    938     uint32_t            tableRowLen     = statetable->fRowLen;
    939 
    940     #ifdef RBBI_DEBUG
    941         if (fTrace) {
    942             RBBIDebugPuts("Handle Next   pos   char  state category");
    943         }
    944     #endif
    945 
    946     // No matter what, handleNext alway correctly sets the break tag value.
    947     fLastStatusIndexValid = TRUE;
    948     fLastRuleStatusIndex = 0;
    949 
    950     // if we're already at the end of the text, return DONE.
    951     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    952     result          = initialPosition;
    953     c               = UTEXT_NEXT32(fText);
    954     if (fData == NULL || c==U_SENTINEL) {
    955         return BreakIterator::DONE;
    956     }
    957 
    958     //  Set the initial state for the state machine
    959     state = START_STATE;
    960     row = (RBBIStateTableRow *)
    961             //(statetable->fTableData + (statetable->fRowLen * state));
    962             (tableData + tableRowLen * state);
    963 
    964 
    965     mode     = RBBI_RUN;
    966     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    967         category = 2;
    968         mode     = RBBI_START;
    969     }
    970 
    971 
    972     // loop until we reach the end of the text or transition to state 0
    973     //
    974     for (;;) {
    975         if (c == U_SENTINEL) {
    976             // Reached end of input string.
    977             if (mode == RBBI_END) {
    978                 // We have already run the loop one last time with the
    979                 //   character set to the psueudo {eof} value.  Now it is time
    980                 //   to unconditionally bail out.
    981                 if (lookaheadResult > result) {
    982                     // We ran off the end of the string with a pending look-ahead match.
    983                     // Treat this as if the look-ahead condition had been met, and return
    984                     //  the match at the / position from the look-ahead rule.
    985                     result               = lookaheadResult;
    986                     fLastRuleStatusIndex = lookaheadTagIdx;
    987                     lookaheadStatus = 0;
    988                 }
    989                 break;
    990             }
    991             // Run the loop one last time with the fake end-of-input character category.
    992             mode = RBBI_END;
    993             category = 1;
    994         }
    995 
    996         //
    997         // Get the char category.  An incoming category of 1 or 2 means that
    998         //      we are preset for doing the beginning or end of input, and
    999         //      that we shouldn't get a category from an actual text input character.
   1000         //
   1001         if (mode == RBBI_RUN) {
   1002             // look up the current character's character category, which tells us
   1003             // which column in the state table to look at.
   1004             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1005             //        not the size of the character going in, which is a UChar32.
   1006             //
   1007             UTRIE_GET16(&fData->fTrie, c, category);
   1008 
   1009             // Check the dictionary bit in the character's category.
   1010             //    Counter is only used by dictionary based iterators (subclasses).
   1011             //    Chars that need to be handled by a dictionary have a flag bit set
   1012             //    in their category values.
   1013             //
   1014             if ((category & 0x4000) != 0)  {
   1015                 fDictionaryCharCount++;
   1016                 //  And off the dictionary flag bit.
   1017                 category &= ~0x4000;
   1018             }
   1019         }
   1020 
   1021         #ifdef RBBI_DEBUG
   1022             if (fTrace) {
   1023                 RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
   1024                 if (0x20<=c && c<0x7f) {
   1025                     RBBIDebugPrintf("\"%c\"  ", c);
   1026                 } else {
   1027                     RBBIDebugPrintf("%5x  ", c);
   1028                 }
   1029                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1030             }
   1031         #endif
   1032 
   1033         // State Transition - move machine to its next state
   1034         //
   1035         state = row->fNextState[category];
   1036         row = (RBBIStateTableRow *)
   1037             // (statetable->fTableData + (statetable->fRowLen * state));
   1038             (tableData + tableRowLen * state);
   1039 
   1040 
   1041         if (row->fAccepting == -1) {
   1042             // Match found, common case.
   1043             if (mode != RBBI_START) {
   1044                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1045             }
   1046             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1047         }
   1048 
   1049         if (row->fLookAhead != 0) {
   1050             if (lookaheadStatus != 0
   1051                 && row->fAccepting == lookaheadStatus) {
   1052                 // Lookahead match is completed.
   1053                 result               = lookaheadResult;
   1054                 fLastRuleStatusIndex = lookaheadTagIdx;
   1055                 lookaheadStatus      = 0;
   1056                 // TODO:  make a standalone hard break in a rule work.
   1057                 if (lookAheadHardBreak) {
   1058                     UTEXT_SETNATIVEINDEX(fText, result);
   1059                     return result;
   1060                 }
   1061                 // Look-ahead completed, but other rules may match further.  Continue on
   1062                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1063                 goto continueOn;
   1064             }
   1065 
   1066             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1067             lookaheadResult = r;
   1068             lookaheadStatus = row->fLookAhead;
   1069             lookaheadTagIdx = row->fTagIdx;
   1070             goto continueOn;
   1071         }
   1072 
   1073 
   1074         if (row->fAccepting != 0) {
   1075             // Because this is an accepting state, any in-progress look-ahead match
   1076             //   is no longer relavant.  Clear out the pending lookahead status.
   1077             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1078         }
   1079 
   1080 continueOn:
   1081         if (state == STOP_STATE) {
   1082             // This is the normal exit from the lookup state machine.
   1083             // We have advanced through the string until it is certain that no
   1084             //   longer match is possible, no matter what characters follow.
   1085             break;
   1086         }
   1087 
   1088         // Advance to the next character.
   1089         // If this is a beginning-of-input loop iteration, don't advance
   1090         //    the input position.  The next iteration will be processing the
   1091         //    first real input character.
   1092         if (mode == RBBI_RUN) {
   1093             c = UTEXT_NEXT32(fText);
   1094         } else {
   1095             if (mode == RBBI_START) {
   1096                 mode = RBBI_RUN;
   1097             }
   1098         }
   1099 
   1100 
   1101     }
   1102 
   1103     // The state machine is done.  Check whether it found a match...
   1104 
   1105     // If the iterator failed to advance in the match engine, force it ahead by one.
   1106     //   (This really indicates a defect in the break rules.  They should always match
   1107     //    at least one character.)
   1108     if (result == initialPosition) {
   1109         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1110         UTEXT_NEXT32(fText);
   1111         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1112     }
   1113 
   1114     // Leave the iterator at our result position.
   1115     UTEXT_SETNATIVEINDEX(fText, result);
   1116     #ifdef RBBI_DEBUG
   1117         if (fTrace) {
   1118             RBBIDebugPrintf("result = %d\n\n", result);
   1119         }
   1120     #endif
   1121     return result;
   1122 }
   1123 
   1124 
   1125 
   1126 //-----------------------------------------------------------------------------------
   1127 //
   1128 //  handlePrevious()
   1129 //
   1130 //      Iterate backwards, according to the logic of the reverse rules.
   1131 //      This version handles the exact style backwards rules.
   1132 //
   1133 //      The logic of this function is very similar to handleNext(), above.
   1134 //
   1135 //-----------------------------------------------------------------------------------
   1136 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1137     int32_t             state;
   1138     int16_t             category        = 0;
   1139     RBBIRunMode         mode;
   1140     RBBIStateTableRow  *row;
   1141     UChar32             c;
   1142     int32_t             lookaheadStatus = 0;
   1143     int32_t             result          = 0;
   1144     int32_t             initialPosition = 0;
   1145     int32_t             lookaheadResult = 0;
   1146     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1147 
   1148     #ifdef RBBI_DEBUG
   1149         if (fTrace) {
   1150             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1151         }
   1152     #endif
   1153 
   1154     // handlePrevious() never gets the rule status.
   1155     // Flag the status as invalid; if the user ever asks for status, we will need
   1156     // to back up, then re-find the break position using handleNext(), which does
   1157     // get the status value.
   1158     fLastStatusIndexValid = FALSE;
   1159     fLastRuleStatusIndex = 0;
   1160 
   1161     // if we're already at the start of the text, return DONE.
   1162     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1163         return BreakIterator::DONE;
   1164     }
   1165 
   1166     //  Set up the starting char.
   1167     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1168     result          = initialPosition;
   1169     c               = UTEXT_PREVIOUS32(fText);
   1170 
   1171     //  Set the initial state for the state machine
   1172     state = START_STATE;
   1173     row = (RBBIStateTableRow *)
   1174             (statetable->fTableData + (statetable->fRowLen * state));
   1175     category = 3;
   1176     mode     = RBBI_RUN;
   1177     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1178         category = 2;
   1179         mode     = RBBI_START;
   1180     }
   1181 
   1182 
   1183     // loop until we reach the start of the text or transition to state 0
   1184     //
   1185     for (;;) {
   1186         if (c == U_SENTINEL) {
   1187             // Reached end of input string.
   1188             if (mode == RBBI_END ||
   1189                 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
   1190                 // We have already run the loop one last time with the
   1191                 //   character set to the psueudo {eof} value.  Now it is time
   1192                 //   to unconditionally bail out.
   1193                 //  (Or we have an old format binary rule file that does not support {eof}.)
   1194                 if (lookaheadResult < result) {
   1195                     // We ran off the end of the string with a pending look-ahead match.
   1196                     // Treat this as if the look-ahead condition had been met, and return
   1197                     //  the match at the / position from the look-ahead rule.
   1198                     result               = lookaheadResult;
   1199                     lookaheadStatus = 0;
   1200                 } else if (result == initialPosition) {
   1201                     // Ran off start, no match found.
   1202                     // move one index one (towards the start, since we are doing a previous())
   1203                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1204                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1205                 }
   1206                 break;
   1207             }
   1208             // Run the loop one last time with the fake end-of-input character category.
   1209             mode = RBBI_END;
   1210             category = 1;
   1211         }
   1212 
   1213         //
   1214         // Get the char category.  An incoming category of 1 or 2 means that
   1215         //      we are preset for doing the beginning or end of input, and
   1216         //      that we shouldn't get a category from an actual text input character.
   1217         //
   1218         if (mode == RBBI_RUN) {
   1219             // look up the current character's character category, which tells us
   1220             // which column in the state table to look at.
   1221             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1222             //        not the size of the character going in, which is a UChar32.
   1223             //
   1224             UTRIE_GET16(&fData->fTrie, c, category);
   1225 
   1226             // Check the dictionary bit in the character's category.
   1227             //    Counter is only used by dictionary based iterators (subclasses).
   1228             //    Chars that need to be handled by a dictionary have a flag bit set
   1229             //    in their category values.
   1230             //
   1231             if ((category & 0x4000) != 0)  {
   1232                 fDictionaryCharCount++;
   1233                 //  And off the dictionary flag bit.
   1234                 category &= ~0x4000;
   1235             }
   1236         }
   1237 
   1238         #ifdef RBBI_DEBUG
   1239             if (fTrace) {
   1240                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1241                 if (0x20<=c && c<0x7f) {
   1242                     RBBIDebugPrintf("\"%c\"  ", c);
   1243                 } else {
   1244                     RBBIDebugPrintf("%5x  ", c);
   1245                 }
   1246                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1247             }
   1248         #endif
   1249 
   1250         // State Transition - move machine to its next state
   1251         //
   1252         state = row->fNextState[category];
   1253         row = (RBBIStateTableRow *)
   1254             (statetable->fTableData + (statetable->fRowLen * state));
   1255 
   1256         if (row->fAccepting == -1) {
   1257             // Match found, common case.
   1258             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1259         }
   1260 
   1261         if (row->fLookAhead != 0) {
   1262             if (lookaheadStatus != 0
   1263                 && row->fAccepting == lookaheadStatus) {
   1264                 // Lookahead match is completed.
   1265                 result               = lookaheadResult;
   1266                 lookaheadStatus      = 0;
   1267                 // TODO:  make a standalone hard break in a rule work.
   1268                 if (lookAheadHardBreak) {
   1269                     UTEXT_SETNATIVEINDEX(fText, result);
   1270                     return result;
   1271                 }
   1272                 // Look-ahead completed, but other rules may match further.  Continue on
   1273                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1274                 goto continueOn;
   1275             }
   1276 
   1277             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1278             lookaheadResult = r;
   1279             lookaheadStatus = row->fLookAhead;
   1280             goto continueOn;
   1281         }
   1282 
   1283 
   1284         if (row->fAccepting != 0) {
   1285             // Because this is an accepting state, any in-progress look-ahead match
   1286             //   is no longer relavant.  Clear out the pending lookahead status.
   1287             lookaheadStatus = 0;
   1288         }
   1289 
   1290 continueOn:
   1291         if (state == STOP_STATE) {
   1292             // This is the normal exit from the lookup state machine.
   1293             // We have advanced through the string until it is certain that no
   1294             //   longer match is possible, no matter what characters follow.
   1295             break;
   1296         }
   1297 
   1298         // Move (backwards) to the next character to process.
   1299         // If this is a beginning-of-input loop iteration, don't advance
   1300         //    the input position.  The next iteration will be processing the
   1301         //    first real input character.
   1302         if (mode == RBBI_RUN) {
   1303             c = UTEXT_PREVIOUS32(fText);
   1304         } else {
   1305             if (mode == RBBI_START) {
   1306                 mode = RBBI_RUN;
   1307             }
   1308         }
   1309     }
   1310 
   1311     // The state machine is done.  Check whether it found a match...
   1312 
   1313     // If the iterator failed to advance in the match engine, force it ahead by one.
   1314     //   (This really indicates a defect in the break rules.  They should always match
   1315     //    at least one character.)
   1316     if (result == initialPosition) {
   1317         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1318         UTEXT_PREVIOUS32(fText);
   1319         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1320     }
   1321 
   1322     // Leave the iterator at our result position.
   1323     UTEXT_SETNATIVEINDEX(fText, result);
   1324     #ifdef RBBI_DEBUG
   1325         if (fTrace) {
   1326             RBBIDebugPrintf("result = %d\n\n", result);
   1327         }
   1328     #endif
   1329     return result;
   1330 }
   1331 
   1332 
   1333 void
   1334 RuleBasedBreakIterator::reset()
   1335 {
   1336     if (fCachedBreakPositions) {
   1337         uprv_free(fCachedBreakPositions);
   1338     }
   1339     fCachedBreakPositions = NULL;
   1340     fNumCachedBreakPositions = 0;
   1341     fDictionaryCharCount = 0;
   1342     fPositionInCache = 0;
   1343 }
   1344 
   1345 
   1346 
   1347 //-------------------------------------------------------------------------------
   1348 //
   1349 //   getRuleStatus()   Return the break rule tag associated with the current
   1350 //                     iterator position.  If the iterator arrived at its current
   1351 //                     position by iterating forwards, the value will have been
   1352 //                     cached by the handleNext() function.
   1353 //
   1354 //                     If no cached status value is available, the status is
   1355 //                     found by doing a previous() followed by a next(), which
   1356 //                     leaves the iterator where it started, and computes the
   1357 //                     status while doing the next().
   1358 //
   1359 //-------------------------------------------------------------------------------
   1360 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1361     if (fLastStatusIndexValid == FALSE) {
   1362         //  No cached status is available.
   1363         if (fText == NULL || current() == 0) {
   1364             //  At start of text, or there is no text.  Status is always zero.
   1365             fLastRuleStatusIndex = 0;
   1366             fLastStatusIndexValid = TRUE;
   1367         } else {
   1368             //  Not at start of text.  Find status the tedious way.
   1369             int32_t pa = current();
   1370             previous();
   1371             if (fNumCachedBreakPositions > 0) {
   1372                 reset();                // Blow off the dictionary cache
   1373             }
   1374             int32_t pb = next();
   1375             if (pa != pb) {
   1376                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1377                 //       unused local variables on gcc.  Logically, it isn't needed.
   1378                 U_ASSERT(pa == pb);
   1379             }
   1380         }
   1381     }
   1382     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1383 }
   1384 
   1385 
   1386 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1387     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1388     nonConstThis->makeRuleStatusValid();
   1389 
   1390     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1391     //                                                 (the number of status values.)
   1392     //   This function returns the last (largest) of the array of status values.
   1393     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1394     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1395 
   1396     return tagVal;
   1397 }
   1398 
   1399 
   1400 
   1401 
   1402 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1403              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1404 {
   1405     if (U_FAILURE(status)) {
   1406         return 0;
   1407     }
   1408 
   1409     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1410     nonConstThis->makeRuleStatusValid();
   1411     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1412     int32_t  numValsToCopy = numVals;
   1413     if (numVals > capacity) {
   1414         status = U_BUFFER_OVERFLOW_ERROR;
   1415         numValsToCopy = capacity;
   1416     }
   1417     int i;
   1418     for (i=0; i<numValsToCopy; i++) {
   1419         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1420     }
   1421     return numVals;
   1422 }
   1423 
   1424 
   1425 
   1426 //-------------------------------------------------------------------------------
   1427 //
   1428 //   getBinaryRules        Access to the compiled form of the rules,
   1429 //                         for use by build system tools that save the data
   1430 //                         for standard iterator types.
   1431 //
   1432 //-------------------------------------------------------------------------------
   1433 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1434     const uint8_t  *retPtr = NULL;
   1435     length = 0;
   1436 
   1437     if (fData != NULL) {
   1438         retPtr = (const uint8_t *)fData->fHeader;
   1439         length = fData->fHeader->fLength;
   1440     }
   1441     return retPtr;
   1442 }
   1443 
   1444 
   1445 
   1446 
   1447 //-------------------------------------------------------------------------------
   1448 //
   1449 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1450 //                    Saving one heap allocation isn't worth the trouble.
   1451 //                    Cloning shouldn't be done in tight loops, and
   1452 //                    making the clone copy involves other heap operations anyway.
   1453 //                    And the application code for correctly dealing with buffer
   1454 //                    size problems and the eventual object destruction is ugly.
   1455 //
   1456 //-------------------------------------------------------------------------------
   1457 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1458                                    int32_t &bufferSize,
   1459                                    UErrorCode &status)
   1460 {
   1461     if (U_FAILURE(status)){
   1462         return NULL;
   1463     }
   1464 
   1465     //
   1466     //  If user buffer size is zero this is a preflight operation to
   1467     //    obtain the needed buffer size, allowing for worst case misalignment.
   1468     //
   1469     if (bufferSize == 0) {
   1470         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1471         return NULL;
   1472     }
   1473 
   1474 
   1475     //
   1476     //  Check the alignment and size of the user supplied buffer.
   1477     //  Allocate heap memory if the user supplied memory is insufficient.
   1478     //
   1479     char    *buf   = (char *)stackBuffer;
   1480     uint32_t s      = bufferSize;
   1481 
   1482     if (stackBuffer == NULL) {
   1483         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1484     }
   1485     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1486         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1487         s   -= offsetUp;
   1488         buf += offsetUp;
   1489     }
   1490     if (s < sizeof(RuleBasedBreakIterator)) {
   1491         // Not enough room in the caller-supplied buffer.
   1492         // Do a plain-vanilla heap based clone and return that, along with
   1493         //   a warning that the clone was allocated.
   1494         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1495         if (clonedBI == 0) {
   1496             status = U_MEMORY_ALLOCATION_ERROR;
   1497         } else {
   1498             status = U_SAFECLONE_ALLOCATED_WARNING;
   1499         }
   1500         return clonedBI;
   1501     }
   1502 
   1503     //
   1504     //  Clone the source BI into the caller-supplied buffer.
   1505     //    TODO:  using an overloaded operator new to directly initialize the
   1506     //           copy in the user's buffer would be better, but it doesn't seem
   1507     //           to get along with namespaces.  Investigate why.
   1508     //
   1509     //           The memcpy is only safe with an empty (default constructed)
   1510     //           break iterator.  Use on others can screw up reference counts
   1511     //           to data.  memcpy-ing objects is not really a good idea...
   1512     //
   1513     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
   1514     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
   1515     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
   1516     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
   1517     *clone = *this;               // clone = the real BI we want.
   1518     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1519 
   1520     return clone;
   1521 }
   1522 
   1523 
   1524 //-------------------------------------------------------------------------------
   1525 //
   1526 //  isDictionaryChar      Return true if the category lookup for this char
   1527 //                        indicates that it is in the set of dictionary lookup
   1528 //                        chars.
   1529 //
   1530 //                        This function is intended for use by dictionary based
   1531 //                        break iterators.
   1532 //
   1533 //-------------------------------------------------------------------------------
   1534 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1535     if (fData == NULL) {
   1536         return FALSE;
   1537     }
   1538     uint16_t category;
   1539     UTRIE_GET16(&fData->fTrie, c, category);
   1540     return (category & 0x4000) != 0;
   1541 }*/
   1542 
   1543 
   1544 //-------------------------------------------------------------------------------
   1545 //
   1546 //  checkDictionary       This function handles all processing of characters in
   1547 //                        the "dictionary" set. It will determine the appropriate
   1548 //                        course of action, and possibly set up a cache in the
   1549 //                        process.
   1550 //
   1551 //-------------------------------------------------------------------------------
   1552 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1553                             int32_t endPos,
   1554                             UBool reverse) {
   1555     // Reset the old break cache first.
   1556 //    uint32_t dictionaryCount = fDictionaryCharCount;
   1557     reset();
   1558 
   1559     // note: code segment below assumes that dictionary chars are in the
   1560     // startPos-endPos range
   1561     // value returned should be next character in sequence
   1562 //  if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
   1563     if ((endPos - startPos) <= 1) {
   1564         return (reverse ? startPos : endPos);
   1565     }
   1566 
   1567     // Starting from the starting point, scan towards the proposed result,
   1568     // looking for the first dictionary character (which may be the one
   1569     // we're on, if we're starting in the middle of a range).
   1570     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1571     if (reverse) {
   1572         UTEXT_PREVIOUS32(fText);
   1573     }
   1574 
   1575     int32_t rangeStart = startPos;
   1576     int32_t rangeEnd = endPos;
   1577 
   1578     uint16_t    category;
   1579     int32_t     current;
   1580     UErrorCode  status = U_ZERO_ERROR;
   1581     UStack      breaks(status);
   1582     int32_t     foundBreakCount = 0;
   1583     UChar32     c = utext_current32(fText);
   1584 
   1585     UTRIE_GET16(&fData->fTrie, c, category);
   1586 
   1587     // Is the character we're starting on a dictionary character? If so, we
   1588     // need to back up to include the entire run; otherwise the results of
   1589     // the break algorithm will differ depending on where we start. Since
   1590     // the result is cached and there is typically a non-dictionary break
   1591     // within a small number of words, there should be little performance impact.
   1592     if (category & 0x4000) {
   1593         if (reverse) {
   1594             do {
   1595                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1596                 c = utext_current32(fText);
   1597                 UTRIE_GET16(&fData->fTrie, c, category);
   1598             } while (c != U_SENTINEL && (category & 0x4000));
   1599             // Back up to the last dictionary character
   1600             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1601             if (c == U_SENTINEL) {
   1602                 // c = fText->last32();
   1603                 //   TODO:  why was this if needed?
   1604                 c = UTEXT_PREVIOUS32(fText);
   1605             }
   1606             else {
   1607                 c = UTEXT_PREVIOUS32(fText);
   1608             }
   1609         }
   1610         else {
   1611             do {
   1612                 c = UTEXT_PREVIOUS32(fText);
   1613                 UTRIE_GET16(&fData->fTrie, c, category);
   1614             }
   1615             while (c != U_SENTINEL && (category & 0x4000));
   1616             // Back up to the last dictionary character
   1617             if (c == U_SENTINEL) {
   1618                 // c = fText->first32();
   1619                 c = utext_current32(fText);
   1620             }
   1621             else {
   1622                 utext_next32(fText);
   1623                 c = utext_current32(fText);
   1624             }
   1625             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1626         }
   1627         UTRIE_GET16(&fData->fTrie, c, category);
   1628     }
   1629 
   1630     // Loop through the text, looking for ranges of dictionary characters.
   1631     // For each span, find the appropriate break engine, and ask it to find
   1632     // any breaks within the span.
   1633     // Note: we always do this in the forward direction, so that the break
   1634     // cache is built in the right order.
   1635     if (reverse) {
   1636         utext_setNativeIndex(fText, rangeStart);
   1637         c = utext_current32(fText);
   1638         UTRIE_GET16(&fData->fTrie, c, category);
   1639     }
   1640     while(U_SUCCESS(status)) {
   1641         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1642             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1643             c = utext_current32(fText);
   1644             UTRIE_GET16(&fData->fTrie, c, category);
   1645         }
   1646         if (current >= rangeEnd) {
   1647             break;
   1648         }
   1649 
   1650         // We now have a dictionary character. Get the appropriate language object
   1651         // to deal with it.
   1652         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1653 
   1654         // Ask the language object if there are any breaks. It will leave the text
   1655         // pointer on the other side of its range, ready to search for the next one.
   1656         if (lbe != NULL) {
   1657             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1658         }
   1659 
   1660         // Reload the loop variables for the next go-round
   1661         c = utext_current32(fText);
   1662         UTRIE_GET16(&fData->fTrie, c, category);
   1663     }
   1664 
   1665     // If we found breaks, build a new break cache. The first and last entries must
   1666     // be the original starting and ending position.
   1667     if (foundBreakCount > 0) {
   1668         int32_t totalBreaks = foundBreakCount;
   1669         if (startPos < breaks.elementAti(0)) {
   1670             totalBreaks += 1;
   1671         }
   1672         if (endPos > breaks.peeki()) {
   1673             totalBreaks += 1;
   1674         }
   1675         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1676         if (fCachedBreakPositions != NULL) {
   1677             int32_t out = 0;
   1678             fNumCachedBreakPositions = totalBreaks;
   1679             if (startPos < breaks.elementAti(0)) {
   1680                 fCachedBreakPositions[out++] = startPos;
   1681             }
   1682             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1683                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1684             }
   1685             if (endPos > fCachedBreakPositions[out-1]) {
   1686                 fCachedBreakPositions[out] = endPos;
   1687             }
   1688             // If there are breaks, then by definition, we are replacing the original
   1689             // proposed break by one of the breaks we found. Use following() and
   1690             // preceding() to do the work. They should never recurse in this case.
   1691             if (reverse) {
   1692                 return preceding(endPos);
   1693             }
   1694             else {
   1695                 return following(startPos);
   1696             }
   1697         }
   1698         // If the allocation failed, just fall through to the "no breaks found" case.
   1699     }
   1700 
   1701     // If we get here, there were no language-based breaks. Set the text pointer
   1702     // to the original proposed break.
   1703     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1704     return (reverse ? startPos : endPos);
   1705 }
   1706 
   1707 U_NAMESPACE_END
   1708 
   1709 // defined in ucln_cmn.h
   1710 
   1711 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1712 
   1713 /**
   1714  * Release all static memory held by breakiterator.
   1715  */
   1716 U_CDECL_BEGIN
   1717 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1718     if (gLanguageBreakFactories) {
   1719         delete gLanguageBreakFactories;
   1720         gLanguageBreakFactories = NULL;
   1721     }
   1722     return TRUE;
   1723 }
   1724 U_CDECL_END
   1725 
   1726 U_CDECL_BEGIN
   1727 static void U_CALLCONV _deleteFactory(void *obj) {
   1728     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1729 }
   1730 U_CDECL_END
   1731 U_NAMESPACE_BEGIN
   1732 
   1733 static const LanguageBreakEngine*
   1734 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1735 {
   1736     UBool       needsInit;
   1737     UErrorCode  status = U_ZERO_ERROR;
   1738     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1739 
   1740     if (needsInit) {
   1741         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1742         if (factories != NULL && U_SUCCESS(status)) {
   1743             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1744             factories->push(builtIn, status);
   1745 #ifdef U_LOCAL_SERVICE_HOOK
   1746             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1747             if (extra != NULL) {
   1748                 factories->push(extra, status);
   1749             }
   1750 #endif
   1751         }
   1752         umtx_lock(NULL);
   1753         if (gLanguageBreakFactories == NULL) {
   1754             gLanguageBreakFactories = factories;
   1755             factories = NULL;
   1756             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1757         }
   1758         umtx_unlock(NULL);
   1759         delete factories;
   1760     }
   1761 
   1762     if (gLanguageBreakFactories == NULL) {
   1763         return NULL;
   1764     }
   1765 
   1766     int32_t i = gLanguageBreakFactories->size();
   1767     const LanguageBreakEngine *lbe = NULL;
   1768     while (--i >= 0) {
   1769         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1770         lbe = factory->getEngineFor(c, breakType);
   1771         if (lbe != NULL) {
   1772             break;
   1773         }
   1774     }
   1775     return lbe;
   1776 }
   1777 
   1778 
   1779 //-------------------------------------------------------------------------------
   1780 //
   1781 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1782 //                          the characer c.
   1783 //
   1784 //-------------------------------------------------------------------------------
   1785 const LanguageBreakEngine *
   1786 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1787     const LanguageBreakEngine *lbe = NULL;
   1788     UErrorCode status = U_ZERO_ERROR;
   1789 
   1790     if (fLanguageBreakEngines == NULL) {
   1791         fLanguageBreakEngines = new UStack(status);
   1792         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1793             delete fLanguageBreakEngines;
   1794             fLanguageBreakEngines = 0;
   1795             return NULL;
   1796         }
   1797     }
   1798 
   1799     int32_t i = fLanguageBreakEngines->size();
   1800     while (--i >= 0) {
   1801         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1802         if (lbe->handles(c, fBreakType)) {
   1803             return lbe;
   1804         }
   1805     }
   1806 
   1807     // No existing dictionary took the character. See if a factory wants to
   1808     // give us a new LanguageBreakEngine for this character.
   1809     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1810 
   1811     // If we got one, use it and push it on our stack.
   1812     if (lbe != NULL) {
   1813         fLanguageBreakEngines->push((void *)lbe, status);
   1814         // Even if we can't remember it, we can keep looking it up, so
   1815         // return it even if the push fails.
   1816         return lbe;
   1817     }
   1818 
   1819     // No engine is forthcoming for this character. Add it to the
   1820     // reject set. Create the reject break engine if needed.
   1821     if (fUnhandledBreakEngine == NULL) {
   1822         fUnhandledBreakEngine = new UnhandledEngine(status);
   1823         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1824             status = U_MEMORY_ALLOCATION_ERROR;
   1825         }
   1826         // Put it last so that scripts for which we have an engine get tried
   1827         // first.
   1828         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1829         // If we can't insert it, or creation failed, get rid of it
   1830         if (U_FAILURE(status)) {
   1831             delete fUnhandledBreakEngine;
   1832             fUnhandledBreakEngine = 0;
   1833             return NULL;
   1834         }
   1835     }
   1836 
   1837     // Tell the reject engine about the character; at its discretion, it may
   1838     // add more than just the one character.
   1839     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1840 
   1841     return fUnhandledBreakEngine;
   1842 }
   1843 
   1844 
   1845 
   1846 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1847     return fBreakType;
   1848 }*/
   1849 
   1850 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1851     fBreakType = type;
   1852     reset();
   1853 }
   1854 
   1855 U_NAMESPACE_END
   1856 
   1857 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1858