Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2009 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include "unicode/rbbi.h"
     18 #include "unicode/schriter.h"
     19 #include "unicode/uchriter.h"
     20 #include "unicode/udata.h"
     21 #include "unicode/uclean.h"
     22 #include "rbbidata.h"
     23 #include "rbbirb.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "umutex.h"
     27 #include "ucln_cmn.h"
     28 #include "brkeng.h"
     29 
     30 #include "uassert.h"
     31 #include "uvector.h"
     32 
     33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     34 #if U_LOCAL_SERVICE_HOOK
     35 #include "localsvc.h"
     36 #endif
     37 
     38 #ifdef RBBI_DEBUG
     39 static UBool fTrace = FALSE;
     40 #endif
     41 
     42 U_NAMESPACE_BEGIN
     43 
     44 // The state number of the starting state
     45 #define START_STATE 1
     46 
     47 // The state-transition value indicating "stop"
     48 #define STOP_STATE  0
     49 
     50 
     51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     52 
     53 
     54 //=======================================================================
     55 // constructors
     56 //=======================================================================
     57 
     58 /**
     59  * Constructs a RuleBasedBreakIterator that uses the already-created
     60  * tables object that is passed in as a parameter.
     61  */
     62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     63 {
     64     init();
     65     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     66     if (U_FAILURE(status)) {return;}
     67     if(fData == 0) {
     68         status = U_MEMORY_ALLOCATION_ERROR;
     69         return;
     70     }
     71 }
     72 
     73 /**
     74  * Same as above but does not adopt memory
     75  */
     76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     77 {
     78     init();
     79     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     80     if (U_FAILURE(status)) {return;}
     81     if(fData == 0) {
     82         status = U_MEMORY_ALLOCATION_ERROR;
     83         return;
     84     }
     85 }
     86 
     87 //-------------------------------------------------------------------------------
     88 //
     89 //   Constructor   from a UDataMemory handle to precompiled break rules
     90 //                 stored in an ICU data file.
     91 //
     92 //-------------------------------------------------------------------------------
     93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
     94 {
     95     init();
     96     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     97     if (U_FAILURE(status)) {return;}
     98     if(fData == 0) {
     99         status = U_MEMORY_ALLOCATION_ERROR;
    100         return;
    101     }
    102 }
    103 
    104 
    105 
    106 //-------------------------------------------------------------------------------
    107 //
    108 //   Constructor       from a set of rules supplied as a string.
    109 //
    110 //-------------------------------------------------------------------------------
    111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    112                                                 UParseError          &parseError,
    113                                                 UErrorCode           &status)
    114 {
    115     init();
    116     if (U_FAILURE(status)) {return;}
    117     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    118         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    119     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    120     //        creates and returns a complete RBBI.  From here, in a constructor, we
    121     //        can't just return the object created by the builder factory, hence
    122     //        the assignment of the factory created object to "this".
    123     if (U_SUCCESS(status)) {
    124         *this = *bi;
    125         delete bi;
    126     }
    127 }
    128 
    129 
    130 //-------------------------------------------------------------------------------
    131 //
    132 // Default Constructor.      Create an empty shell that can be set up later.
    133 //                           Used when creating a RuleBasedBreakIterator from a set
    134 //                           of rules.
    135 //-------------------------------------------------------------------------------
    136 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    137     init();
    138 }
    139 
    140 
    141 //-------------------------------------------------------------------------------
    142 //
    143 //   Copy constructor.  Will produce a break iterator with the same behavior,
    144 //                      and which iterates over the same text, as the one passed in.
    145 //
    146 //-------------------------------------------------------------------------------
    147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    148 : BreakIterator(other)
    149 {
    150     this->init();
    151     *this = other;
    152 }
    153 
    154 
    155 /**
    156  * Destructor
    157  */
    158 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    159     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    160         // fCharIter was adopted from the outside.
    161         delete fCharIter;
    162     }
    163     fCharIter = NULL;
    164     delete fSCharIter;
    165     fCharIter = NULL;
    166     delete fDCharIter;
    167     fDCharIter = NULL;
    168 
    169     utext_close(fText);
    170 
    171     if (fData != NULL) {
    172         fData->removeReference();
    173         fData = NULL;
    174     }
    175     if (fCachedBreakPositions) {
    176         uprv_free(fCachedBreakPositions);
    177         fCachedBreakPositions = NULL;
    178     }
    179     if (fLanguageBreakEngines) {
    180         delete fLanguageBreakEngines;
    181         fLanguageBreakEngines = NULL;
    182     }
    183     if (fUnhandledBreakEngine) {
    184         delete fUnhandledBreakEngine;
    185         fUnhandledBreakEngine = NULL;
    186     }
    187 }
    188 
    189 /**
    190  * Assignment operator.  Sets this iterator to have the same behavior,
    191  * and iterate over the same text, as the one passed in.
    192  */
    193 RuleBasedBreakIterator&
    194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    195     if (this == &that) {
    196         return *this;
    197     }
    198     reset();    // Delete break cache information
    199     fBreakType = that.fBreakType;
    200     if (fLanguageBreakEngines != NULL) {
    201         delete fLanguageBreakEngines;
    202         fLanguageBreakEngines = NULL;   // Just rebuild for now
    203     }
    204     // TODO: clone fLanguageBreakEngines from "that"
    205     UErrorCode status = U_ZERO_ERROR;
    206     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    207 
    208     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    209         delete fCharIter;
    210     }
    211     fCharIter = NULL;
    212 
    213     if (that.fCharIter != NULL ) {
    214         // This is a little bit tricky - it will intially appear that
    215         //  this->fCharIter is adopted, even if that->fCharIter was
    216         //  not adopted.  That's ok.
    217         fCharIter = that.fCharIter->clone();
    218     }
    219 
    220     if (fData != NULL) {
    221         fData->removeReference();
    222         fData = NULL;
    223     }
    224     if (that.fData != NULL) {
    225         fData = that.fData->addReference();
    226     }
    227 
    228     return *this;
    229 }
    230 
    231 
    232 
    233 //-----------------------------------------------------------------------------
    234 //
    235 //    init()      Shared initialization routine.   Used by all the constructors.
    236 //                Initializes all fields, leaving the object in a consistent state.
    237 //
    238 //-----------------------------------------------------------------------------
    239 void RuleBasedBreakIterator::init() {
    240     UErrorCode  status    = U_ZERO_ERROR;
    241     fBufferClone          = FALSE;
    242     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    243     fCharIter             = NULL;
    244     fSCharIter            = NULL;
    245     fDCharIter            = NULL;
    246     fData                 = NULL;
    247     fLastRuleStatusIndex  = 0;
    248     fLastStatusIndexValid = TRUE;
    249     fDictionaryCharCount  = 0;
    250     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    251                                         //   dictionary behavior for Break Iterators that are
    252                                         //   built from rules.  Even better would be the ability to
    253                                         //   declare the type in the rules.
    254 
    255     fCachedBreakPositions    = NULL;
    256     fLanguageBreakEngines    = NULL;
    257     fUnhandledBreakEngine    = NULL;
    258     fNumCachedBreakPositions = 0;
    259     fPositionInCache         = 0;
    260 
    261 #ifdef RBBI_DEBUG
    262     static UBool debugInitDone = FALSE;
    263     if (debugInitDone == FALSE) {
    264         char *debugEnv = getenv("U_RBBIDEBUG");
    265         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    266             fTrace = TRUE;
    267         }
    268         debugInitDone = TRUE;
    269     }
    270 #endif
    271 }
    272 
    273 
    274 
    275 //-----------------------------------------------------------------------------
    276 //
    277 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    278 //            behavior, and iterating over the same text, as this one.
    279 //            Virtual function: does the right thing with subclasses.
    280 //
    281 //-----------------------------------------------------------------------------
    282 BreakIterator*
    283 RuleBasedBreakIterator::clone(void) const {
    284     return new RuleBasedBreakIterator(*this);
    285 }
    286 
    287 /**
    288  * Equality operator.  Returns TRUE if both BreakIterators are of the
    289  * same class, have the same behavior, and iterate over the same text.
    290  */
    291 UBool
    292 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    293     if (that.getDynamicClassID() != getDynamicClassID()) {
    294         return FALSE;
    295     }
    296 
    297     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    298 
    299     if (!utext_equals(fText, that2.fText)) {
    300         // The two break iterators are operating on different text,
    301         //   or have a different interation position.
    302         return FALSE;
    303     };
    304 
    305     // TODO:  need a check for when in a dictionary region at different offsets.
    306 
    307     if (that2.fData == fData ||
    308         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    309             // The two break iterators are using the same rules.
    310             return TRUE;
    311         }
    312     return FALSE;
    313 }
    314 
    315 /**
    316  * Compute a hash code for this BreakIterator
    317  * @return A hash code
    318  */
    319 int32_t
    320 RuleBasedBreakIterator::hashCode(void) const {
    321     int32_t   hash = 0;
    322     if (fData != NULL) {
    323         hash = fData->hashCode();
    324     }
    325     return hash;
    326 }
    327 
    328 
    329 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    330     if (U_FAILURE(status)) {
    331         return;
    332     }
    333     reset();
    334     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    335 
    336     // Set up a dummy CharacterIterator to be returned if anyone
    337     //   calls getText().  With input from UText, there is no reasonable
    338     //   way to return a characterIterator over the actual input text.
    339     //   Return one over an empty string instead - this is the closest
    340     //   we can come to signaling a failure.
    341     //   (GetText() is obsolete, this failure is sort of OK)
    342     if (fDCharIter == NULL) {
    343         static const UChar c = 0;
    344         fDCharIter = new UCharCharacterIterator(&c, 0);
    345         if (fDCharIter == NULL) {
    346             status = U_MEMORY_ALLOCATION_ERROR;
    347             return;
    348         }
    349     }
    350 
    351     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    352         // existing fCharIter was adopted from the outside.  Delete it now.
    353         delete fCharIter;
    354     }
    355     fCharIter = fDCharIter;
    356 
    357     this->first();
    358 }
    359 
    360 
    361 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    362     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    363     return result;
    364 }
    365 
    366 
    367 
    368 /**
    369  * Returns the description used to create this iterator
    370  */
    371 const UnicodeString&
    372 RuleBasedBreakIterator::getRules() const {
    373     if (fData != NULL) {
    374         return fData->getRuleSourceString();
    375     } else {
    376         static const UnicodeString *s;
    377         if (s == NULL) {
    378             // TODO:  something more elegant here.
    379             //        perhaps API should return the string by value.
    380             //        Note:  thread unsafe init & leak are semi-ok, better than
    381             //               what was before.  Sould be cleaned up, though.
    382             s = new UnicodeString;
    383         }
    384         return *s;
    385     }
    386 }
    387 
    388 //=======================================================================
    389 // BreakIterator overrides
    390 //=======================================================================
    391 
    392 /**
    393  * Return a CharacterIterator over the text being analyzed.
    394  */
    395 CharacterIterator&
    396 RuleBasedBreakIterator::getText() const {
    397     return *fCharIter;
    398 }
    399 
    400 /**
    401  * Set the iterator to analyze a new piece of text.  This function resets
    402  * the current iteration position to the beginning of the text.
    403  * @param newText An iterator over the text to analyze.
    404  */
    405 void
    406 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    407     // If we are holding a CharacterIterator adopted from a
    408     //   previous call to this function, delete it now.
    409     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    410         delete fCharIter;
    411     }
    412 
    413     fCharIter = newText;
    414     UErrorCode status = U_ZERO_ERROR;
    415     reset();
    416     if (newText==NULL || newText->startIndex() != 0) {
    417         // startIndex !=0 wants to be an error, but there's no way to report it.
    418         // Make the iterator text be an empty string.
    419         fText = utext_openUChars(fText, NULL, 0, &status);
    420     } else {
    421         fText = utext_openCharacterIterator(fText, newText, &status);
    422     }
    423     this->first();
    424 }
    425 
    426 /**
    427  * Set the iterator to analyze a new piece of text.  This function resets
    428  * the current iteration position to the beginning of the text.
    429  * @param newText An iterator over the text to analyze.
    430  */
    431 void
    432 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    433     UErrorCode status = U_ZERO_ERROR;
    434     reset();
    435     fText = utext_openConstUnicodeString(fText, &newText, &status);
    436 
    437     // Set up a character iterator on the string.
    438     //   Needed in case someone calls getText().
    439     //  Can not, unfortunately, do this lazily on the (probably never)
    440     //  call to getText(), because getText is const.
    441     if (fSCharIter == NULL) {
    442         fSCharIter = new StringCharacterIterator(newText);
    443     } else {
    444         fSCharIter->setText(newText);
    445     }
    446 
    447     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    448         // old fCharIter was adopted from the outside.  Delete it.
    449         delete fCharIter;
    450     }
    451     fCharIter = fSCharIter;
    452 
    453     this->first();
    454 }
    455 
    456 
    457 
    458 /**
    459  * Sets the current iteration position to the beginning of the text.
    460  * @return The offset of the beginning of the text.
    461  */
    462 int32_t RuleBasedBreakIterator::first(void) {
    463     reset();
    464     fLastRuleStatusIndex  = 0;
    465     fLastStatusIndexValid = TRUE;
    466     //if (fText == NULL)
    467     //    return BreakIterator::DONE;
    468 
    469     utext_setNativeIndex(fText, 0);
    470     return 0;
    471 }
    472 
    473 /**
    474  * Sets the current iteration position to the end of the text.
    475  * @return The text's past-the-end offset.
    476  */
    477 int32_t RuleBasedBreakIterator::last(void) {
    478     reset();
    479     if (fText == NULL) {
    480         fLastRuleStatusIndex  = 0;
    481         fLastStatusIndexValid = TRUE;
    482         return BreakIterator::DONE;
    483     }
    484 
    485     fLastStatusIndexValid = FALSE;
    486     int32_t pos = (int32_t)utext_nativeLength(fText);
    487     utext_setNativeIndex(fText, pos);
    488     return pos;
    489 }
    490 
    491 /**
    492  * Advances the iterator either forward or backward the specified number of steps.
    493  * Negative values move backward, and positive values move forward.  This is
    494  * equivalent to repeatedly calling next() or previous().
    495  * @param n The number of steps to move.  The sign indicates the direction
    496  * (negative is backwards, and positive is forwards).
    497  * @return The character offset of the boundary position n boundaries away from
    498  * the current one.
    499  */
    500 int32_t RuleBasedBreakIterator::next(int32_t n) {
    501     int32_t result = current();
    502     while (n > 0) {
    503         result = next();
    504         --n;
    505     }
    506     while (n < 0) {
    507         result = previous();
    508         ++n;
    509     }
    510     return result;
    511 }
    512 
    513 /**
    514  * Advances the iterator to the next boundary position.
    515  * @return The position of the first boundary after this one.
    516  */
    517 int32_t RuleBasedBreakIterator::next(void) {
    518     // if we have cached break positions and we're still in the range
    519     // covered by them, just move one step forward in the cache
    520     if (fCachedBreakPositions != NULL) {
    521         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    522             ++fPositionInCache;
    523             int32_t pos = fCachedBreakPositions[fPositionInCache];
    524             utext_setNativeIndex(fText, pos);
    525             return pos;
    526         }
    527         else {
    528             reset();
    529         }
    530     }
    531 
    532     int32_t startPos = current();
    533     int32_t result = handleNext(fData->fForwardTable);
    534     if (fDictionaryCharCount > 0) {
    535         result = checkDictionary(startPos, result, FALSE);
    536     }
    537     return result;
    538 }
    539 
    540 /**
    541  * Advances the iterator backwards, to the last boundary preceding this one.
    542  * @return The position of the last boundary position preceding this one.
    543  */
    544 int32_t RuleBasedBreakIterator::previous(void) {
    545     int32_t result;
    546     int32_t startPos;
    547 
    548     // if we have cached break positions and we're still in the range
    549     // covered by them, just move one step backward in the cache
    550     if (fCachedBreakPositions != NULL) {
    551         if (fPositionInCache > 0) {
    552             --fPositionInCache;
    553             // If we're at the beginning of the cache, need to reevaluate the
    554             // rule status
    555             if (fPositionInCache <= 0) {
    556                 fLastStatusIndexValid = FALSE;
    557             }
    558             int32_t pos = fCachedBreakPositions[fPositionInCache];
    559             utext_setNativeIndex(fText, pos);
    560             return pos;
    561         }
    562         else {
    563             reset();
    564         }
    565     }
    566 
    567     // if we're already sitting at the beginning of the text, return DONE
    568     if (fText == NULL || (startPos = current()) == 0) {
    569         fLastRuleStatusIndex  = 0;
    570         fLastStatusIndexValid = TRUE;
    571         return BreakIterator::DONE;
    572     }
    573 
    574     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    575         result = handlePrevious(fData->fReverseTable);
    576         if (fDictionaryCharCount > 0) {
    577             result = checkDictionary(result, startPos, TRUE);
    578         }
    579         return result;
    580     }
    581 
    582     // old rule syntax
    583     // set things up.  handlePrevious() will back us up to some valid
    584     // break position before the current position (we back our internal
    585     // iterator up one step to prevent handlePrevious() from returning
    586     // the current position), but not necessarily the last one before
    587 
    588     // where we started
    589 
    590     int32_t start = current();
    591 
    592     UTEXT_PREVIOUS32(fText);
    593     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    594     if (lastResult == UBRK_DONE) {
    595         lastResult = 0;
    596         utext_setNativeIndex(fText, 0);
    597     }
    598     result = lastResult;
    599     int32_t lastTag       = 0;
    600     UBool   breakTagValid = FALSE;
    601 
    602     // iterate forward from the known break position until we pass our
    603     // starting point.  The last break position before the starting
    604     // point is our return value
    605 
    606     for (;;) {
    607         result         = next();
    608         if (result == BreakIterator::DONE || result >= start) {
    609             break;
    610         }
    611         lastResult     = result;
    612         lastTag        = fLastRuleStatusIndex;
    613         breakTagValid  = TRUE;
    614     }
    615 
    616     // fLastBreakTag wants to have the value for section of text preceding
    617     // the result position that we are to return (in lastResult.)  If
    618     // the backwards rules overshot and the above loop had to do two or more
    619     // next()s to move up to the desired return position, we will have a valid
    620     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    621     // we wont have a tag value for that position, which is only set by handleNext().
    622 
    623     // set the current iteration position to be the last break position
    624     // before where we started, and then return that value
    625     utext_setNativeIndex(fText, lastResult);
    626     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    627     fLastStatusIndexValid = breakTagValid;
    628 
    629     // No need to check the dictionary; it will have been handled by
    630     // next()
    631 
    632     return lastResult;
    633 }
    634 
    635 /**
    636  * Sets the iterator to refer to the first boundary position following
    637  * the specified position.
    638  * @offset The position from which to begin searching for a break position.
    639  * @return The position of the first break after the current position.
    640  */
    641 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    642     // if we have cached break positions and offset is in the range
    643     // covered by them, use them
    644     // TODO: could use binary search
    645     // TODO: what if offset is outside range, but break is not?
    646     if (fCachedBreakPositions != NULL) {
    647         if (offset >= fCachedBreakPositions[0]
    648                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    649             fPositionInCache = 0;
    650             // We are guaranteed not to leave the array due to range test above
    651             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    652                 ++fPositionInCache;
    653             }
    654             int32_t pos = fCachedBreakPositions[fPositionInCache];
    655             utext_setNativeIndex(fText, pos);
    656             return pos;
    657         }
    658         else {
    659             reset();
    660         }
    661     }
    662 
    663     // if the offset passed in is already past the end of the text,
    664     // just return DONE; if it's before the beginning, return the
    665     // text's starting offset
    666     fLastRuleStatusIndex  = 0;
    667     fLastStatusIndexValid = TRUE;
    668     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    669         last();
    670         return next();
    671     }
    672     else if (offset < 0) {
    673         return first();
    674     }
    675 
    676     // otherwise, set our internal iteration position (temporarily)
    677     // to the position passed in.  If this is the _beginning_ position,
    678     // then we can just use next() to get our return value
    679 
    680     int32_t result = 0;
    681 
    682     if (fData->fSafeRevTable != NULL) {
    683         // new rule syntax
    684         utext_setNativeIndex(fText, offset);
    685         // move forward one codepoint to prepare for moving back to a
    686         // safe point.
    687         // this handles offset being between a supplementary character
    688         UTEXT_NEXT32(fText);
    689         // handlePrevious will move most of the time to < 1 boundary away
    690         handlePrevious(fData->fSafeRevTable);
    691         int32_t result = next();
    692         while (result <= offset) {
    693             result = next();
    694         }
    695         return result;
    696     }
    697     if (fData->fSafeFwdTable != NULL) {
    698         // backup plan if forward safe table is not available
    699         utext_setNativeIndex(fText, offset);
    700         UTEXT_PREVIOUS32(fText);
    701         // handle next will give result >= offset
    702         handleNext(fData->fSafeFwdTable);
    703         // previous will give result 0 or 1 boundary away from offset,
    704         // most of the time
    705         // we have to
    706         int32_t oldresult = previous();
    707         while (oldresult > offset) {
    708             int32_t result = previous();
    709             if (result <= offset) {
    710                 return oldresult;
    711             }
    712             oldresult = result;
    713         }
    714         int32_t result = next();
    715         if (result <= offset) {
    716             return next();
    717         }
    718         return result;
    719     }
    720     // otherwise, we have to sync up first.  Use handlePrevious() to back
    721     // up to a known break position before the specified position (if
    722     // we can determine that the specified position is a break position,
    723     // we don't back up at all).  This may or may not be the last break
    724     // position at or before our starting position.  Advance forward
    725     // from here until we've passed the starting position.  The position
    726     // we stop on will be the first break position after the specified one.
    727     // old rule syntax
    728 
    729     utext_setNativeIndex(fText, offset);
    730     if (offset==0 ||
    731         offset==1  && utext_getNativeIndex(fText)==0) {
    732         return next();
    733     }
    734     result = previous();
    735 
    736     while (result != BreakIterator::DONE && result <= offset) {
    737         result = next();
    738     }
    739 
    740     return result;
    741 }
    742 
    743 /**
    744  * Sets the iterator to refer to the last boundary position before the
    745  * specified position.
    746  * @offset The position to begin searching for a break from.
    747  * @return The position of the last boundary before the starting position.
    748  */
    749 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    750     // if we have cached break positions and offset is in the range
    751     // covered by them, use them
    752     if (fCachedBreakPositions != NULL) {
    753         // TODO: binary search?
    754         // TODO: What if offset is outside range, but break is not?
    755         if (offset > fCachedBreakPositions[0]
    756                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    757             fPositionInCache = 0;
    758             while (fPositionInCache < fNumCachedBreakPositions
    759                    && offset > fCachedBreakPositions[fPositionInCache])
    760                 ++fPositionInCache;
    761             --fPositionInCache;
    762             // If we're at the beginning of the cache, need to reevaluate the
    763             // rule status
    764             if (fPositionInCache <= 0) {
    765                 fLastStatusIndexValid = FALSE;
    766             }
    767             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    768             return fCachedBreakPositions[fPositionInCache];
    769         }
    770         else {
    771             reset();
    772         }
    773     }
    774 
    775     // if the offset passed in is already past the end of the text,
    776     // just return DONE; if it's before the beginning, return the
    777     // text's starting offset
    778     if (fText == NULL || offset > utext_nativeLength(fText)) {
    779         // return BreakIterator::DONE;
    780         return last();
    781     }
    782     else if (offset < 0) {
    783         return first();
    784     }
    785 
    786     // if we start by updating the current iteration position to the
    787     // position specified by the caller, we can just use previous()
    788     // to carry out this operation
    789 
    790     if (fData->fSafeFwdTable != NULL) {
    791         // new rule syntax
    792         utext_setNativeIndex(fText, offset);
    793         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    794         if (newOffset != offset) {
    795             // Will come here if specified offset was not a code point boundary AND
    796             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    797             //   indices to the containing code point.
    798             // For breakitereator::preceding only, these non-code-point indices need to be moved
    799             //   up to refer to the following codepoint.
    800             UTEXT_NEXT32(fText);
    801             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    802         }
    803 
    804         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    805         //        rather than adjusting the position unconditionally?
    806         //        (Change would interact with safe rules.)
    807         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    808         //        affects only preceding(), seems cleaner, but is slightly different.
    809         UTEXT_PREVIOUS32(fText);
    810         handleNext(fData->fSafeFwdTable);
    811         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    812         while (result >= offset) {
    813             result = previous();
    814         }
    815         return result;
    816     }
    817     if (fData->fSafeRevTable != NULL) {
    818         // backup plan if forward safe table is not available
    819         //  TODO:  check whether this path can be discarded
    820         //         It's probably OK to say that rules must supply both safe tables
    821         //            if they use safe tables at all.  We have certainly never described
    822         //            to anyone how to work with just one safe table.
    823         utext_setNativeIndex(fText, offset);
    824         UTEXT_NEXT32(fText);
    825 
    826         // handle previous will give result <= offset
    827         handlePrevious(fData->fSafeRevTable);
    828 
    829         // next will give result 0 or 1 boundary away from offset,
    830         // most of the time
    831         // we have to
    832         int32_t oldresult = next();
    833         while (oldresult < offset) {
    834             int32_t result = next();
    835             if (result >= offset) {
    836                 return oldresult;
    837             }
    838             oldresult = result;
    839         }
    840         int32_t result = previous();
    841         if (result >= offset) {
    842             return previous();
    843         }
    844         return result;
    845     }
    846 
    847     // old rule syntax
    848     utext_setNativeIndex(fText, offset);
    849     return previous();
    850 }
    851 
    852 /**
    853  * Returns true if the specfied position is a boundary position.  As a side
    854  * effect, leaves the iterator pointing to the first boundary position at
    855  * or after "offset".
    856  * @param offset the offset to check.
    857  * @return True if "offset" is a boundary position.
    858  */
    859 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    860     // the beginning index of the iterator is always a boundary position by definition
    861     if (offset == 0) {
    862         first();       // For side effects on current position, tag values.
    863         return TRUE;
    864     }
    865 
    866     if (offset == (int32_t)utext_nativeLength(fText)) {
    867         last();       // For side effects on current position, tag values.
    868         return TRUE;
    869     }
    870 
    871     // out-of-range indexes are never boundary positions
    872     if (offset < 0) {
    873         first();       // For side effects on current position, tag values.
    874         return FALSE;
    875     }
    876 
    877     if (offset > utext_nativeLength(fText)) {
    878         last();        // For side effects on current position, tag values.
    879         return FALSE;
    880     }
    881 
    882     // otherwise, we can use following() on the position before the specified
    883     // one and return true if the position we get back is the one the user
    884     // specified
    885     utext_previous32From(fText, offset);
    886     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    887     UBool    result  = following(backOne) == offset;
    888     return result;
    889 }
    890 
    891 /**
    892  * Returns the current iteration position.
    893  * @return The current iteration position.
    894  */
    895 int32_t RuleBasedBreakIterator::current(void) const {
    896     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    897     return pos;
    898 }
    899 
    900 //=======================================================================
    901 // implementation
    902 //=======================================================================
    903 
    904 //
    905 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    906 //                 of user text.  A variable with this enum type keeps track of where we
    907 //                 are.  The state machine only fetches user input while in the RUN mode.
    908 //
    909 enum RBBIRunMode {
    910     RBBI_START,     // state machine processing is before first char of input
    911     RBBI_RUN,       // state machine processing is in the user text
    912     RBBI_END        // state machine processing is after end of user text.
    913 };
    914 
    915 
    916 //-----------------------------------------------------------------------------------
    917 //
    918 //  handleNext(stateTable)
    919 //     This method is the actual implementation of the rbbi next() method.
    920 //     This method initializes the state machine to state 1
    921 //     and advances through the text character by character until we reach the end
    922 //     of the text or the state machine transitions to state 0.  We update our return
    923 //     value every time the state machine passes through an accepting state.
    924 //
    925 //-----------------------------------------------------------------------------------
    926 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    927     int32_t             state;
    928     int16_t             category        = 0;
    929     RBBIRunMode         mode;
    930 
    931     RBBIStateTableRow  *row;
    932     UChar32             c;
    933     int32_t             lookaheadStatus = 0;
    934     int32_t             lookaheadTagIdx = 0;
    935     int32_t             result          = 0;
    936     int32_t             initialPosition = 0;
    937     int32_t             lookaheadResult = 0;
    938     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    939     const char         *tableData       = statetable->fTableData;
    940     uint32_t            tableRowLen     = statetable->fRowLen;
    941 
    942     #ifdef RBBI_DEBUG
    943         if (fTrace) {
    944             RBBIDebugPuts("Handle Next   pos   char  state category");
    945         }
    946     #endif
    947 
    948     // No matter what, handleNext alway correctly sets the break tag value.
    949     fLastStatusIndexValid = TRUE;
    950     fLastRuleStatusIndex = 0;
    951 
    952     // if we're already at the end of the text, return DONE.
    953     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    954     result          = initialPosition;
    955     c               = UTEXT_NEXT32(fText);
    956     if (fData == NULL || c==U_SENTINEL) {
    957         return BreakIterator::DONE;
    958     }
    959 
    960     //  Set the initial state for the state machine
    961     state = START_STATE;
    962     row = (RBBIStateTableRow *)
    963             //(statetable->fTableData + (statetable->fRowLen * state));
    964             (tableData + tableRowLen * state);
    965 
    966 
    967     mode     = RBBI_RUN;
    968     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    969         category = 2;
    970         mode     = RBBI_START;
    971     }
    972 
    973 
    974     // loop until we reach the end of the text or transition to state 0
    975     //
    976     for (;;) {
    977         if (c == U_SENTINEL) {
    978             // Reached end of input string.
    979             if (mode == RBBI_END) {
    980                 // We have already run the loop one last time with the
    981                 //   character set to the psueudo {eof} value.  Now it is time
    982                 //   to unconditionally bail out.
    983                 if (lookaheadResult > result) {
    984                     // We ran off the end of the string with a pending look-ahead match.
    985                     // Treat this as if the look-ahead condition had been met, and return
    986                     //  the match at the / position from the look-ahead rule.
    987                     result               = lookaheadResult;
    988                     fLastRuleStatusIndex = lookaheadTagIdx;
    989                     lookaheadStatus = 0;
    990                 }
    991                 break;
    992             }
    993             // Run the loop one last time with the fake end-of-input character category.
    994             mode = RBBI_END;
    995             category = 1;
    996         }
    997 
    998         //
    999         // Get the char category.  An incoming category of 1 or 2 means that
   1000         //      we are preset for doing the beginning or end of input, and
   1001         //      that we shouldn't get a category from an actual text input character.
   1002         //
   1003         if (mode == RBBI_RUN) {
   1004             // look up the current character's character category, which tells us
   1005             // which column in the state table to look at.
   1006             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1007             //        not the size of the character going in, which is a UChar32.
   1008             //
   1009             UTRIE_GET16(&fData->fTrie, c, category);
   1010 
   1011             // Check the dictionary bit in the character's category.
   1012             //    Counter is only used by dictionary based iterators (subclasses).
   1013             //    Chars that need to be handled by a dictionary have a flag bit set
   1014             //    in their category values.
   1015             //
   1016             if ((category & 0x4000) != 0)  {
   1017                 fDictionaryCharCount++;
   1018                 //  And off the dictionary flag bit.
   1019                 category &= ~0x4000;
   1020             }
   1021         }
   1022 
   1023         #ifdef RBBI_DEBUG
   1024             if (fTrace) {
   1025                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1026                 if (0x20<=c && c<0x7f) {
   1027                     RBBIDebugPrintf("\"%c\"  ", c);
   1028                 } else {
   1029                     RBBIDebugPrintf("%5x  ", c);
   1030                 }
   1031                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1032             }
   1033         #endif
   1034 
   1035         // State Transition - move machine to its next state
   1036         //
   1037         state = row->fNextState[category];
   1038         row = (RBBIStateTableRow *)
   1039             // (statetable->fTableData + (statetable->fRowLen * state));
   1040             (tableData + tableRowLen * state);
   1041 
   1042 
   1043         if (row->fAccepting == -1) {
   1044             // Match found, common case.
   1045             if (mode != RBBI_START) {
   1046                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1047             }
   1048             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1049         }
   1050 
   1051         if (row->fLookAhead != 0) {
   1052             if (lookaheadStatus != 0
   1053                 && row->fAccepting == lookaheadStatus) {
   1054                 // Lookahead match is completed.
   1055                 result               = lookaheadResult;
   1056                 fLastRuleStatusIndex = lookaheadTagIdx;
   1057                 lookaheadStatus      = 0;
   1058                 // TODO:  make a standalone hard break in a rule work.
   1059                 if (lookAheadHardBreak) {
   1060                     UTEXT_SETNATIVEINDEX(fText, result);
   1061                     return result;
   1062                 }
   1063                 // Look-ahead completed, but other rules may match further.  Continue on
   1064                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1065                 goto continueOn;
   1066             }
   1067 
   1068             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1069             lookaheadResult = r;
   1070             lookaheadStatus = row->fLookAhead;
   1071             lookaheadTagIdx = row->fTagIdx;
   1072             goto continueOn;
   1073         }
   1074 
   1075 
   1076         if (row->fAccepting != 0) {
   1077             // Because this is an accepting state, any in-progress look-ahead match
   1078             //   is no longer relavant.  Clear out the pending lookahead status.
   1079             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1080         }
   1081 
   1082 continueOn:
   1083         if (state == STOP_STATE) {
   1084             // This is the normal exit from the lookup state machine.
   1085             // We have advanced through the string until it is certain that no
   1086             //   longer match is possible, no matter what characters follow.
   1087             break;
   1088         }
   1089 
   1090         // Advance to the next character.
   1091         // If this is a beginning-of-input loop iteration, don't advance
   1092         //    the input position.  The next iteration will be processing the
   1093         //    first real input character.
   1094         if (mode == RBBI_RUN) {
   1095             c = UTEXT_NEXT32(fText);
   1096         } else {
   1097             if (mode == RBBI_START) {
   1098                 mode = RBBI_RUN;
   1099             }
   1100         }
   1101 
   1102 
   1103     }
   1104 
   1105     // The state machine is done.  Check whether it found a match...
   1106 
   1107     // If the iterator failed to advance in the match engine, force it ahead by one.
   1108     //   (This really indicates a defect in the break rules.  They should always match
   1109     //    at least one character.)
   1110     if (result == initialPosition) {
   1111         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1112         UTEXT_NEXT32(fText);
   1113         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1114     }
   1115 
   1116     // Leave the iterator at our result position.
   1117     UTEXT_SETNATIVEINDEX(fText, result);
   1118     #ifdef RBBI_DEBUG
   1119         if (fTrace) {
   1120             RBBIDebugPrintf("result = %d\n\n", result);
   1121         }
   1122     #endif
   1123     return result;
   1124 }
   1125 
   1126 
   1127 
   1128 //-----------------------------------------------------------------------------------
   1129 //
   1130 //  handlePrevious()
   1131 //
   1132 //      Iterate backwards, according to the logic of the reverse rules.
   1133 //      This version handles the exact style backwards rules.
   1134 //
   1135 //      The logic of this function is very similar to handleNext(), above.
   1136 //
   1137 //-----------------------------------------------------------------------------------
   1138 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1139     int32_t             state;
   1140     int16_t             category        = 0;
   1141     RBBIRunMode         mode;
   1142     RBBIStateTableRow  *row;
   1143     UChar32             c;
   1144     int32_t             lookaheadStatus = 0;
   1145     int32_t             result          = 0;
   1146     int32_t             initialPosition = 0;
   1147     int32_t             lookaheadResult = 0;
   1148     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1149 
   1150     #ifdef RBBI_DEBUG
   1151         if (fTrace) {
   1152             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1153         }
   1154     #endif
   1155 
   1156     // handlePrevious() never gets the rule status.
   1157     // Flag the status as invalid; if the user ever asks for status, we will need
   1158     // to back up, then re-find the break position using handleNext(), which does
   1159     // get the status value.
   1160     fLastStatusIndexValid = FALSE;
   1161     fLastRuleStatusIndex = 0;
   1162 
   1163     // if we're already at the start of the text, return DONE.
   1164     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1165         return BreakIterator::DONE;
   1166     }
   1167 
   1168     //  Set up the starting char.
   1169     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1170     result          = initialPosition;
   1171     c               = UTEXT_PREVIOUS32(fText);
   1172 
   1173     //  Set the initial state for the state machine
   1174     state = START_STATE;
   1175     row = (RBBIStateTableRow *)
   1176             (statetable->fTableData + (statetable->fRowLen * state));
   1177     category = 3;
   1178     mode     = RBBI_RUN;
   1179     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1180         category = 2;
   1181         mode     = RBBI_START;
   1182     }
   1183 
   1184 
   1185     // loop until we reach the start of the text or transition to state 0
   1186     //
   1187     for (;;) {
   1188         if (c == U_SENTINEL) {
   1189             // Reached end of input string.
   1190             if (mode == RBBI_END ||
   1191                 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
   1192                 // We have already run the loop one last time with the
   1193                 //   character set to the psueudo {eof} value.  Now it is time
   1194                 //   to unconditionally bail out.
   1195                 //  (Or we have an old format binary rule file that does not support {eof}.)
   1196                 if (lookaheadResult < result) {
   1197                     // We ran off the end of the string with a pending look-ahead match.
   1198                     // Treat this as if the look-ahead condition had been met, and return
   1199                     //  the match at the / position from the look-ahead rule.
   1200                     result               = lookaheadResult;
   1201                     lookaheadStatus = 0;
   1202                 } else if (result == initialPosition) {
   1203                     // Ran off start, no match found.
   1204                     // move one index one (towards the start, since we are doing a previous())
   1205                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1206                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1207                 }
   1208                 break;
   1209             }
   1210             // Run the loop one last time with the fake end-of-input character category.
   1211             mode = RBBI_END;
   1212             category = 1;
   1213         }
   1214 
   1215         //
   1216         // Get the char category.  An incoming category of 1 or 2 means that
   1217         //      we are preset for doing the beginning or end of input, and
   1218         //      that we shouldn't get a category from an actual text input character.
   1219         //
   1220         if (mode == RBBI_RUN) {
   1221             // look up the current character's character category, which tells us
   1222             // which column in the state table to look at.
   1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1224             //        not the size of the character going in, which is a UChar32.
   1225             //
   1226             UTRIE_GET16(&fData->fTrie, c, category);
   1227 
   1228             // Check the dictionary bit in the character's category.
   1229             //    Counter is only used by dictionary based iterators (subclasses).
   1230             //    Chars that need to be handled by a dictionary have a flag bit set
   1231             //    in their category values.
   1232             //
   1233             if ((category & 0x4000) != 0)  {
   1234                 fDictionaryCharCount++;
   1235                 //  And off the dictionary flag bit.
   1236                 category &= ~0x4000;
   1237             }
   1238         }
   1239 
   1240         #ifdef RBBI_DEBUG
   1241             if (fTrace) {
   1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1243                 if (0x20<=c && c<0x7f) {
   1244                     RBBIDebugPrintf("\"%c\"  ", c);
   1245                 } else {
   1246                     RBBIDebugPrintf("%5x  ", c);
   1247                 }
   1248                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1249             }
   1250         #endif
   1251 
   1252         // State Transition - move machine to its next state
   1253         //
   1254         state = row->fNextState[category];
   1255         row = (RBBIStateTableRow *)
   1256             (statetable->fTableData + (statetable->fRowLen * state));
   1257 
   1258         if (row->fAccepting == -1) {
   1259             // Match found, common case.
   1260             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1261         }
   1262 
   1263         if (row->fLookAhead != 0) {
   1264             if (lookaheadStatus != 0
   1265                 && row->fAccepting == lookaheadStatus) {
   1266                 // Lookahead match is completed.
   1267                 result               = lookaheadResult;
   1268                 lookaheadStatus      = 0;
   1269                 // TODO:  make a standalone hard break in a rule work.
   1270                 if (lookAheadHardBreak) {
   1271                     UTEXT_SETNATIVEINDEX(fText, result);
   1272                     return result;
   1273                 }
   1274                 // Look-ahead completed, but other rules may match further.  Continue on
   1275                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1276                 goto continueOn;
   1277             }
   1278 
   1279             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1280             lookaheadResult = r;
   1281             lookaheadStatus = row->fLookAhead;
   1282             goto continueOn;
   1283         }
   1284 
   1285 
   1286         if (row->fAccepting != 0) {
   1287             // Because this is an accepting state, any in-progress look-ahead match
   1288             //   is no longer relavant.  Clear out the pending lookahead status.
   1289             lookaheadStatus = 0;
   1290         }
   1291 
   1292 continueOn:
   1293         if (state == STOP_STATE) {
   1294             // This is the normal exit from the lookup state machine.
   1295             // We have advanced through the string until it is certain that no
   1296             //   longer match is possible, no matter what characters follow.
   1297             break;
   1298         }
   1299 
   1300         // Move (backwards) to the next character to process.
   1301         // If this is a beginning-of-input loop iteration, don't advance
   1302         //    the input position.  The next iteration will be processing the
   1303         //    first real input character.
   1304         if (mode == RBBI_RUN) {
   1305             c = UTEXT_PREVIOUS32(fText);
   1306         } else {
   1307             if (mode == RBBI_START) {
   1308                 mode = RBBI_RUN;
   1309             }
   1310         }
   1311     }
   1312 
   1313     // The state machine is done.  Check whether it found a match...
   1314 
   1315     // If the iterator failed to advance in the match engine, force it ahead by one.
   1316     //   (This really indicates a defect in the break rules.  They should always match
   1317     //    at least one character.)
   1318     if (result == initialPosition) {
   1319         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1320         UTEXT_PREVIOUS32(fText);
   1321         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1322     }
   1323 
   1324     // Leave the iterator at our result position.
   1325     UTEXT_SETNATIVEINDEX(fText, result);
   1326     #ifdef RBBI_DEBUG
   1327         if (fTrace) {
   1328             RBBIDebugPrintf("result = %d\n\n", result);
   1329         }
   1330     #endif
   1331     return result;
   1332 }
   1333 
   1334 
   1335 void
   1336 RuleBasedBreakIterator::reset()
   1337 {
   1338     if (fCachedBreakPositions) {
   1339         uprv_free(fCachedBreakPositions);
   1340     }
   1341     fCachedBreakPositions = NULL;
   1342     fNumCachedBreakPositions = 0;
   1343     fDictionaryCharCount = 0;
   1344     fPositionInCache = 0;
   1345 }
   1346 
   1347 
   1348 
   1349 //-------------------------------------------------------------------------------
   1350 //
   1351 //   getRuleStatus()   Return the break rule tag associated with the current
   1352 //                     iterator position.  If the iterator arrived at its current
   1353 //                     position by iterating forwards, the value will have been
   1354 //                     cached by the handleNext() function.
   1355 //
   1356 //                     If no cached status value is available, the status is
   1357 //                     found by doing a previous() followed by a next(), which
   1358 //                     leaves the iterator where it started, and computes the
   1359 //                     status while doing the next().
   1360 //
   1361 //-------------------------------------------------------------------------------
   1362 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1363     if (fLastStatusIndexValid == FALSE) {
   1364         //  No cached status is available.
   1365         if (fText == NULL || current() == 0) {
   1366             //  At start of text, or there is no text.  Status is always zero.
   1367             fLastRuleStatusIndex = 0;
   1368             fLastStatusIndexValid = TRUE;
   1369         } else {
   1370             //  Not at start of text.  Find status the tedious way.
   1371             int32_t pa = current();
   1372             previous();
   1373             if (fNumCachedBreakPositions > 0) {
   1374                 reset();                // Blow off the dictionary cache
   1375             }
   1376             int32_t pb = next();
   1377             if (pa != pb) {
   1378                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1379                 //       unused local variables on gcc.  Logically, it isn't needed.
   1380                 U_ASSERT(pa == pb);
   1381             }
   1382         }
   1383     }
   1384     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1385 }
   1386 
   1387 
   1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1389     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1390     nonConstThis->makeRuleStatusValid();
   1391 
   1392     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1393     //                                                 (the number of status values.)
   1394     //   This function returns the last (largest) of the array of status values.
   1395     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1396     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1397 
   1398     return tagVal;
   1399 }
   1400 
   1401 
   1402 
   1403 
   1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1405              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1406 {
   1407     if (U_FAILURE(status)) {
   1408         return 0;
   1409     }
   1410 
   1411     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1412     nonConstThis->makeRuleStatusValid();
   1413     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1414     int32_t  numValsToCopy = numVals;
   1415     if (numVals > capacity) {
   1416         status = U_BUFFER_OVERFLOW_ERROR;
   1417         numValsToCopy = capacity;
   1418     }
   1419     int i;
   1420     for (i=0; i<numValsToCopy; i++) {
   1421         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1422     }
   1423     return numVals;
   1424 }
   1425 
   1426 
   1427 
   1428 //-------------------------------------------------------------------------------
   1429 //
   1430 //   getBinaryRules        Access to the compiled form of the rules,
   1431 //                         for use by build system tools that save the data
   1432 //                         for standard iterator types.
   1433 //
   1434 //-------------------------------------------------------------------------------
   1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1436     const uint8_t  *retPtr = NULL;
   1437     length = 0;
   1438 
   1439     if (fData != NULL) {
   1440         retPtr = (const uint8_t *)fData->fHeader;
   1441         length = fData->fHeader->fLength;
   1442     }
   1443     return retPtr;
   1444 }
   1445 
   1446 
   1447 
   1448 
   1449 //-------------------------------------------------------------------------------
   1450 //
   1451 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1452 //                    Saving one heap allocation isn't worth the trouble.
   1453 //                    Cloning shouldn't be done in tight loops, and
   1454 //                    making the clone copy involves other heap operations anyway.
   1455 //                    And the application code for correctly dealing with buffer
   1456 //                    size problems and the eventual object destruction is ugly.
   1457 //
   1458 //-------------------------------------------------------------------------------
   1459 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1460                                    int32_t &bufferSize,
   1461                                    UErrorCode &status)
   1462 {
   1463     if (U_FAILURE(status)){
   1464         return NULL;
   1465     }
   1466 
   1467     //
   1468     //  If user buffer size is zero this is a preflight operation to
   1469     //    obtain the needed buffer size, allowing for worst case misalignment.
   1470     //
   1471     if (bufferSize == 0) {
   1472         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1473         return NULL;
   1474     }
   1475 
   1476 
   1477     //
   1478     //  Check the alignment and size of the user supplied buffer.
   1479     //  Allocate heap memory if the user supplied memory is insufficient.
   1480     //
   1481     char    *buf   = (char *)stackBuffer;
   1482     uint32_t s      = bufferSize;
   1483 
   1484     if (stackBuffer == NULL) {
   1485         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1486     }
   1487     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1488         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1489         s   -= offsetUp;
   1490         buf += offsetUp;
   1491     }
   1492     if (s < sizeof(RuleBasedBreakIterator)) {
   1493         // Not enough room in the caller-supplied buffer.
   1494         // Do a plain-vanilla heap based clone and return that, along with
   1495         //   a warning that the clone was allocated.
   1496         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1497         if (clonedBI == 0) {
   1498             status = U_MEMORY_ALLOCATION_ERROR;
   1499         } else {
   1500             status = U_SAFECLONE_ALLOCATED_WARNING;
   1501         }
   1502         return clonedBI;
   1503     }
   1504 
   1505     //
   1506     //  Clone the source BI into the caller-supplied buffer.
   1507     //    TODO:  using an overloaded operator new to directly initialize the
   1508     //           copy in the user's buffer would be better, but it doesn't seem
   1509     //           to get along with namespaces.  Investigate why.
   1510     //
   1511     //           The memcpy is only safe with an empty (default constructed)
   1512     //           break iterator.  Use on others can screw up reference counts
   1513     //           to data.  memcpy-ing objects is not really a good idea...
   1514     //
   1515     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
   1516     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
   1517     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
   1518     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
   1519     *clone = *this;               // clone = the real BI we want.
   1520     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1521 
   1522     return clone;
   1523 }
   1524 
   1525 
   1526 //-------------------------------------------------------------------------------
   1527 //
   1528 //  isDictionaryChar      Return true if the category lookup for this char
   1529 //                        indicates that it is in the set of dictionary lookup
   1530 //                        chars.
   1531 //
   1532 //                        This function is intended for use by dictionary based
   1533 //                        break iterators.
   1534 //
   1535 //-------------------------------------------------------------------------------
   1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1537     if (fData == NULL) {
   1538         return FALSE;
   1539     }
   1540     uint16_t category;
   1541     UTRIE_GET16(&fData->fTrie, c, category);
   1542     return (category & 0x4000) != 0;
   1543 }*/
   1544 
   1545 
   1546 //-------------------------------------------------------------------------------
   1547 //
   1548 //  checkDictionary       This function handles all processing of characters in
   1549 //                        the "dictionary" set. It will determine the appropriate
   1550 //                        course of action, and possibly set up a cache in the
   1551 //                        process.
   1552 //
   1553 //-------------------------------------------------------------------------------
   1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1555                             int32_t endPos,
   1556                             UBool reverse) {
   1557     // Reset the old break cache first.
   1558     uint32_t dictionaryCount = fDictionaryCharCount;
   1559     reset();
   1560 
   1561     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
   1562         return (reverse ? startPos : endPos);
   1563     }
   1564 
   1565     // Starting from the starting point, scan towards the proposed result,
   1566     // looking for the first dictionary character (which may be the one
   1567     // we're on, if we're starting in the middle of a range).
   1568     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1569     if (reverse) {
   1570         UTEXT_PREVIOUS32(fText);
   1571     }
   1572 
   1573     int32_t rangeStart = startPos;
   1574     int32_t rangeEnd = endPos;
   1575 
   1576     uint16_t    category;
   1577     int32_t     current;
   1578     UErrorCode  status = U_ZERO_ERROR;
   1579     UStack      breaks(status);
   1580     int32_t     foundBreakCount = 0;
   1581     UChar32     c = utext_current32(fText);
   1582 
   1583     UTRIE_GET16(&fData->fTrie, c, category);
   1584 
   1585     // Is the character we're starting on a dictionary character? If so, we
   1586     // need to back up to include the entire run; otherwise the results of
   1587     // the break algorithm will differ depending on where we start. Since
   1588     // the result is cached and there is typically a non-dictionary break
   1589     // within a small number of words, there should be little performance impact.
   1590     if (category & 0x4000) {
   1591         if (reverse) {
   1592             do {
   1593                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1594                 c = utext_current32(fText);
   1595                 UTRIE_GET16(&fData->fTrie, c, category);
   1596             } while (c != U_SENTINEL && (category & 0x4000));
   1597             // Back up to the last dictionary character
   1598             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1599             if (c == U_SENTINEL) {
   1600                 // c = fText->last32();
   1601                 //   TODO:  why was this if needed?
   1602                 c = UTEXT_PREVIOUS32(fText);
   1603             }
   1604             else {
   1605                 c = UTEXT_PREVIOUS32(fText);
   1606             }
   1607         }
   1608         else {
   1609             do {
   1610                 c = UTEXT_PREVIOUS32(fText);
   1611                 UTRIE_GET16(&fData->fTrie, c, category);
   1612             }
   1613             while (c != U_SENTINEL && (category & 0x4000));
   1614             // Back up to the last dictionary character
   1615             if (c == U_SENTINEL) {
   1616                 // c = fText->first32();
   1617                 c = utext_current32(fText);
   1618             }
   1619             else {
   1620                 utext_next32(fText);
   1621                 c = utext_current32(fText);
   1622             }
   1623             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1624         }
   1625         UTRIE_GET16(&fData->fTrie, c, category);
   1626     }
   1627 
   1628     // Loop through the text, looking for ranges of dictionary characters.
   1629     // For each span, find the appropriate break engine, and ask it to find
   1630     // any breaks within the span.
   1631     // Note: we always do this in the forward direction, so that the break
   1632     // cache is built in the right order.
   1633     if (reverse) {
   1634         utext_setNativeIndex(fText, rangeStart);
   1635         c = utext_current32(fText);
   1636         UTRIE_GET16(&fData->fTrie, c, category);
   1637     }
   1638     while(U_SUCCESS(status)) {
   1639         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1640             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1641             c = utext_current32(fText);
   1642             UTRIE_GET16(&fData->fTrie, c, category);
   1643         }
   1644         if (current >= rangeEnd) {
   1645             break;
   1646         }
   1647 
   1648         // We now have a dictionary character. Get the appropriate language object
   1649         // to deal with it.
   1650         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1651 
   1652         // Ask the language object if there are any breaks. It will leave the text
   1653         // pointer on the other side of its range, ready to search for the next one.
   1654         if (lbe != NULL) {
   1655             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1656         }
   1657 
   1658         // Reload the loop variables for the next go-round
   1659         c = utext_current32(fText);
   1660         UTRIE_GET16(&fData->fTrie, c, category);
   1661     }
   1662 
   1663     // If we found breaks, build a new break cache. The first and last entries must
   1664     // be the original starting and ending position.
   1665     if (foundBreakCount > 0) {
   1666         int32_t totalBreaks = foundBreakCount;
   1667         if (startPos < breaks.elementAti(0)) {
   1668             totalBreaks += 1;
   1669         }
   1670         if (endPos > breaks.peeki()) {
   1671             totalBreaks += 1;
   1672         }
   1673         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1674         if (fCachedBreakPositions != NULL) {
   1675             int32_t out = 0;
   1676             fNumCachedBreakPositions = totalBreaks;
   1677             if (startPos < breaks.elementAti(0)) {
   1678                 fCachedBreakPositions[out++] = startPos;
   1679             }
   1680             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1681                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1682             }
   1683             if (endPos > fCachedBreakPositions[out-1]) {
   1684                 fCachedBreakPositions[out] = endPos;
   1685             }
   1686             // If there are breaks, then by definition, we are replacing the original
   1687             // proposed break by one of the breaks we found. Use following() and
   1688             // preceding() to do the work. They should never recurse in this case.
   1689             if (reverse) {
   1690                 return preceding(endPos - 1);
   1691             }
   1692             else {
   1693                 return following(startPos);
   1694             }
   1695         }
   1696         // If the allocation failed, just fall through to the "no breaks found" case.
   1697     }
   1698 
   1699     // If we get here, there were no language-based breaks. Set the text pointer
   1700     // to the original proposed break.
   1701     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1702     return (reverse ? startPos : endPos);
   1703 }
   1704 
   1705 U_NAMESPACE_END
   1706 
   1707 // defined in ucln_cmn.h
   1708 
   1709 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1710 
   1711 /**
   1712  * Release all static memory held by breakiterator.
   1713  */
   1714 U_CDECL_BEGIN
   1715 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1716     if (gLanguageBreakFactories) {
   1717         delete gLanguageBreakFactories;
   1718         gLanguageBreakFactories = NULL;
   1719     }
   1720     return TRUE;
   1721 }
   1722 U_CDECL_END
   1723 
   1724 U_CDECL_BEGIN
   1725 static void U_CALLCONV _deleteFactory(void *obj) {
   1726     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1727 }
   1728 U_CDECL_END
   1729 U_NAMESPACE_BEGIN
   1730 
   1731 static const LanguageBreakEngine*
   1732 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1733 {
   1734     UBool       needsInit;
   1735     UErrorCode  status = U_ZERO_ERROR;
   1736     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1737 
   1738     if (needsInit) {
   1739         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1740         if (factories != NULL && U_SUCCESS(status)) {
   1741             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1742             factories->push(builtIn, status);
   1743 #ifdef U_LOCAL_SERVICE_HOOK
   1744             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1745             if (extra != NULL) {
   1746                 factories->push(extra, status);
   1747             }
   1748 #endif
   1749         }
   1750         umtx_lock(NULL);
   1751         if (gLanguageBreakFactories == NULL) {
   1752             gLanguageBreakFactories = factories;
   1753             factories = NULL;
   1754             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1755         }
   1756         umtx_unlock(NULL);
   1757         delete factories;
   1758     }
   1759 
   1760     if (gLanguageBreakFactories == NULL) {
   1761         return NULL;
   1762     }
   1763 
   1764     int32_t i = gLanguageBreakFactories->size();
   1765     const LanguageBreakEngine *lbe = NULL;
   1766     while (--i >= 0) {
   1767         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1768         lbe = factory->getEngineFor(c, breakType);
   1769         if (lbe != NULL) {
   1770             break;
   1771         }
   1772     }
   1773     return lbe;
   1774 }
   1775 
   1776 
   1777 //-------------------------------------------------------------------------------
   1778 //
   1779 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1780 //                          the characer c.
   1781 //
   1782 //-------------------------------------------------------------------------------
   1783 const LanguageBreakEngine *
   1784 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1785     const LanguageBreakEngine *lbe = NULL;
   1786     UErrorCode status = U_ZERO_ERROR;
   1787 
   1788     if (fLanguageBreakEngines == NULL) {
   1789         fLanguageBreakEngines = new UStack(status);
   1790         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1791             delete fLanguageBreakEngines;
   1792             fLanguageBreakEngines = 0;
   1793             return NULL;
   1794         }
   1795     }
   1796 
   1797     int32_t i = fLanguageBreakEngines->size();
   1798     while (--i >= 0) {
   1799         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1800         if (lbe->handles(c, fBreakType)) {
   1801             return lbe;
   1802         }
   1803     }
   1804 
   1805     // No existing dictionary took the character. See if a factory wants to
   1806     // give us a new LanguageBreakEngine for this character.
   1807     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1808 
   1809     // If we got one, use it and push it on our stack.
   1810     if (lbe != NULL) {
   1811         fLanguageBreakEngines->push((void *)lbe, status);
   1812         // Even if we can't remember it, we can keep looking it up, so
   1813         // return it even if the push fails.
   1814         return lbe;
   1815     }
   1816 
   1817     // No engine is forthcoming for this character. Add it to the
   1818     // reject set. Create the reject break engine if needed.
   1819     if (fUnhandledBreakEngine == NULL) {
   1820         fUnhandledBreakEngine = new UnhandledEngine(status);
   1821         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1822             status = U_MEMORY_ALLOCATION_ERROR;
   1823         }
   1824         // Put it last so that scripts for which we have an engine get tried
   1825         // first.
   1826         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1827         // If we can't insert it, or creation failed, get rid of it
   1828         if (U_FAILURE(status)) {
   1829             delete fUnhandledBreakEngine;
   1830             fUnhandledBreakEngine = 0;
   1831             return NULL;
   1832         }
   1833     }
   1834 
   1835     // Tell the reject engine about the character; at its discretion, it may
   1836     // add more than just the one character.
   1837     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1838 
   1839     return fUnhandledBreakEngine;
   1840 }
   1841 
   1842 
   1843 
   1844 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1845     return fBreakType;
   1846 }*/
   1847 
   1848 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1849     fBreakType = type;
   1850     reset();
   1851 }
   1852 
   1853 U_NAMESPACE_END
   1854 
   1855 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1856