Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2012 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include "utypeinfo.h"  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 
     90 //
     91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
     92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
     93 //
     94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
     95                        uint32_t       ruleLength,
     96                        UErrorCode     &status) {
     97     init();
     98     if (U_FAILURE(status)) {
     99         return;
    100     }
    101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
    102         status = U_ILLEGAL_ARGUMENT_ERROR;
    103         return;
    104     }
    105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
    106     if (data->fLength > ruleLength) {
    107         status = U_ILLEGAL_ARGUMENT_ERROR;
    108         return;
    109     }
    110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
    111     if (U_FAILURE(status)) {return;}
    112     if(fData == 0) {
    113         status = U_MEMORY_ALLOCATION_ERROR;
    114         return;
    115     }
    116 }
    117 
    118 
    119 //-------------------------------------------------------------------------------
    120 //
    121 //   Constructor   from a UDataMemory handle to precompiled break rules
    122 //                 stored in an ICU data file.
    123 //
    124 //-------------------------------------------------------------------------------
    125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
    126 {
    127     init();
    128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
    129     if (U_FAILURE(status)) {return;}
    130     if(fData == 0) {
    131         status = U_MEMORY_ALLOCATION_ERROR;
    132         return;
    133     }
    134 }
    135 
    136 
    137 
    138 //-------------------------------------------------------------------------------
    139 //
    140 //   Constructor       from a set of rules supplied as a string.
    141 //
    142 //-------------------------------------------------------------------------------
    143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    144                                                 UParseError          &parseError,
    145                                                 UErrorCode           &status)
    146 {
    147     init();
    148     if (U_FAILURE(status)) {return;}
    149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    152     //        creates and returns a complete RBBI.  From here, in a constructor, we
    153     //        can't just return the object created by the builder factory, hence
    154     //        the assignment of the factory created object to "this".
    155     if (U_SUCCESS(status)) {
    156         *this = *bi;
    157         delete bi;
    158     }
    159 }
    160 
    161 
    162 //-------------------------------------------------------------------------------
    163 //
    164 // Default Constructor.      Create an empty shell that can be set up later.
    165 //                           Used when creating a RuleBasedBreakIterator from a set
    166 //                           of rules.
    167 //-------------------------------------------------------------------------------
    168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    169     init();
    170 }
    171 
    172 
    173 //-------------------------------------------------------------------------------
    174 //
    175 //   Copy constructor.  Will produce a break iterator with the same behavior,
    176 //                      and which iterates over the same text, as the one passed in.
    177 //
    178 //-------------------------------------------------------------------------------
    179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    180 : BreakIterator(other)
    181 {
    182     this->init();
    183     *this = other;
    184 }
    185 
    186 
    187 /**
    188  * Destructor
    189  */
    190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    192         // fCharIter was adopted from the outside.
    193         delete fCharIter;
    194     }
    195     fCharIter = NULL;
    196     delete fSCharIter;
    197     fCharIter = NULL;
    198     delete fDCharIter;
    199     fDCharIter = NULL;
    200 
    201     utext_close(fText);
    202 
    203     if (fData != NULL) {
    204         fData->removeReference();
    205         fData = NULL;
    206     }
    207     if (fCachedBreakPositions) {
    208         uprv_free(fCachedBreakPositions);
    209         fCachedBreakPositions = NULL;
    210     }
    211     if (fLanguageBreakEngines) {
    212         delete fLanguageBreakEngines;
    213         fLanguageBreakEngines = NULL;
    214     }
    215     if (fUnhandledBreakEngine) {
    216         delete fUnhandledBreakEngine;
    217         fUnhandledBreakEngine = NULL;
    218     }
    219 }
    220 
    221 /**
    222  * Assignment operator.  Sets this iterator to have the same behavior,
    223  * and iterate over the same text, as the one passed in.
    224  */
    225 RuleBasedBreakIterator&
    226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    227     if (this == &that) {
    228         return *this;
    229     }
    230     reset();    // Delete break cache information
    231     fBreakType = that.fBreakType;
    232     if (fLanguageBreakEngines != NULL) {
    233         delete fLanguageBreakEngines;
    234         fLanguageBreakEngines = NULL;   // Just rebuild for now
    235     }
    236     // TODO: clone fLanguageBreakEngines from "that"
    237     UErrorCode status = U_ZERO_ERROR;
    238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    239 
    240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    241         delete fCharIter;
    242     }
    243     fCharIter = NULL;
    244 
    245     if (that.fCharIter != NULL ) {
    246         // This is a little bit tricky - it will intially appear that
    247         //  this->fCharIter is adopted, even if that->fCharIter was
    248         //  not adopted.  That's ok.
    249         fCharIter = that.fCharIter->clone();
    250     }
    251 
    252     if (fData != NULL) {
    253         fData->removeReference();
    254         fData = NULL;
    255     }
    256     if (that.fData != NULL) {
    257         fData = that.fData->addReference();
    258     }
    259 
    260     return *this;
    261 }
    262 
    263 
    264 
    265 //-----------------------------------------------------------------------------
    266 //
    267 //    init()      Shared initialization routine.   Used by all the constructors.
    268 //                Initializes all fields, leaving the object in a consistent state.
    269 //
    270 //-----------------------------------------------------------------------------
    271 void RuleBasedBreakIterator::init() {
    272     UErrorCode  status    = U_ZERO_ERROR;
    273     fBufferClone          = FALSE;
    274     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    275     fCharIter             = NULL;
    276     fSCharIter            = NULL;
    277     fDCharIter            = NULL;
    278     fData                 = NULL;
    279     fLastRuleStatusIndex  = 0;
    280     fLastStatusIndexValid = TRUE;
    281     fDictionaryCharCount  = 0;
    282     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    283                                         //   dictionary behavior for Break Iterators that are
    284                                         //   built from rules.  Even better would be the ability to
    285                                         //   declare the type in the rules.
    286 
    287     fCachedBreakPositions    = NULL;
    288     fLanguageBreakEngines    = NULL;
    289     fUnhandledBreakEngine    = NULL;
    290     fNumCachedBreakPositions = 0;
    291     fPositionInCache         = 0;
    292 
    293 #ifdef RBBI_DEBUG
    294     static UBool debugInitDone = FALSE;
    295     if (debugInitDone == FALSE) {
    296         char *debugEnv = getenv("U_RBBIDEBUG");
    297         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    298             fTrace = TRUE;
    299         }
    300         debugInitDone = TRUE;
    301     }
    302 #endif
    303 }
    304 
    305 
    306 
    307 //-----------------------------------------------------------------------------
    308 //
    309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    310 //            behavior, and iterating over the same text, as this one.
    311 //            Virtual function: does the right thing with subclasses.
    312 //
    313 //-----------------------------------------------------------------------------
    314 BreakIterator*
    315 RuleBasedBreakIterator::clone(void) const {
    316     return new RuleBasedBreakIterator(*this);
    317 }
    318 
    319 /**
    320  * Equality operator.  Returns TRUE if both BreakIterators are of the
    321  * same class, have the same behavior, and iterate over the same text.
    322  */
    323 UBool
    324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    325     if (typeid(*this) != typeid(that)) {
    326         return FALSE;
    327     }
    328 
    329     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    330 
    331     if (!utext_equals(fText, that2.fText)) {
    332         // The two break iterators are operating on different text,
    333         //   or have a different interation position.
    334         return FALSE;
    335     };
    336 
    337     // TODO:  need a check for when in a dictionary region at different offsets.
    338 
    339     if (that2.fData == fData ||
    340         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    341             // The two break iterators are using the same rules.
    342             return TRUE;
    343         }
    344     return FALSE;
    345 }
    346 
    347 /**
    348  * Compute a hash code for this BreakIterator
    349  * @return A hash code
    350  */
    351 int32_t
    352 RuleBasedBreakIterator::hashCode(void) const {
    353     int32_t   hash = 0;
    354     if (fData != NULL) {
    355         hash = fData->hashCode();
    356     }
    357     return hash;
    358 }
    359 
    360 
    361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    362     if (U_FAILURE(status)) {
    363         return;
    364     }
    365     reset();
    366     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    367 
    368     // Set up a dummy CharacterIterator to be returned if anyone
    369     //   calls getText().  With input from UText, there is no reasonable
    370     //   way to return a characterIterator over the actual input text.
    371     //   Return one over an empty string instead - this is the closest
    372     //   we can come to signaling a failure.
    373     //   (GetText() is obsolete, this failure is sort of OK)
    374     if (fDCharIter == NULL) {
    375         static const UChar c = 0;
    376         fDCharIter = new UCharCharacterIterator(&c, 0);
    377         if (fDCharIter == NULL) {
    378             status = U_MEMORY_ALLOCATION_ERROR;
    379             return;
    380         }
    381     }
    382 
    383     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    384         // existing fCharIter was adopted from the outside.  Delete it now.
    385         delete fCharIter;
    386     }
    387     fCharIter = fDCharIter;
    388 
    389     this->first();
    390 }
    391 
    392 
    393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    394     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    395     return result;
    396 }
    397 
    398 
    399 
    400 /**
    401  * Returns the description used to create this iterator
    402  */
    403 const UnicodeString&
    404 RuleBasedBreakIterator::getRules() const {
    405     if (fData != NULL) {
    406         return fData->getRuleSourceString();
    407     } else {
    408         static const UnicodeString *s;
    409         if (s == NULL) {
    410             // TODO:  something more elegant here.
    411             //        perhaps API should return the string by value.
    412             //        Note:  thread unsafe init & leak are semi-ok, better than
    413             //               what was before.  Sould be cleaned up, though.
    414             s = new UnicodeString;
    415         }
    416         return *s;
    417     }
    418 }
    419 
    420 //=======================================================================
    421 // BreakIterator overrides
    422 //=======================================================================
    423 
    424 /**
    425  * Return a CharacterIterator over the text being analyzed.
    426  */
    427 CharacterIterator&
    428 RuleBasedBreakIterator::getText() const {
    429     return *fCharIter;
    430 }
    431 
    432 /**
    433  * Set the iterator to analyze a new piece of text.  This function resets
    434  * the current iteration position to the beginning of the text.
    435  * @param newText An iterator over the text to analyze.
    436  */
    437 void
    438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    439     // If we are holding a CharacterIterator adopted from a
    440     //   previous call to this function, delete it now.
    441     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    442         delete fCharIter;
    443     }
    444 
    445     fCharIter = newText;
    446     UErrorCode status = U_ZERO_ERROR;
    447     reset();
    448     if (newText==NULL || newText->startIndex() != 0) {
    449         // startIndex !=0 wants to be an error, but there's no way to report it.
    450         // Make the iterator text be an empty string.
    451         fText = utext_openUChars(fText, NULL, 0, &status);
    452     } else {
    453         fText = utext_openCharacterIterator(fText, newText, &status);
    454     }
    455     this->first();
    456 }
    457 
    458 /**
    459  * Set the iterator to analyze a new piece of text.  This function resets
    460  * the current iteration position to the beginning of the text.
    461  * @param newText An iterator over the text to analyze.
    462  */
    463 void
    464 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    465     UErrorCode status = U_ZERO_ERROR;
    466     reset();
    467     fText = utext_openConstUnicodeString(fText, &newText, &status);
    468 
    469     // Set up a character iterator on the string.
    470     //   Needed in case someone calls getText().
    471     //  Can not, unfortunately, do this lazily on the (probably never)
    472     //  call to getText(), because getText is const.
    473     if (fSCharIter == NULL) {
    474         fSCharIter = new StringCharacterIterator(newText);
    475     } else {
    476         fSCharIter->setText(newText);
    477     }
    478 
    479     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    480         // old fCharIter was adopted from the outside.  Delete it.
    481         delete fCharIter;
    482     }
    483     fCharIter = fSCharIter;
    484 
    485     this->first();
    486 }
    487 
    488 
    489 /**
    490  *  Provide a new UText for the input text.  Must reference text with contents identical
    491  *  to the original.
    492  *  Intended for use with text data originating in Java (garbage collected) environments
    493  *  where the data may be moved in memory at arbitrary times.
    494  */
    495 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    496     if (U_FAILURE(status)) {
    497         return *this;
    498     }
    499     if (input == NULL) {
    500         status = U_ILLEGAL_ARGUMENT_ERROR;
    501         return *this;
    502     }
    503     int64_t pos = utext_getNativeIndex(fText);
    504     //  Shallow read-only clone of the new UText into the existing input UText
    505     fText = utext_clone(fText, input, FALSE, TRUE, &status);
    506     if (U_FAILURE(status)) {
    507         return *this;
    508     }
    509     utext_setNativeIndex(fText, pos);
    510     if (utext_getNativeIndex(fText) != pos) {
    511         // Sanity check.  The new input utext is supposed to have the exact same
    512         // contents as the old.  If we can't set to the same position, it doesn't.
    513         // The contents underlying the old utext might be invalid at this point,
    514         // so it's not safe to check directly.
    515         status = U_ILLEGAL_ARGUMENT_ERROR;
    516     }
    517     return *this;
    518 }
    519 
    520 
    521 /**
    522  * Sets the current iteration position to the beginning of the text.
    523  * @return The offset of the beginning of the text.
    524  */
    525 int32_t RuleBasedBreakIterator::first(void) {
    526     reset();
    527     fLastRuleStatusIndex  = 0;
    528     fLastStatusIndexValid = TRUE;
    529     //if (fText == NULL)
    530     //    return BreakIterator::DONE;
    531 
    532     utext_setNativeIndex(fText, 0);
    533     return 0;
    534 }
    535 
    536 /**
    537  * Sets the current iteration position to the end of the text.
    538  * @return The text's past-the-end offset.
    539  */
    540 int32_t RuleBasedBreakIterator::last(void) {
    541     reset();
    542     if (fText == NULL) {
    543         fLastRuleStatusIndex  = 0;
    544         fLastStatusIndexValid = TRUE;
    545         return BreakIterator::DONE;
    546     }
    547 
    548     fLastStatusIndexValid = FALSE;
    549     int32_t pos = (int32_t)utext_nativeLength(fText);
    550     utext_setNativeIndex(fText, pos);
    551     return pos;
    552 }
    553 
    554 /**
    555  * Advances the iterator either forward or backward the specified number of steps.
    556  * Negative values move backward, and positive values move forward.  This is
    557  * equivalent to repeatedly calling next() or previous().
    558  * @param n The number of steps to move.  The sign indicates the direction
    559  * (negative is backwards, and positive is forwards).
    560  * @return The character offset of the boundary position n boundaries away from
    561  * the current one.
    562  */
    563 int32_t RuleBasedBreakIterator::next(int32_t n) {
    564     int32_t result = current();
    565     while (n > 0) {
    566         result = next();
    567         --n;
    568     }
    569     while (n < 0) {
    570         result = previous();
    571         ++n;
    572     }
    573     return result;
    574 }
    575 
    576 /**
    577  * Advances the iterator to the next boundary position.
    578  * @return The position of the first boundary after this one.
    579  */
    580 int32_t RuleBasedBreakIterator::next(void) {
    581     // if we have cached break positions and we're still in the range
    582     // covered by them, just move one step forward in the cache
    583     if (fCachedBreakPositions != NULL) {
    584         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    585             ++fPositionInCache;
    586             int32_t pos = fCachedBreakPositions[fPositionInCache];
    587             utext_setNativeIndex(fText, pos);
    588             return pos;
    589         }
    590         else {
    591             reset();
    592         }
    593     }
    594 
    595     int32_t startPos = current();
    596     int32_t result = handleNext(fData->fForwardTable);
    597     if (fDictionaryCharCount > 0) {
    598         result = checkDictionary(startPos, result, FALSE);
    599     }
    600     return result;
    601 }
    602 
    603 /**
    604  * Advances the iterator backwards, to the last boundary preceding this one.
    605  * @return The position of the last boundary position preceding this one.
    606  */
    607 int32_t RuleBasedBreakIterator::previous(void) {
    608     int32_t result;
    609     int32_t startPos;
    610 
    611     // if we have cached break positions and we're still in the range
    612     // covered by them, just move one step backward in the cache
    613     if (fCachedBreakPositions != NULL) {
    614         if (fPositionInCache > 0) {
    615             --fPositionInCache;
    616             // If we're at the beginning of the cache, need to reevaluate the
    617             // rule status
    618             if (fPositionInCache <= 0) {
    619                 fLastStatusIndexValid = FALSE;
    620             }
    621             int32_t pos = fCachedBreakPositions[fPositionInCache];
    622             utext_setNativeIndex(fText, pos);
    623             return pos;
    624         }
    625         else {
    626             reset();
    627         }
    628     }
    629 
    630     // if we're already sitting at the beginning of the text, return DONE
    631     if (fText == NULL || (startPos = current()) == 0) {
    632         fLastRuleStatusIndex  = 0;
    633         fLastStatusIndexValid = TRUE;
    634         return BreakIterator::DONE;
    635     }
    636 
    637     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    638         result = handlePrevious(fData->fReverseTable);
    639         if (fDictionaryCharCount > 0) {
    640             result = checkDictionary(result, startPos, TRUE);
    641         }
    642         return result;
    643     }
    644 
    645     // old rule syntax
    646     // set things up.  handlePrevious() will back us up to some valid
    647     // break position before the current position (we back our internal
    648     // iterator up one step to prevent handlePrevious() from returning
    649     // the current position), but not necessarily the last one before
    650 
    651     // where we started
    652 
    653     int32_t start = current();
    654 
    655     (void)UTEXT_PREVIOUS32(fText);
    656     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    657     if (lastResult == UBRK_DONE) {
    658         lastResult = 0;
    659         utext_setNativeIndex(fText, 0);
    660     }
    661     result = lastResult;
    662     int32_t lastTag       = 0;
    663     UBool   breakTagValid = FALSE;
    664 
    665     // iterate forward from the known break position until we pass our
    666     // starting point.  The last break position before the starting
    667     // point is our return value
    668 
    669     for (;;) {
    670         result         = next();
    671         if (result == BreakIterator::DONE || result >= start) {
    672             break;
    673         }
    674         lastResult     = result;
    675         lastTag        = fLastRuleStatusIndex;
    676         breakTagValid  = TRUE;
    677     }
    678 
    679     // fLastBreakTag wants to have the value for section of text preceding
    680     // the result position that we are to return (in lastResult.)  If
    681     // the backwards rules overshot and the above loop had to do two or more
    682     // next()s to move up to the desired return position, we will have a valid
    683     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    684     // we wont have a tag value for that position, which is only set by handleNext().
    685 
    686     // set the current iteration position to be the last break position
    687     // before where we started, and then return that value
    688     utext_setNativeIndex(fText, lastResult);
    689     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    690     fLastStatusIndexValid = breakTagValid;
    691 
    692     // No need to check the dictionary; it will have been handled by
    693     // next()
    694 
    695     return lastResult;
    696 }
    697 
    698 /**
    699  * Sets the iterator to refer to the first boundary position following
    700  * the specified position.
    701  * @offset The position from which to begin searching for a break position.
    702  * @return The position of the first break after the current position.
    703  */
    704 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    705     // if we have cached break positions and offset is in the range
    706     // covered by them, use them
    707     // TODO: could use binary search
    708     // TODO: what if offset is outside range, but break is not?
    709     if (fCachedBreakPositions != NULL) {
    710         if (offset >= fCachedBreakPositions[0]
    711                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    712             fPositionInCache = 0;
    713             // We are guaranteed not to leave the array due to range test above
    714             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    715                 ++fPositionInCache;
    716             }
    717             int32_t pos = fCachedBreakPositions[fPositionInCache];
    718             utext_setNativeIndex(fText, pos);
    719             return pos;
    720         }
    721         else {
    722             reset();
    723         }
    724     }
    725 
    726     // if the offset passed in is already past the end of the text,
    727     // just return DONE; if it's before the beginning, return the
    728     // text's starting offset
    729     fLastRuleStatusIndex  = 0;
    730     fLastStatusIndexValid = TRUE;
    731     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    732         last();
    733         return next();
    734     }
    735     else if (offset < 0) {
    736         return first();
    737     }
    738 
    739     // otherwise, set our internal iteration position (temporarily)
    740     // to the position passed in.  If this is the _beginning_ position,
    741     // then we can just use next() to get our return value
    742 
    743     int32_t result = 0;
    744 
    745     if (fData->fSafeRevTable != NULL) {
    746         // new rule syntax
    747         utext_setNativeIndex(fText, offset);
    748         // move forward one codepoint to prepare for moving back to a
    749         // safe point.
    750         // this handles offset being between a supplementary character
    751         (void)UTEXT_NEXT32(fText);
    752         // handlePrevious will move most of the time to < 1 boundary away
    753         handlePrevious(fData->fSafeRevTable);
    754         int32_t result = next();
    755         while (result <= offset) {
    756             result = next();
    757         }
    758         return result;
    759     }
    760     if (fData->fSafeFwdTable != NULL) {
    761         // backup plan if forward safe table is not available
    762         utext_setNativeIndex(fText, offset);
    763         (void)UTEXT_PREVIOUS32(fText);
    764         // handle next will give result >= offset
    765         handleNext(fData->fSafeFwdTable);
    766         // previous will give result 0 or 1 boundary away from offset,
    767         // most of the time
    768         // we have to
    769         int32_t oldresult = previous();
    770         while (oldresult > offset) {
    771             int32_t result = previous();
    772             if (result <= offset) {
    773                 return oldresult;
    774             }
    775             oldresult = result;
    776         }
    777         int32_t result = next();
    778         if (result <= offset) {
    779             return next();
    780         }
    781         return result;
    782     }
    783     // otherwise, we have to sync up first.  Use handlePrevious() to back
    784     // up to a known break position before the specified position (if
    785     // we can determine that the specified position is a break position,
    786     // we don't back up at all).  This may or may not be the last break
    787     // position at or before our starting position.  Advance forward
    788     // from here until we've passed the starting position.  The position
    789     // we stop on will be the first break position after the specified one.
    790     // old rule syntax
    791 
    792     utext_setNativeIndex(fText, offset);
    793     if (offset==0 ||
    794         (offset==1  && utext_getNativeIndex(fText)==0)) {
    795         return next();
    796     }
    797     result = previous();
    798 
    799     while (result != BreakIterator::DONE && result <= offset) {
    800         result = next();
    801     }
    802 
    803     return result;
    804 }
    805 
    806 /**
    807  * Sets the iterator to refer to the last boundary position before the
    808  * specified position.
    809  * @offset The position to begin searching for a break from.
    810  * @return The position of the last boundary before the starting position.
    811  */
    812 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    813     // if we have cached break positions and offset is in the range
    814     // covered by them, use them
    815     if (fCachedBreakPositions != NULL) {
    816         // TODO: binary search?
    817         // TODO: What if offset is outside range, but break is not?
    818         if (offset > fCachedBreakPositions[0]
    819                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    820             fPositionInCache = 0;
    821             while (fPositionInCache < fNumCachedBreakPositions
    822                    && offset > fCachedBreakPositions[fPositionInCache])
    823                 ++fPositionInCache;
    824             --fPositionInCache;
    825             // If we're at the beginning of the cache, need to reevaluate the
    826             // rule status
    827             if (fPositionInCache <= 0) {
    828                 fLastStatusIndexValid = FALSE;
    829             }
    830             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    831             return fCachedBreakPositions[fPositionInCache];
    832         }
    833         else {
    834             reset();
    835         }
    836     }
    837 
    838     // if the offset passed in is already past the end of the text,
    839     // just return DONE; if it's before the beginning, return the
    840     // text's starting offset
    841     if (fText == NULL || offset > utext_nativeLength(fText)) {
    842         // return BreakIterator::DONE;
    843         return last();
    844     }
    845     else if (offset < 0) {
    846         return first();
    847     }
    848 
    849     // if we start by updating the current iteration position to the
    850     // position specified by the caller, we can just use previous()
    851     // to carry out this operation
    852 
    853     if (fData->fSafeFwdTable != NULL) {
    854         // new rule syntax
    855         utext_setNativeIndex(fText, offset);
    856         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    857         if (newOffset != offset) {
    858             // Will come here if specified offset was not a code point boundary AND
    859             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    860             //   indices to the containing code point.
    861             // For breakitereator::preceding only, these non-code-point indices need to be moved
    862             //   up to refer to the following codepoint.
    863             (void)UTEXT_NEXT32(fText);
    864             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    865         }
    866 
    867         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    868         //        rather than adjusting the position unconditionally?
    869         //        (Change would interact with safe rules.)
    870         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    871         //        affects only preceding(), seems cleaner, but is slightly different.
    872         (void)UTEXT_PREVIOUS32(fText);
    873         handleNext(fData->fSafeFwdTable);
    874         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    875         while (result >= offset) {
    876             result = previous();
    877         }
    878         return result;
    879     }
    880     if (fData->fSafeRevTable != NULL) {
    881         // backup plan if forward safe table is not available
    882         //  TODO:  check whether this path can be discarded
    883         //         It's probably OK to say that rules must supply both safe tables
    884         //            if they use safe tables at all.  We have certainly never described
    885         //            to anyone how to work with just one safe table.
    886         utext_setNativeIndex(fText, offset);
    887         (void)UTEXT_NEXT32(fText);
    888 
    889         // handle previous will give result <= offset
    890         handlePrevious(fData->fSafeRevTable);
    891 
    892         // next will give result 0 or 1 boundary away from offset,
    893         // most of the time
    894         // we have to
    895         int32_t oldresult = next();
    896         while (oldresult < offset) {
    897             int32_t result = next();
    898             if (result >= offset) {
    899                 return oldresult;
    900             }
    901             oldresult = result;
    902         }
    903         int32_t result = previous();
    904         if (result >= offset) {
    905             return previous();
    906         }
    907         return result;
    908     }
    909 
    910     // old rule syntax
    911     utext_setNativeIndex(fText, offset);
    912     return previous();
    913 }
    914 
    915 /**
    916  * Returns true if the specfied position is a boundary position.  As a side
    917  * effect, leaves the iterator pointing to the first boundary position at
    918  * or after "offset".
    919  * @param offset the offset to check.
    920  * @return True if "offset" is a boundary position.
    921  */
    922 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    923     // the beginning index of the iterator is always a boundary position by definition
    924     if (offset == 0) {
    925         first();       // For side effects on current position, tag values.
    926         return TRUE;
    927     }
    928 
    929     if (offset == (int32_t)utext_nativeLength(fText)) {
    930         last();       // For side effects on current position, tag values.
    931         return TRUE;
    932     }
    933 
    934     // out-of-range indexes are never boundary positions
    935     if (offset < 0) {
    936         first();       // For side effects on current position, tag values.
    937         return FALSE;
    938     }
    939 
    940     if (offset > utext_nativeLength(fText)) {
    941         last();        // For side effects on current position, tag values.
    942         return FALSE;
    943     }
    944 
    945     // otherwise, we can use following() on the position before the specified
    946     // one and return true if the position we get back is the one the user
    947     // specified
    948     utext_previous32From(fText, offset);
    949     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    950     UBool    result  = following(backOne) == offset;
    951     return result;
    952 }
    953 
    954 /**
    955  * Returns the current iteration position.
    956  * @return The current iteration position.
    957  */
    958 int32_t RuleBasedBreakIterator::current(void) const {
    959     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    960     return pos;
    961 }
    962 
    963 //=======================================================================
    964 // implementation
    965 //=======================================================================
    966 
    967 //
    968 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    969 //                 of user text.  A variable with this enum type keeps track of where we
    970 //                 are.  The state machine only fetches user input while in the RUN mode.
    971 //
    972 enum RBBIRunMode {
    973     RBBI_START,     // state machine processing is before first char of input
    974     RBBI_RUN,       // state machine processing is in the user text
    975     RBBI_END        // state machine processing is after end of user text.
    976 };
    977 
    978 
    979 //-----------------------------------------------------------------------------------
    980 //
    981 //  handleNext(stateTable)
    982 //     This method is the actual implementation of the rbbi next() method.
    983 //     This method initializes the state machine to state 1
    984 //     and advances through the text character by character until we reach the end
    985 //     of the text or the state machine transitions to state 0.  We update our return
    986 //     value every time the state machine passes through an accepting state.
    987 //
    988 //-----------------------------------------------------------------------------------
    989 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    990     int32_t             state;
    991     uint16_t            category        = 0;
    992     RBBIRunMode         mode;
    993 
    994     RBBIStateTableRow  *row;
    995     UChar32             c;
    996     int32_t             lookaheadStatus = 0;
    997     int32_t             lookaheadTagIdx = 0;
    998     int32_t             result          = 0;
    999     int32_t             initialPosition = 0;
   1000     int32_t             lookaheadResult = 0;
   1001     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1002     const char         *tableData       = statetable->fTableData;
   1003     uint32_t            tableRowLen     = statetable->fRowLen;
   1004 
   1005     #ifdef RBBI_DEBUG
   1006         if (fTrace) {
   1007             RBBIDebugPuts("Handle Next   pos   char  state category");
   1008         }
   1009     #endif
   1010 
   1011     // No matter what, handleNext alway correctly sets the break tag value.
   1012     fLastStatusIndexValid = TRUE;
   1013     fLastRuleStatusIndex = 0;
   1014 
   1015     // if we're already at the end of the text, return DONE.
   1016     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1017     result          = initialPosition;
   1018     c               = UTEXT_NEXT32(fText);
   1019     if (fData == NULL || c==U_SENTINEL) {
   1020         return BreakIterator::DONE;
   1021     }
   1022 
   1023     //  Set the initial state for the state machine
   1024     state = START_STATE;
   1025     row = (RBBIStateTableRow *)
   1026             //(statetable->fTableData + (statetable->fRowLen * state));
   1027             (tableData + tableRowLen * state);
   1028 
   1029 
   1030     mode     = RBBI_RUN;
   1031     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1032         category = 2;
   1033         mode     = RBBI_START;
   1034     }
   1035 
   1036 
   1037     // loop until we reach the end of the text or transition to state 0
   1038     //
   1039     for (;;) {
   1040         if (c == U_SENTINEL) {
   1041             // Reached end of input string.
   1042             if (mode == RBBI_END) {
   1043                 // We have already run the loop one last time with the
   1044                 //   character set to the psueudo {eof} value.  Now it is time
   1045                 //   to unconditionally bail out.
   1046                 if (lookaheadResult > result) {
   1047                     // We ran off the end of the string with a pending look-ahead match.
   1048                     // Treat this as if the look-ahead condition had been met, and return
   1049                     //  the match at the / position from the look-ahead rule.
   1050                     result               = lookaheadResult;
   1051                     fLastRuleStatusIndex = lookaheadTagIdx;
   1052                     lookaheadStatus = 0;
   1053                 }
   1054                 break;
   1055             }
   1056             // Run the loop one last time with the fake end-of-input character category.
   1057             mode = RBBI_END;
   1058             category = 1;
   1059         }
   1060 
   1061         //
   1062         // Get the char category.  An incoming category of 1 or 2 means that
   1063         //      we are preset for doing the beginning or end of input, and
   1064         //      that we shouldn't get a category from an actual text input character.
   1065         //
   1066         if (mode == RBBI_RUN) {
   1067             // look up the current character's character category, which tells us
   1068             // which column in the state table to look at.
   1069             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1070             //        not the size of the character going in, which is a UChar32.
   1071             //
   1072             UTRIE_GET16(&fData->fTrie, c, category);
   1073 
   1074             // Check the dictionary bit in the character's category.
   1075             //    Counter is only used by dictionary based iterators (subclasses).
   1076             //    Chars that need to be handled by a dictionary have a flag bit set
   1077             //    in their category values.
   1078             //
   1079             if ((category & 0x4000) != 0)  {
   1080                 fDictionaryCharCount++;
   1081                 //  And off the dictionary flag bit.
   1082                 category &= ~0x4000;
   1083             }
   1084         }
   1085 
   1086        #ifdef RBBI_DEBUG
   1087             if (fTrace) {
   1088                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1089                 if (0x20<=c && c<0x7f) {
   1090                     RBBIDebugPrintf("\"%c\"  ", c);
   1091                 } else {
   1092                     RBBIDebugPrintf("%5x  ", c);
   1093                 }
   1094                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1095             }
   1096         #endif
   1097 
   1098         // State Transition - move machine to its next state
   1099         //
   1100 
   1101         // Note: fNextState is defined as uint16_t[2], but we are casting
   1102         // a generated RBBI table to RBBIStateTableRow and some tables
   1103         // actually have more than 2 categories.
   1104         U_ASSERT(category<fData->fHeader->fCatCount);
   1105         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1106         row = (RBBIStateTableRow *)
   1107             // (statetable->fTableData + (statetable->fRowLen * state));
   1108             (tableData + tableRowLen * state);
   1109 
   1110 
   1111         if (row->fAccepting == -1) {
   1112             // Match found, common case.
   1113             if (mode != RBBI_START) {
   1114                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1115             }
   1116             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1117         }
   1118 
   1119         if (row->fLookAhead != 0) {
   1120             if (lookaheadStatus != 0
   1121                 && row->fAccepting == lookaheadStatus) {
   1122                 // Lookahead match is completed.
   1123                 result               = lookaheadResult;
   1124                 fLastRuleStatusIndex = lookaheadTagIdx;
   1125                 lookaheadStatus      = 0;
   1126                 // TODO:  make a standalone hard break in a rule work.
   1127                 if (lookAheadHardBreak) {
   1128                     UTEXT_SETNATIVEINDEX(fText, result);
   1129                     return result;
   1130                 }
   1131                 // Look-ahead completed, but other rules may match further.  Continue on
   1132                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1133                 goto continueOn;
   1134             }
   1135 
   1136             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1137             lookaheadResult = r;
   1138             lookaheadStatus = row->fLookAhead;
   1139             lookaheadTagIdx = row->fTagIdx;
   1140             goto continueOn;
   1141         }
   1142 
   1143 
   1144         if (row->fAccepting != 0) {
   1145             // Because this is an accepting state, any in-progress look-ahead match
   1146             //   is no longer relavant.  Clear out the pending lookahead status.
   1147             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1148         }
   1149 
   1150 continueOn:
   1151         if (state == STOP_STATE) {
   1152             // This is the normal exit from the lookup state machine.
   1153             // We have advanced through the string until it is certain that no
   1154             //   longer match is possible, no matter what characters follow.
   1155             break;
   1156         }
   1157 
   1158         // Advance to the next character.
   1159         // If this is a beginning-of-input loop iteration, don't advance
   1160         //    the input position.  The next iteration will be processing the
   1161         //    first real input character.
   1162         if (mode == RBBI_RUN) {
   1163             c = UTEXT_NEXT32(fText);
   1164         } else {
   1165             if (mode == RBBI_START) {
   1166                 mode = RBBI_RUN;
   1167             }
   1168         }
   1169 
   1170 
   1171     }
   1172 
   1173     // The state machine is done.  Check whether it found a match...
   1174 
   1175     // If the iterator failed to advance in the match engine, force it ahead by one.
   1176     //   (This really indicates a defect in the break rules.  They should always match
   1177     //    at least one character.)
   1178     if (result == initialPosition) {
   1179         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1180         UTEXT_NEXT32(fText);
   1181         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1182     }
   1183 
   1184     // Leave the iterator at our result position.
   1185     UTEXT_SETNATIVEINDEX(fText, result);
   1186     #ifdef RBBI_DEBUG
   1187         if (fTrace) {
   1188             RBBIDebugPrintf("result = %d\n\n", result);
   1189         }
   1190     #endif
   1191     return result;
   1192 }
   1193 
   1194 
   1195 
   1196 //-----------------------------------------------------------------------------------
   1197 //
   1198 //  handlePrevious()
   1199 //
   1200 //      Iterate backwards, according to the logic of the reverse rules.
   1201 //      This version handles the exact style backwards rules.
   1202 //
   1203 //      The logic of this function is very similar to handleNext(), above.
   1204 //
   1205 //-----------------------------------------------------------------------------------
   1206 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1207     int32_t             state;
   1208     uint16_t            category        = 0;
   1209     RBBIRunMode         mode;
   1210     RBBIStateTableRow  *row;
   1211     UChar32             c;
   1212     int32_t             lookaheadStatus = 0;
   1213     int32_t             result          = 0;
   1214     int32_t             initialPosition = 0;
   1215     int32_t             lookaheadResult = 0;
   1216     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1217 
   1218     #ifdef RBBI_DEBUG
   1219         if (fTrace) {
   1220             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1221         }
   1222     #endif
   1223 
   1224     // handlePrevious() never gets the rule status.
   1225     // Flag the status as invalid; if the user ever asks for status, we will need
   1226     // to back up, then re-find the break position using handleNext(), which does
   1227     // get the status value.
   1228     fLastStatusIndexValid = FALSE;
   1229     fLastRuleStatusIndex = 0;
   1230 
   1231     // if we're already at the start of the text, return DONE.
   1232     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1233         return BreakIterator::DONE;
   1234     }
   1235 
   1236     //  Set up the starting char.
   1237     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1238     result          = initialPosition;
   1239     c               = UTEXT_PREVIOUS32(fText);
   1240 
   1241     //  Set the initial state for the state machine
   1242     state = START_STATE;
   1243     row = (RBBIStateTableRow *)
   1244             (statetable->fTableData + (statetable->fRowLen * state));
   1245     category = 3;
   1246     mode     = RBBI_RUN;
   1247     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1248         category = 2;
   1249         mode     = RBBI_START;
   1250     }
   1251 
   1252 
   1253     // loop until we reach the start of the text or transition to state 0
   1254     //
   1255     for (;;) {
   1256         if (c == U_SENTINEL) {
   1257             // Reached end of input string.
   1258             if (mode == RBBI_END) {
   1259                 // We have already run the loop one last time with the
   1260                 //   character set to the psueudo {eof} value.  Now it is time
   1261                 //   to unconditionally bail out.
   1262                 if (lookaheadResult < result) {
   1263                     // We ran off the end of the string with a pending look-ahead match.
   1264                     // Treat this as if the look-ahead condition had been met, and return
   1265                     //  the match at the / position from the look-ahead rule.
   1266                     result               = lookaheadResult;
   1267                     lookaheadStatus = 0;
   1268                 } else if (result == initialPosition) {
   1269                     // Ran off start, no match found.
   1270                     // move one index one (towards the start, since we are doing a previous())
   1271                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1272                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1273                 }
   1274                 break;
   1275             }
   1276             // Run the loop one last time with the fake end-of-input character category.
   1277             mode = RBBI_END;
   1278             category = 1;
   1279         }
   1280 
   1281         //
   1282         // Get the char category.  An incoming category of 1 or 2 means that
   1283         //      we are preset for doing the beginning or end of input, and
   1284         //      that we shouldn't get a category from an actual text input character.
   1285         //
   1286         if (mode == RBBI_RUN) {
   1287             // look up the current character's character category, which tells us
   1288             // which column in the state table to look at.
   1289             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1290             //        not the size of the character going in, which is a UChar32.
   1291             //
   1292             UTRIE_GET16(&fData->fTrie, c, category);
   1293 
   1294             // Check the dictionary bit in the character's category.
   1295             //    Counter is only used by dictionary based iterators (subclasses).
   1296             //    Chars that need to be handled by a dictionary have a flag bit set
   1297             //    in their category values.
   1298             //
   1299             if ((category & 0x4000) != 0)  {
   1300                 fDictionaryCharCount++;
   1301                 //  And off the dictionary flag bit.
   1302                 category &= ~0x4000;
   1303             }
   1304         }
   1305 
   1306         #ifdef RBBI_DEBUG
   1307             if (fTrace) {
   1308                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1309                 if (0x20<=c && c<0x7f) {
   1310                     RBBIDebugPrintf("\"%c\"  ", c);
   1311                 } else {
   1312                     RBBIDebugPrintf("%5x  ", c);
   1313                 }
   1314                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1315             }
   1316         #endif
   1317 
   1318         // State Transition - move machine to its next state
   1319         //
   1320 
   1321         // Note: fNextState is defined as uint16_t[2], but we are casting
   1322         // a generated RBBI table to RBBIStateTableRow and some tables
   1323         // actually have more than 2 categories.
   1324         U_ASSERT(category<fData->fHeader->fCatCount);
   1325         state = row->fNextState[category];  /*Not accessing beyond memory*/
   1326         row = (RBBIStateTableRow *)
   1327             (statetable->fTableData + (statetable->fRowLen * state));
   1328 
   1329         if (row->fAccepting == -1) {
   1330             // Match found, common case.
   1331             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1332         }
   1333 
   1334         if (row->fLookAhead != 0) {
   1335             if (lookaheadStatus != 0
   1336                 && row->fAccepting == lookaheadStatus) {
   1337                 // Lookahead match is completed.
   1338                 result               = lookaheadResult;
   1339                 lookaheadStatus      = 0;
   1340                 // TODO:  make a standalone hard break in a rule work.
   1341                 if (lookAheadHardBreak) {
   1342                     UTEXT_SETNATIVEINDEX(fText, result);
   1343                     return result;
   1344                 }
   1345                 // Look-ahead completed, but other rules may match further.  Continue on
   1346                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1347                 goto continueOn;
   1348             }
   1349 
   1350             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1351             lookaheadResult = r;
   1352             lookaheadStatus = row->fLookAhead;
   1353             goto continueOn;
   1354         }
   1355 
   1356 
   1357         if (row->fAccepting != 0) {
   1358             // Because this is an accepting state, any in-progress look-ahead match
   1359             //   is no longer relavant.  Clear out the pending lookahead status.
   1360             lookaheadStatus = 0;
   1361         }
   1362 
   1363 continueOn:
   1364         if (state == STOP_STATE) {
   1365             // This is the normal exit from the lookup state machine.
   1366             // We have advanced through the string until it is certain that no
   1367             //   longer match is possible, no matter what characters follow.
   1368             break;
   1369         }
   1370 
   1371         // Move (backwards) to the next character to process.
   1372         // If this is a beginning-of-input loop iteration, don't advance
   1373         //    the input position.  The next iteration will be processing the
   1374         //    first real input character.
   1375         if (mode == RBBI_RUN) {
   1376             c = UTEXT_PREVIOUS32(fText);
   1377         } else {
   1378             if (mode == RBBI_START) {
   1379                 mode = RBBI_RUN;
   1380             }
   1381         }
   1382     }
   1383 
   1384     // The state machine is done.  Check whether it found a match...
   1385 
   1386     // If the iterator failed to advance in the match engine, force it ahead by one.
   1387     //   (This really indicates a defect in the break rules.  They should always match
   1388     //    at least one character.)
   1389     if (result == initialPosition) {
   1390         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1391         UTEXT_PREVIOUS32(fText);
   1392         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1393     }
   1394 
   1395     // Leave the iterator at our result position.
   1396     UTEXT_SETNATIVEINDEX(fText, result);
   1397     #ifdef RBBI_DEBUG
   1398         if (fTrace) {
   1399             RBBIDebugPrintf("result = %d\n\n", result);
   1400         }
   1401     #endif
   1402     return result;
   1403 }
   1404 
   1405 
   1406 void
   1407 RuleBasedBreakIterator::reset()
   1408 {
   1409     if (fCachedBreakPositions) {
   1410         uprv_free(fCachedBreakPositions);
   1411     }
   1412     fCachedBreakPositions = NULL;
   1413     fNumCachedBreakPositions = 0;
   1414     fDictionaryCharCount = 0;
   1415     fPositionInCache = 0;
   1416 }
   1417 
   1418 
   1419 
   1420 //-------------------------------------------------------------------------------
   1421 //
   1422 //   getRuleStatus()   Return the break rule tag associated with the current
   1423 //                     iterator position.  If the iterator arrived at its current
   1424 //                     position by iterating forwards, the value will have been
   1425 //                     cached by the handleNext() function.
   1426 //
   1427 //                     If no cached status value is available, the status is
   1428 //                     found by doing a previous() followed by a next(), which
   1429 //                     leaves the iterator where it started, and computes the
   1430 //                     status while doing the next().
   1431 //
   1432 //-------------------------------------------------------------------------------
   1433 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1434     if (fLastStatusIndexValid == FALSE) {
   1435         //  No cached status is available.
   1436         if (fText == NULL || current() == 0) {
   1437             //  At start of text, or there is no text.  Status is always zero.
   1438             fLastRuleStatusIndex = 0;
   1439             fLastStatusIndexValid = TRUE;
   1440         } else {
   1441             //  Not at start of text.  Find status the tedious way.
   1442             int32_t pa = current();
   1443             previous();
   1444             if (fNumCachedBreakPositions > 0) {
   1445                 reset();                // Blow off the dictionary cache
   1446             }
   1447             int32_t pb = next();
   1448             if (pa != pb) {
   1449                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1450                 //       unused local variables on gcc.  Logically, it isn't needed.
   1451                 U_ASSERT(pa == pb);
   1452             }
   1453         }
   1454     }
   1455     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1456 }
   1457 
   1458 
   1459 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1460     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1461     nonConstThis->makeRuleStatusValid();
   1462 
   1463     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1464     //                                                 (the number of status values.)
   1465     //   This function returns the last (largest) of the array of status values.
   1466     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1467     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1468 
   1469     return tagVal;
   1470 }
   1471 
   1472 
   1473 
   1474 
   1475 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1476              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1477 {
   1478     if (U_FAILURE(status)) {
   1479         return 0;
   1480     }
   1481 
   1482     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1483     nonConstThis->makeRuleStatusValid();
   1484     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1485     int32_t  numValsToCopy = numVals;
   1486     if (numVals > capacity) {
   1487         status = U_BUFFER_OVERFLOW_ERROR;
   1488         numValsToCopy = capacity;
   1489     }
   1490     int i;
   1491     for (i=0; i<numValsToCopy; i++) {
   1492         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1493     }
   1494     return numVals;
   1495 }
   1496 
   1497 
   1498 
   1499 //-------------------------------------------------------------------------------
   1500 //
   1501 //   getBinaryRules        Access to the compiled form of the rules,
   1502 //                         for use by build system tools that save the data
   1503 //                         for standard iterator types.
   1504 //
   1505 //-------------------------------------------------------------------------------
   1506 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1507     const uint8_t  *retPtr = NULL;
   1508     length = 0;
   1509 
   1510     if (fData != NULL) {
   1511         retPtr = (const uint8_t *)fData->fHeader;
   1512         length = fData->fHeader->fLength;
   1513     }
   1514     return retPtr;
   1515 }
   1516 
   1517 
   1518 
   1519 
   1520 //-------------------------------------------------------------------------------
   1521 //
   1522 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1523 //                    Saving one heap allocation isn't worth the trouble.
   1524 //                    Cloning shouldn't be done in tight loops, and
   1525 //                    making the clone copy involves other heap operations anyway.
   1526 //                    And the application code for correctly dealing with buffer
   1527 //                    size problems and the eventual object destruction is ugly.
   1528 //
   1529 //-------------------------------------------------------------------------------
   1530 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1531                                    int32_t &bufferSize,
   1532                                    UErrorCode &status)
   1533 {
   1534     if (U_FAILURE(status)){
   1535         return NULL;
   1536     }
   1537 
   1538     //
   1539     //  If user buffer size is zero this is a preflight operation to
   1540     //    obtain the needed buffer size, allowing for worst case misalignment.
   1541     //
   1542     if (bufferSize == 0) {
   1543         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1544         return NULL;
   1545     }
   1546 
   1547 
   1548     //
   1549     //  Check the alignment and size of the user supplied buffer.
   1550     //  Allocate heap memory if the user supplied memory is insufficient.
   1551     //
   1552     char    *buf   = (char *)stackBuffer;
   1553     uint32_t s      = bufferSize;
   1554 
   1555     if (stackBuffer == NULL) {
   1556         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1557     }
   1558     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1559         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1560         s   -= offsetUp;
   1561         buf += offsetUp;
   1562     }
   1563     if (s < sizeof(RuleBasedBreakIterator)) {
   1564         // Not enough room in the caller-supplied buffer.
   1565         // Do a plain-vanilla heap based clone and return that, along with
   1566         //   a warning that the clone was allocated.
   1567         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1568         if (clonedBI == 0) {
   1569             status = U_MEMORY_ALLOCATION_ERROR;
   1570         } else {
   1571             status = U_SAFECLONE_ALLOCATED_WARNING;
   1572         }
   1573         return clonedBI;
   1574     }
   1575 
   1576     //
   1577     //  Clone the source BI into the caller-supplied buffer.
   1578     //
   1579     RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
   1580     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1581 
   1582     return clone;
   1583 }
   1584 
   1585 
   1586 //-------------------------------------------------------------------------------
   1587 //
   1588 //  isDictionaryChar      Return true if the category lookup for this char
   1589 //                        indicates that it is in the set of dictionary lookup
   1590 //                        chars.
   1591 //
   1592 //                        This function is intended for use by dictionary based
   1593 //                        break iterators.
   1594 //
   1595 //-------------------------------------------------------------------------------
   1596 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1597     if (fData == NULL) {
   1598         return FALSE;
   1599     }
   1600     uint16_t category;
   1601     UTRIE_GET16(&fData->fTrie, c, category);
   1602     return (category & 0x4000) != 0;
   1603 }*/
   1604 
   1605 
   1606 //-------------------------------------------------------------------------------
   1607 //
   1608 //  checkDictionary       This function handles all processing of characters in
   1609 //                        the "dictionary" set. It will determine the appropriate
   1610 //                        course of action, and possibly set up a cache in the
   1611 //                        process.
   1612 //
   1613 //-------------------------------------------------------------------------------
   1614 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1615                             int32_t endPos,
   1616                             UBool reverse) {
   1617     // Reset the old break cache first.
   1618     reset();
   1619 
   1620     // note: code segment below assumes that dictionary chars are in the
   1621     // startPos-endPos range
   1622     // value returned should be next character in sequence
   1623     if ((endPos - startPos) <= 1) {
   1624         return (reverse ? startPos : endPos);
   1625     }
   1626 
   1627     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
   1628     //      because native indexes are different from UTF-16 indexes.
   1629     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
   1630     //      It wont give the right breaks, but it's better than a crash.
   1631     //
   1632     //      Check the type of the UText by checking its pFuncs field, which
   1633     //      is UText's function dispatch table.  It will be the same for all
   1634     //      UTF-8 UTexts and different for any other UText type.
   1635     //
   1636     //      We have no other type of UText available with non-UTF-16 native indexing.
   1637     //      This whole check will go away once the dictionary code is fixed.
   1638     static const void *utext_utf8Funcs;
   1639     if (utext_utf8Funcs == NULL) {
   1640         // Cache the UTF-8 UText function pointer value.
   1641         UErrorCode status = U_ZERO_ERROR;
   1642         UText tempUText = UTEXT_INITIALIZER;
   1643         utext_openUTF8(&tempUText, NULL, 0, &status);
   1644         utext_utf8Funcs = tempUText.pFuncs;
   1645         utext_close(&tempUText);
   1646     }
   1647     if (fText->pFuncs == utext_utf8Funcs) {
   1648         return (reverse ? startPos : endPos);
   1649     }
   1650 
   1651     // Starting from the starting point, scan towards the proposed result,
   1652     // looking for the first dictionary character (which may be the one
   1653     // we're on, if we're starting in the middle of a range).
   1654     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1655     if (reverse) {
   1656         UTEXT_PREVIOUS32(fText);
   1657     }
   1658 
   1659     int32_t rangeStart = startPos;
   1660     int32_t rangeEnd = endPos;
   1661 
   1662     uint16_t    category;
   1663     int32_t     current;
   1664     UErrorCode  status = U_ZERO_ERROR;
   1665     UStack      breaks(status);
   1666     int32_t     foundBreakCount = 0;
   1667     UChar32     c = utext_current32(fText);
   1668 
   1669     UTRIE_GET16(&fData->fTrie, c, category);
   1670 
   1671     // Is the character we're starting on a dictionary character? If so, we
   1672     // need to back up to include the entire run; otherwise the results of
   1673     // the break algorithm will differ depending on where we start. Since
   1674     // the result is cached and there is typically a non-dictionary break
   1675     // within a small number of words, there should be little performance impact.
   1676     if (category & 0x4000) {
   1677         if (reverse) {
   1678             do {
   1679                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1680                 c = utext_current32(fText);
   1681                 UTRIE_GET16(&fData->fTrie, c, category);
   1682             } while (c != U_SENTINEL && (category & 0x4000));
   1683             // Back up to the last dictionary character
   1684             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1685             if (c == U_SENTINEL) {
   1686                 // c = fText->last32();
   1687                 //   TODO:  why was this if needed?
   1688                 c = UTEXT_PREVIOUS32(fText);
   1689             }
   1690             else {
   1691                 c = UTEXT_PREVIOUS32(fText);
   1692             }
   1693         }
   1694         else {
   1695             do {
   1696                 c = UTEXT_PREVIOUS32(fText);
   1697                 UTRIE_GET16(&fData->fTrie, c, category);
   1698             }
   1699             while (c != U_SENTINEL && (category & 0x4000));
   1700             // Back up to the last dictionary character
   1701             if (c == U_SENTINEL) {
   1702                 // c = fText->first32();
   1703                 c = utext_current32(fText);
   1704             }
   1705             else {
   1706                 utext_next32(fText);
   1707                 c = utext_current32(fText);
   1708             }
   1709             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1710         }
   1711         UTRIE_GET16(&fData->fTrie, c, category);
   1712     }
   1713 
   1714     // Loop through the text, looking for ranges of dictionary characters.
   1715     // For each span, find the appropriate break engine, and ask it to find
   1716     // any breaks within the span.
   1717     // Note: we always do this in the forward direction, so that the break
   1718     // cache is built in the right order.
   1719     if (reverse) {
   1720         utext_setNativeIndex(fText, rangeStart);
   1721         c = utext_current32(fText);
   1722         UTRIE_GET16(&fData->fTrie, c, category);
   1723     }
   1724     while(U_SUCCESS(status)) {
   1725         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1726             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1727             c = utext_current32(fText);
   1728             UTRIE_GET16(&fData->fTrie, c, category);
   1729         }
   1730         if (current >= rangeEnd) {
   1731             break;
   1732         }
   1733 
   1734         // We now have a dictionary character. Get the appropriate language object
   1735         // to deal with it.
   1736         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1737 
   1738         // Ask the language object if there are any breaks. It will leave the text
   1739         // pointer on the other side of its range, ready to search for the next one.
   1740         if (lbe != NULL) {
   1741             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1742         }
   1743 
   1744         // Reload the loop variables for the next go-round
   1745         c = utext_current32(fText);
   1746         UTRIE_GET16(&fData->fTrie, c, category);
   1747     }
   1748 
   1749     // If we found breaks, build a new break cache. The first and last entries must
   1750     // be the original starting and ending position.
   1751     if (foundBreakCount > 0) {
   1752         int32_t totalBreaks = foundBreakCount;
   1753         if (startPos < breaks.elementAti(0)) {
   1754             totalBreaks += 1;
   1755         }
   1756         if (endPos > breaks.peeki()) {
   1757             totalBreaks += 1;
   1758         }
   1759         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1760         if (fCachedBreakPositions != NULL) {
   1761             int32_t out = 0;
   1762             fNumCachedBreakPositions = totalBreaks;
   1763             if (startPos < breaks.elementAti(0)) {
   1764                 fCachedBreakPositions[out++] = startPos;
   1765             }
   1766             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1767                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1768             }
   1769             if (endPos > fCachedBreakPositions[out-1]) {
   1770                 fCachedBreakPositions[out] = endPos;
   1771             }
   1772             // If there are breaks, then by definition, we are replacing the original
   1773             // proposed break by one of the breaks we found. Use following() and
   1774             // preceding() to do the work. They should never recurse in this case.
   1775             if (reverse) {
   1776                 return preceding(endPos);
   1777             }
   1778             else {
   1779                 return following(startPos);
   1780             }
   1781         }
   1782         // If the allocation failed, just fall through to the "no breaks found" case.
   1783     }
   1784 
   1785     // If we get here, there were no language-based breaks. Set the text pointer
   1786     // to the original proposed break.
   1787     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1788     return (reverse ? startPos : endPos);
   1789 }
   1790 
   1791 U_NAMESPACE_END
   1792 
   1793 // defined in ucln_cmn.h
   1794 
   1795 static icu::UStack *gLanguageBreakFactories = NULL;
   1796 
   1797 /**
   1798  * Release all static memory held by breakiterator.
   1799  */
   1800 U_CDECL_BEGIN
   1801 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1802     if (gLanguageBreakFactories) {
   1803         delete gLanguageBreakFactories;
   1804         gLanguageBreakFactories = NULL;
   1805     }
   1806     return TRUE;
   1807 }
   1808 U_CDECL_END
   1809 
   1810 U_CDECL_BEGIN
   1811 static void U_CALLCONV _deleteFactory(void *obj) {
   1812     delete (icu::LanguageBreakFactory *) obj;
   1813 }
   1814 U_CDECL_END
   1815 U_NAMESPACE_BEGIN
   1816 
   1817 static const LanguageBreakEngine*
   1818 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1819 {
   1820     UBool       needsInit;
   1821     UErrorCode  status = U_ZERO_ERROR;
   1822     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1823 
   1824     if (needsInit) {
   1825         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1826         if (factories != NULL && U_SUCCESS(status)) {
   1827             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1828             factories->push(builtIn, status);
   1829 #ifdef U_LOCAL_SERVICE_HOOK
   1830             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1831             if (extra != NULL) {
   1832                 factories->push(extra, status);
   1833             }
   1834 #endif
   1835         }
   1836         umtx_lock(NULL);
   1837         if (gLanguageBreakFactories == NULL) {
   1838             gLanguageBreakFactories = factories;
   1839             factories = NULL;
   1840             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1841         }
   1842         umtx_unlock(NULL);
   1843         delete factories;
   1844     }
   1845 
   1846     if (gLanguageBreakFactories == NULL) {
   1847         return NULL;
   1848     }
   1849 
   1850     int32_t i = gLanguageBreakFactories->size();
   1851     const LanguageBreakEngine *lbe = NULL;
   1852     while (--i >= 0) {
   1853         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1854         lbe = factory->getEngineFor(c, breakType);
   1855         if (lbe != NULL) {
   1856             break;
   1857         }
   1858     }
   1859     return lbe;
   1860 }
   1861 
   1862 
   1863 //-------------------------------------------------------------------------------
   1864 //
   1865 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1866 //                          the character c.
   1867 //
   1868 //-------------------------------------------------------------------------------
   1869 const LanguageBreakEngine *
   1870 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1871     const LanguageBreakEngine *lbe = NULL;
   1872     UErrorCode status = U_ZERO_ERROR;
   1873 
   1874     if (fLanguageBreakEngines == NULL) {
   1875         fLanguageBreakEngines = new UStack(status);
   1876         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1877             delete fLanguageBreakEngines;
   1878             fLanguageBreakEngines = 0;
   1879             return NULL;
   1880         }
   1881     }
   1882 
   1883     int32_t i = fLanguageBreakEngines->size();
   1884     while (--i >= 0) {
   1885         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1886         if (lbe->handles(c, fBreakType)) {
   1887             return lbe;
   1888         }
   1889     }
   1890 
   1891     // No existing dictionary took the character. See if a factory wants to
   1892     // give us a new LanguageBreakEngine for this character.
   1893     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1894 
   1895     // If we got one, use it and push it on our stack.
   1896     if (lbe != NULL) {
   1897         fLanguageBreakEngines->push((void *)lbe, status);
   1898         // Even if we can't remember it, we can keep looking it up, so
   1899         // return it even if the push fails.
   1900         return lbe;
   1901     }
   1902 
   1903     // No engine is forthcoming for this character. Add it to the
   1904     // reject set. Create the reject break engine if needed.
   1905     if (fUnhandledBreakEngine == NULL) {
   1906         fUnhandledBreakEngine = new UnhandledEngine(status);
   1907         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1908             status = U_MEMORY_ALLOCATION_ERROR;
   1909         }
   1910         // Put it last so that scripts for which we have an engine get tried
   1911         // first.
   1912         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1913         // If we can't insert it, or creation failed, get rid of it
   1914         if (U_FAILURE(status)) {
   1915             delete fUnhandledBreakEngine;
   1916             fUnhandledBreakEngine = 0;
   1917             return NULL;
   1918         }
   1919     }
   1920 
   1921     // Tell the reject engine about the character; at its discretion, it may
   1922     // add more than just the one character.
   1923     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1924 
   1925     return fUnhandledBreakEngine;
   1926 }
   1927 
   1928 
   1929 
   1930 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1931     return fBreakType;
   1932 }*/
   1933 
   1934 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1935     fBreakType = type;
   1936     reset();
   1937 }
   1938 
   1939 U_NAMESPACE_END
   1940 
   1941 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1942