Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2010 International Business Machines Corporation
      4 *   and others. All rights reserved.
      5 ***************************************************************************
      6 */
      7 //
      8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
      9 //                   runtime engine and the API implementation for
     10 //                   class RuleBasedBreakIterator
     11 //
     12 
     13 #include <typeinfo>  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/rbbi.h"
     20 #include "unicode/schriter.h"
     21 #include "unicode/uchriter.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/uclean.h"
     24 #include "rbbidata.h"
     25 #include "rbbirb.h"
     26 #include "cmemory.h"
     27 #include "cstring.h"
     28 #include "umutex.h"
     29 #include "ucln_cmn.h"
     30 #include "brkeng.h"
     31 
     32 #include "uassert.h"
     33 #include "uvector.h"
     34 
     35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
     36 #if U_LOCAL_SERVICE_HOOK
     37 #include "localsvc.h"
     38 #endif
     39 
     40 #ifdef RBBI_DEBUG
     41 static UBool fTrace = FALSE;
     42 #endif
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 // The state number of the starting state
     47 #define START_STATE 1
     48 
     49 // The state-transition value indicating "stop"
     50 #define STOP_STATE  0
     51 
     52 
     53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     54 
     55 
     56 //=======================================================================
     57 // constructors
     58 //=======================================================================
     59 
     60 /**
     61  * Constructs a RuleBasedBreakIterator that uses the already-created
     62  * tables object that is passed in as a parameter.
     63  */
     64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     65 {
     66     init();
     67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
     68     if (U_FAILURE(status)) {return;}
     69     if(fData == 0) {
     70         status = U_MEMORY_ALLOCATION_ERROR;
     71         return;
     72     }
     73 }
     74 
     75 /**
     76  * Same as above but does not adopt memory
     77  */
     78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
     79 {
     80     init();
     81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
     82     if (U_FAILURE(status)) {return;}
     83     if(fData == 0) {
     84         status = U_MEMORY_ALLOCATION_ERROR;
     85         return;
     86     }
     87 }
     88 
     89 //-------------------------------------------------------------------------------
     90 //
     91 //   Constructor   from a UDataMemory handle to precompiled break rules
     92 //                 stored in an ICU data file.
     93 //
     94 //-------------------------------------------------------------------------------
     95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
     96 {
     97     init();
     98     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     99     if (U_FAILURE(status)) {return;}
    100     if(fData == 0) {
    101         status = U_MEMORY_ALLOCATION_ERROR;
    102         return;
    103     }
    104 }
    105 
    106 
    107 
    108 //-------------------------------------------------------------------------------
    109 //
    110 //   Constructor       from a set of rules supplied as a string.
    111 //
    112 //-------------------------------------------------------------------------------
    113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    114                                                 UParseError          &parseError,
    115                                                 UErrorCode           &status)
    116 {
    117     init();
    118     if (U_FAILURE(status)) {return;}
    119     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    120         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    121     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    122     //        creates and returns a complete RBBI.  From here, in a constructor, we
    123     //        can't just return the object created by the builder factory, hence
    124     //        the assignment of the factory created object to "this".
    125     if (U_SUCCESS(status)) {
    126         *this = *bi;
    127         delete bi;
    128     }
    129 }
    130 
    131 
    132 //-------------------------------------------------------------------------------
    133 //
    134 // Default Constructor.      Create an empty shell that can be set up later.
    135 //                           Used when creating a RuleBasedBreakIterator from a set
    136 //                           of rules.
    137 //-------------------------------------------------------------------------------
    138 RuleBasedBreakIterator::RuleBasedBreakIterator() {
    139     init();
    140 }
    141 
    142 
    143 //-------------------------------------------------------------------------------
    144 //
    145 //   Copy constructor.  Will produce a break iterator with the same behavior,
    146 //                      and which iterates over the same text, as the one passed in.
    147 //
    148 //-------------------------------------------------------------------------------
    149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    150 : BreakIterator(other)
    151 {
    152     this->init();
    153     *this = other;
    154 }
    155 
    156 
    157 /**
    158  * Destructor
    159  */
    160 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    161     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    162         // fCharIter was adopted from the outside.
    163         delete fCharIter;
    164     }
    165     fCharIter = NULL;
    166     delete fSCharIter;
    167     fCharIter = NULL;
    168     delete fDCharIter;
    169     fDCharIter = NULL;
    170 
    171     utext_close(fText);
    172 
    173     if (fData != NULL) {
    174         fData->removeReference();
    175         fData = NULL;
    176     }
    177     if (fCachedBreakPositions) {
    178         uprv_free(fCachedBreakPositions);
    179         fCachedBreakPositions = NULL;
    180     }
    181     if (fLanguageBreakEngines) {
    182         delete fLanguageBreakEngines;
    183         fLanguageBreakEngines = NULL;
    184     }
    185     if (fUnhandledBreakEngine) {
    186         delete fUnhandledBreakEngine;
    187         fUnhandledBreakEngine = NULL;
    188     }
    189 }
    190 
    191 /**
    192  * Assignment operator.  Sets this iterator to have the same behavior,
    193  * and iterate over the same text, as the one passed in.
    194  */
    195 RuleBasedBreakIterator&
    196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    197     if (this == &that) {
    198         return *this;
    199     }
    200     reset();    // Delete break cache information
    201     fBreakType = that.fBreakType;
    202     if (fLanguageBreakEngines != NULL) {
    203         delete fLanguageBreakEngines;
    204         fLanguageBreakEngines = NULL;   // Just rebuild for now
    205     }
    206     // TODO: clone fLanguageBreakEngines from "that"
    207     UErrorCode status = U_ZERO_ERROR;
    208     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
    209 
    210     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    211         delete fCharIter;
    212     }
    213     fCharIter = NULL;
    214 
    215     if (that.fCharIter != NULL ) {
    216         // This is a little bit tricky - it will intially appear that
    217         //  this->fCharIter is adopted, even if that->fCharIter was
    218         //  not adopted.  That's ok.
    219         fCharIter = that.fCharIter->clone();
    220     }
    221 
    222     if (fData != NULL) {
    223         fData->removeReference();
    224         fData = NULL;
    225     }
    226     if (that.fData != NULL) {
    227         fData = that.fData->addReference();
    228     }
    229 
    230     return *this;
    231 }
    232 
    233 
    234 
    235 //-----------------------------------------------------------------------------
    236 //
    237 //    init()      Shared initialization routine.   Used by all the constructors.
    238 //                Initializes all fields, leaving the object in a consistent state.
    239 //
    240 //-----------------------------------------------------------------------------
    241 void RuleBasedBreakIterator::init() {
    242     UErrorCode  status    = U_ZERO_ERROR;
    243     fBufferClone          = FALSE;
    244     fText                 = utext_openUChars(NULL, NULL, 0, &status);
    245     fCharIter             = NULL;
    246     fSCharIter            = NULL;
    247     fDCharIter            = NULL;
    248     fData                 = NULL;
    249     fLastRuleStatusIndex  = 0;
    250     fLastStatusIndexValid = TRUE;
    251     fDictionaryCharCount  = 0;
    252     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
    253                                         //   dictionary behavior for Break Iterators that are
    254                                         //   built from rules.  Even better would be the ability to
    255                                         //   declare the type in the rules.
    256 
    257     fCachedBreakPositions    = NULL;
    258     fLanguageBreakEngines    = NULL;
    259     fUnhandledBreakEngine    = NULL;
    260     fNumCachedBreakPositions = 0;
    261     fPositionInCache         = 0;
    262 
    263 #ifdef RBBI_DEBUG
    264     static UBool debugInitDone = FALSE;
    265     if (debugInitDone == FALSE) {
    266         char *debugEnv = getenv("U_RBBIDEBUG");
    267         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    268             fTrace = TRUE;
    269         }
    270         debugInitDone = TRUE;
    271     }
    272 #endif
    273 }
    274 
    275 
    276 
    277 //-----------------------------------------------------------------------------
    278 //
    279 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    280 //            behavior, and iterating over the same text, as this one.
    281 //            Virtual function: does the right thing with subclasses.
    282 //
    283 //-----------------------------------------------------------------------------
    284 BreakIterator*
    285 RuleBasedBreakIterator::clone(void) const {
    286     return new RuleBasedBreakIterator(*this);
    287 }
    288 
    289 /**
    290  * Equality operator.  Returns TRUE if both BreakIterators are of the
    291  * same class, have the same behavior, and iterate over the same text.
    292  */
    293 UBool
    294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    295     if (typeid(*this) != typeid(that)) {
    296         return FALSE;
    297     }
    298 
    299     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
    300 
    301     if (!utext_equals(fText, that2.fText)) {
    302         // The two break iterators are operating on different text,
    303         //   or have a different interation position.
    304         return FALSE;
    305     };
    306 
    307     // TODO:  need a check for when in a dictionary region at different offsets.
    308 
    309     if (that2.fData == fData ||
    310         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
    311             // The two break iterators are using the same rules.
    312             return TRUE;
    313         }
    314     return FALSE;
    315 }
    316 
    317 /**
    318  * Compute a hash code for this BreakIterator
    319  * @return A hash code
    320  */
    321 int32_t
    322 RuleBasedBreakIterator::hashCode(void) const {
    323     int32_t   hash = 0;
    324     if (fData != NULL) {
    325         hash = fData->hashCode();
    326     }
    327     return hash;
    328 }
    329 
    330 
    331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    332     if (U_FAILURE(status)) {
    333         return;
    334     }
    335     reset();
    336     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
    337 
    338     // Set up a dummy CharacterIterator to be returned if anyone
    339     //   calls getText().  With input from UText, there is no reasonable
    340     //   way to return a characterIterator over the actual input text.
    341     //   Return one over an empty string instead - this is the closest
    342     //   we can come to signaling a failure.
    343     //   (GetText() is obsolete, this failure is sort of OK)
    344     if (fDCharIter == NULL) {
    345         static const UChar c = 0;
    346         fDCharIter = new UCharCharacterIterator(&c, 0);
    347         if (fDCharIter == NULL) {
    348             status = U_MEMORY_ALLOCATION_ERROR;
    349             return;
    350         }
    351     }
    352 
    353     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    354         // existing fCharIter was adopted from the outside.  Delete it now.
    355         delete fCharIter;
    356     }
    357     fCharIter = fDCharIter;
    358 
    359     this->first();
    360 }
    361 
    362 
    363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    364     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
    365     return result;
    366 }
    367 
    368 
    369 
    370 /**
    371  * Returns the description used to create this iterator
    372  */
    373 const UnicodeString&
    374 RuleBasedBreakIterator::getRules() const {
    375     if (fData != NULL) {
    376         return fData->getRuleSourceString();
    377     } else {
    378         static const UnicodeString *s;
    379         if (s == NULL) {
    380             // TODO:  something more elegant here.
    381             //        perhaps API should return the string by value.
    382             //        Note:  thread unsafe init & leak are semi-ok, better than
    383             //               what was before.  Sould be cleaned up, though.
    384             s = new UnicodeString;
    385         }
    386         return *s;
    387     }
    388 }
    389 
    390 //=======================================================================
    391 // BreakIterator overrides
    392 //=======================================================================
    393 
    394 /**
    395  * Return a CharacterIterator over the text being analyzed.
    396  */
    397 CharacterIterator&
    398 RuleBasedBreakIterator::getText() const {
    399     return *fCharIter;
    400 }
    401 
    402 /**
    403  * Set the iterator to analyze a new piece of text.  This function resets
    404  * the current iteration position to the beginning of the text.
    405  * @param newText An iterator over the text to analyze.
    406  */
    407 void
    408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    409     // If we are holding a CharacterIterator adopted from a
    410     //   previous call to this function, delete it now.
    411     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    412         delete fCharIter;
    413     }
    414 
    415     fCharIter = newText;
    416     UErrorCode status = U_ZERO_ERROR;
    417     reset();
    418     if (newText==NULL || newText->startIndex() != 0) {
    419         // startIndex !=0 wants to be an error, but there's no way to report it.
    420         // Make the iterator text be an empty string.
    421         fText = utext_openUChars(fText, NULL, 0, &status);
    422     } else {
    423         fText = utext_openCharacterIterator(fText, newText, &status);
    424     }
    425     this->first();
    426 }
    427 
    428 /**
    429  * Set the iterator to analyze a new piece of text.  This function resets
    430  * the current iteration position to the beginning of the text.
    431  * @param newText An iterator over the text to analyze.
    432  */
    433 void
    434 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    435     UErrorCode status = U_ZERO_ERROR;
    436     reset();
    437     fText = utext_openConstUnicodeString(fText, &newText, &status);
    438 
    439     // Set up a character iterator on the string.
    440     //   Needed in case someone calls getText().
    441     //  Can not, unfortunately, do this lazily on the (probably never)
    442     //  call to getText(), because getText is const.
    443     if (fSCharIter == NULL) {
    444         fSCharIter = new StringCharacterIterator(newText);
    445     } else {
    446         fSCharIter->setText(newText);
    447     }
    448 
    449     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
    450         // old fCharIter was adopted from the outside.  Delete it.
    451         delete fCharIter;
    452     }
    453     fCharIter = fSCharIter;
    454 
    455     this->first();
    456 }
    457 
    458 
    459 
    460 /**
    461  * Sets the current iteration position to the beginning of the text.
    462  * @return The offset of the beginning of the text.
    463  */
    464 int32_t RuleBasedBreakIterator::first(void) {
    465     reset();
    466     fLastRuleStatusIndex  = 0;
    467     fLastStatusIndexValid = TRUE;
    468     //if (fText == NULL)
    469     //    return BreakIterator::DONE;
    470 
    471     utext_setNativeIndex(fText, 0);
    472     return 0;
    473 }
    474 
    475 /**
    476  * Sets the current iteration position to the end of the text.
    477  * @return The text's past-the-end offset.
    478  */
    479 int32_t RuleBasedBreakIterator::last(void) {
    480     reset();
    481     if (fText == NULL) {
    482         fLastRuleStatusIndex  = 0;
    483         fLastStatusIndexValid = TRUE;
    484         return BreakIterator::DONE;
    485     }
    486 
    487     fLastStatusIndexValid = FALSE;
    488     int32_t pos = (int32_t)utext_nativeLength(fText);
    489     utext_setNativeIndex(fText, pos);
    490     return pos;
    491 }
    492 
    493 /**
    494  * Advances the iterator either forward or backward the specified number of steps.
    495  * Negative values move backward, and positive values move forward.  This is
    496  * equivalent to repeatedly calling next() or previous().
    497  * @param n The number of steps to move.  The sign indicates the direction
    498  * (negative is backwards, and positive is forwards).
    499  * @return The character offset of the boundary position n boundaries away from
    500  * the current one.
    501  */
    502 int32_t RuleBasedBreakIterator::next(int32_t n) {
    503     int32_t result = current();
    504     while (n > 0) {
    505         result = next();
    506         --n;
    507     }
    508     while (n < 0) {
    509         result = previous();
    510         ++n;
    511     }
    512     return result;
    513 }
    514 
    515 /**
    516  * Advances the iterator to the next boundary position.
    517  * @return The position of the first boundary after this one.
    518  */
    519 int32_t RuleBasedBreakIterator::next(void) {
    520     // if we have cached break positions and we're still in the range
    521     // covered by them, just move one step forward in the cache
    522     if (fCachedBreakPositions != NULL) {
    523         if (fPositionInCache < fNumCachedBreakPositions - 1) {
    524             ++fPositionInCache;
    525             int32_t pos = fCachedBreakPositions[fPositionInCache];
    526             utext_setNativeIndex(fText, pos);
    527             return pos;
    528         }
    529         else {
    530             reset();
    531         }
    532     }
    533 
    534     int32_t startPos = current();
    535     int32_t result = handleNext(fData->fForwardTable);
    536     if (fDictionaryCharCount > 0) {
    537         result = checkDictionary(startPos, result, FALSE);
    538     }
    539     return result;
    540 }
    541 
    542 /**
    543  * Advances the iterator backwards, to the last boundary preceding this one.
    544  * @return The position of the last boundary position preceding this one.
    545  */
    546 int32_t RuleBasedBreakIterator::previous(void) {
    547     int32_t result;
    548     int32_t startPos;
    549 
    550     // if we have cached break positions and we're still in the range
    551     // covered by them, just move one step backward in the cache
    552     if (fCachedBreakPositions != NULL) {
    553         if (fPositionInCache > 0) {
    554             --fPositionInCache;
    555             // If we're at the beginning of the cache, need to reevaluate the
    556             // rule status
    557             if (fPositionInCache <= 0) {
    558                 fLastStatusIndexValid = FALSE;
    559             }
    560             int32_t pos = fCachedBreakPositions[fPositionInCache];
    561             utext_setNativeIndex(fText, pos);
    562             return pos;
    563         }
    564         else {
    565             reset();
    566         }
    567     }
    568 
    569     // if we're already sitting at the beginning of the text, return DONE
    570     if (fText == NULL || (startPos = current()) == 0) {
    571         fLastRuleStatusIndex  = 0;
    572         fLastStatusIndexValid = TRUE;
    573         return BreakIterator::DONE;
    574     }
    575 
    576     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
    577         result = handlePrevious(fData->fReverseTable);
    578         if (fDictionaryCharCount > 0) {
    579             result = checkDictionary(result, startPos, TRUE);
    580         }
    581         return result;
    582     }
    583 
    584     // old rule syntax
    585     // set things up.  handlePrevious() will back us up to some valid
    586     // break position before the current position (we back our internal
    587     // iterator up one step to prevent handlePrevious() from returning
    588     // the current position), but not necessarily the last one before
    589 
    590     // where we started
    591 
    592     int32_t start = current();
    593 
    594     UTEXT_PREVIOUS32(fText);
    595     int32_t lastResult    = handlePrevious(fData->fReverseTable);
    596     if (lastResult == UBRK_DONE) {
    597         lastResult = 0;
    598         utext_setNativeIndex(fText, 0);
    599     }
    600     result = lastResult;
    601     int32_t lastTag       = 0;
    602     UBool   breakTagValid = FALSE;
    603 
    604     // iterate forward from the known break position until we pass our
    605     // starting point.  The last break position before the starting
    606     // point is our return value
    607 
    608     for (;;) {
    609         result         = next();
    610         if (result == BreakIterator::DONE || result >= start) {
    611             break;
    612         }
    613         lastResult     = result;
    614         lastTag        = fLastRuleStatusIndex;
    615         breakTagValid  = TRUE;
    616     }
    617 
    618     // fLastBreakTag wants to have the value for section of text preceding
    619     // the result position that we are to return (in lastResult.)  If
    620     // the backwards rules overshot and the above loop had to do two or more
    621     // next()s to move up to the desired return position, we will have a valid
    622     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
    623     // we wont have a tag value for that position, which is only set by handleNext().
    624 
    625     // set the current iteration position to be the last break position
    626     // before where we started, and then return that value
    627     utext_setNativeIndex(fText, lastResult);
    628     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    629     fLastStatusIndexValid = breakTagValid;
    630 
    631     // No need to check the dictionary; it will have been handled by
    632     // next()
    633 
    634     return lastResult;
    635 }
    636 
    637 /**
    638  * Sets the iterator to refer to the first boundary position following
    639  * the specified position.
    640  * @offset The position from which to begin searching for a break position.
    641  * @return The position of the first break after the current position.
    642  */
    643 int32_t RuleBasedBreakIterator::following(int32_t offset) {
    644     // if we have cached break positions and offset is in the range
    645     // covered by them, use them
    646     // TODO: could use binary search
    647     // TODO: what if offset is outside range, but break is not?
    648     if (fCachedBreakPositions != NULL) {
    649         if (offset >= fCachedBreakPositions[0]
    650                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    651             fPositionInCache = 0;
    652             // We are guaranteed not to leave the array due to range test above
    653             while (offset >= fCachedBreakPositions[fPositionInCache]) {
    654                 ++fPositionInCache;
    655             }
    656             int32_t pos = fCachedBreakPositions[fPositionInCache];
    657             utext_setNativeIndex(fText, pos);
    658             return pos;
    659         }
    660         else {
    661             reset();
    662         }
    663     }
    664 
    665     // if the offset passed in is already past the end of the text,
    666     // just return DONE; if it's before the beginning, return the
    667     // text's starting offset
    668     fLastRuleStatusIndex  = 0;
    669     fLastStatusIndexValid = TRUE;
    670     if (fText == NULL || offset >= utext_nativeLength(fText)) {
    671         last();
    672         return next();
    673     }
    674     else if (offset < 0) {
    675         return first();
    676     }
    677 
    678     // otherwise, set our internal iteration position (temporarily)
    679     // to the position passed in.  If this is the _beginning_ position,
    680     // then we can just use next() to get our return value
    681 
    682     int32_t result = 0;
    683 
    684     if (fData->fSafeRevTable != NULL) {
    685         // new rule syntax
    686         utext_setNativeIndex(fText, offset);
    687         // move forward one codepoint to prepare for moving back to a
    688         // safe point.
    689         // this handles offset being between a supplementary character
    690         UTEXT_NEXT32(fText);
    691         // handlePrevious will move most of the time to < 1 boundary away
    692         handlePrevious(fData->fSafeRevTable);
    693         int32_t result = next();
    694         while (result <= offset) {
    695             result = next();
    696         }
    697         return result;
    698     }
    699     if (fData->fSafeFwdTable != NULL) {
    700         // backup plan if forward safe table is not available
    701         utext_setNativeIndex(fText, offset);
    702         UTEXT_PREVIOUS32(fText);
    703         // handle next will give result >= offset
    704         handleNext(fData->fSafeFwdTable);
    705         // previous will give result 0 or 1 boundary away from offset,
    706         // most of the time
    707         // we have to
    708         int32_t oldresult = previous();
    709         while (oldresult > offset) {
    710             int32_t result = previous();
    711             if (result <= offset) {
    712                 return oldresult;
    713             }
    714             oldresult = result;
    715         }
    716         int32_t result = next();
    717         if (result <= offset) {
    718             return next();
    719         }
    720         return result;
    721     }
    722     // otherwise, we have to sync up first.  Use handlePrevious() to back
    723     // up to a known break position before the specified position (if
    724     // we can determine that the specified position is a break position,
    725     // we don't back up at all).  This may or may not be the last break
    726     // position at or before our starting position.  Advance forward
    727     // from here until we've passed the starting position.  The position
    728     // we stop on will be the first break position after the specified one.
    729     // old rule syntax
    730 
    731     utext_setNativeIndex(fText, offset);
    732     if (offset==0 ||
    733         (offset==1  && utext_getNativeIndex(fText)==0)) {
    734         return next();
    735     }
    736     result = previous();
    737 
    738     while (result != BreakIterator::DONE && result <= offset) {
    739         result = next();
    740     }
    741 
    742     return result;
    743 }
    744 
    745 /**
    746  * Sets the iterator to refer to the last boundary position before the
    747  * specified position.
    748  * @offset The position to begin searching for a break from.
    749  * @return The position of the last boundary before the starting position.
    750  */
    751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    752     // if we have cached break positions and offset is in the range
    753     // covered by them, use them
    754     if (fCachedBreakPositions != NULL) {
    755         // TODO: binary search?
    756         // TODO: What if offset is outside range, but break is not?
    757         if (offset > fCachedBreakPositions[0]
    758                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
    759             fPositionInCache = 0;
    760             while (fPositionInCache < fNumCachedBreakPositions
    761                    && offset > fCachedBreakPositions[fPositionInCache])
    762                 ++fPositionInCache;
    763             --fPositionInCache;
    764             // If we're at the beginning of the cache, need to reevaluate the
    765             // rule status
    766             if (fPositionInCache <= 0) {
    767                 fLastStatusIndexValid = FALSE;
    768             }
    769             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
    770             return fCachedBreakPositions[fPositionInCache];
    771         }
    772         else {
    773             reset();
    774         }
    775     }
    776 
    777     // if the offset passed in is already past the end of the text,
    778     // just return DONE; if it's before the beginning, return the
    779     // text's starting offset
    780     if (fText == NULL || offset > utext_nativeLength(fText)) {
    781         // return BreakIterator::DONE;
    782         return last();
    783     }
    784     else if (offset < 0) {
    785         return first();
    786     }
    787 
    788     // if we start by updating the current iteration position to the
    789     // position specified by the caller, we can just use previous()
    790     // to carry out this operation
    791 
    792     if (fData->fSafeFwdTable != NULL) {
    793         // new rule syntax
    794         utext_setNativeIndex(fText, offset);
    795         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    796         if (newOffset != offset) {
    797             // Will come here if specified offset was not a code point boundary AND
    798             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
    799             //   indices to the containing code point.
    800             // For breakitereator::preceding only, these non-code-point indices need to be moved
    801             //   up to refer to the following codepoint.
    802             UTEXT_NEXT32(fText);
    803             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    804         }
    805 
    806         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
    807         //        rather than adjusting the position unconditionally?
    808         //        (Change would interact with safe rules.)
    809         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
    810         //        affects only preceding(), seems cleaner, but is slightly different.
    811         UTEXT_PREVIOUS32(fText);
    812         handleNext(fData->fSafeFwdTable);
    813         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    814         while (result >= offset) {
    815             result = previous();
    816         }
    817         return result;
    818     }
    819     if (fData->fSafeRevTable != NULL) {
    820         // backup plan if forward safe table is not available
    821         //  TODO:  check whether this path can be discarded
    822         //         It's probably OK to say that rules must supply both safe tables
    823         //            if they use safe tables at all.  We have certainly never described
    824         //            to anyone how to work with just one safe table.
    825         utext_setNativeIndex(fText, offset);
    826         UTEXT_NEXT32(fText);
    827 
    828         // handle previous will give result <= offset
    829         handlePrevious(fData->fSafeRevTable);
    830 
    831         // next will give result 0 or 1 boundary away from offset,
    832         // most of the time
    833         // we have to
    834         int32_t oldresult = next();
    835         while (oldresult < offset) {
    836             int32_t result = next();
    837             if (result >= offset) {
    838                 return oldresult;
    839             }
    840             oldresult = result;
    841         }
    842         int32_t result = previous();
    843         if (result >= offset) {
    844             return previous();
    845         }
    846         return result;
    847     }
    848 
    849     // old rule syntax
    850     utext_setNativeIndex(fText, offset);
    851     return previous();
    852 }
    853 
    854 /**
    855  * Returns true if the specfied position is a boundary position.  As a side
    856  * effect, leaves the iterator pointing to the first boundary position at
    857  * or after "offset".
    858  * @param offset the offset to check.
    859  * @return True if "offset" is a boundary position.
    860  */
    861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    862     // the beginning index of the iterator is always a boundary position by definition
    863     if (offset == 0) {
    864         first();       // For side effects on current position, tag values.
    865         return TRUE;
    866     }
    867 
    868     if (offset == (int32_t)utext_nativeLength(fText)) {
    869         last();       // For side effects on current position, tag values.
    870         return TRUE;
    871     }
    872 
    873     // out-of-range indexes are never boundary positions
    874     if (offset < 0) {
    875         first();       // For side effects on current position, tag values.
    876         return FALSE;
    877     }
    878 
    879     if (offset > utext_nativeLength(fText)) {
    880         last();        // For side effects on current position, tag values.
    881         return FALSE;
    882     }
    883 
    884     // otherwise, we can use following() on the position before the specified
    885     // one and return true if the position we get back is the one the user
    886     // specified
    887     utext_previous32From(fText, offset);
    888     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    889     UBool    result  = following(backOne) == offset;
    890     return result;
    891 }
    892 
    893 /**
    894  * Returns the current iteration position.
    895  * @return The current iteration position.
    896  */
    897 int32_t RuleBasedBreakIterator::current(void) const {
    898     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    899     return pos;
    900 }
    901 
    902 //=======================================================================
    903 // implementation
    904 //=======================================================================
    905 
    906 //
    907 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    908 //                 of user text.  A variable with this enum type keeps track of where we
    909 //                 are.  The state machine only fetches user input while in the RUN mode.
    910 //
    911 enum RBBIRunMode {
    912     RBBI_START,     // state machine processing is before first char of input
    913     RBBI_RUN,       // state machine processing is in the user text
    914     RBBI_END        // state machine processing is after end of user text.
    915 };
    916 
    917 
    918 //-----------------------------------------------------------------------------------
    919 //
    920 //  handleNext(stateTable)
    921 //     This method is the actual implementation of the rbbi next() method.
    922 //     This method initializes the state machine to state 1
    923 //     and advances through the text character by character until we reach the end
    924 //     of the text or the state machine transitions to state 0.  We update our return
    925 //     value every time the state machine passes through an accepting state.
    926 //
    927 //-----------------------------------------------------------------------------------
    928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    929     int32_t             state;
    930     int16_t             category        = 0;
    931     RBBIRunMode         mode;
    932 
    933     RBBIStateTableRow  *row;
    934     UChar32             c;
    935     int32_t             lookaheadStatus = 0;
    936     int32_t             lookaheadTagIdx = 0;
    937     int32_t             result          = 0;
    938     int32_t             initialPosition = 0;
    939     int32_t             lookaheadResult = 0;
    940     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
    941     const char         *tableData       = statetable->fTableData;
    942     uint32_t            tableRowLen     = statetable->fRowLen;
    943 
    944     #ifdef RBBI_DEBUG
    945         if (fTrace) {
    946             RBBIDebugPuts("Handle Next   pos   char  state category");
    947         }
    948     #endif
    949 
    950     // No matter what, handleNext alway correctly sets the break tag value.
    951     fLastStatusIndexValid = TRUE;
    952     fLastRuleStatusIndex = 0;
    953 
    954     // if we're already at the end of the text, return DONE.
    955     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    956     result          = initialPosition;
    957     c               = UTEXT_NEXT32(fText);
    958     if (fData == NULL || c==U_SENTINEL) {
    959         return BreakIterator::DONE;
    960     }
    961 
    962     //  Set the initial state for the state machine
    963     state = START_STATE;
    964     row = (RBBIStateTableRow *)
    965             //(statetable->fTableData + (statetable->fRowLen * state));
    966             (tableData + tableRowLen * state);
    967 
    968 
    969     mode     = RBBI_RUN;
    970     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    971         category = 2;
    972         mode     = RBBI_START;
    973     }
    974 
    975 
    976     // loop until we reach the end of the text or transition to state 0
    977     //
    978     for (;;) {
    979         if (c == U_SENTINEL) {
    980             // Reached end of input string.
    981             if (mode == RBBI_END) {
    982                 // We have already run the loop one last time with the
    983                 //   character set to the psueudo {eof} value.  Now it is time
    984                 //   to unconditionally bail out.
    985                 if (lookaheadResult > result) {
    986                     // We ran off the end of the string with a pending look-ahead match.
    987                     // Treat this as if the look-ahead condition had been met, and return
    988                     //  the match at the / position from the look-ahead rule.
    989                     result               = lookaheadResult;
    990                     fLastRuleStatusIndex = lookaheadTagIdx;
    991                     lookaheadStatus = 0;
    992                 }
    993                 break;
    994             }
    995             // Run the loop one last time with the fake end-of-input character category.
    996             mode = RBBI_END;
    997             category = 1;
    998         }
    999 
   1000         //
   1001         // Get the char category.  An incoming category of 1 or 2 means that
   1002         //      we are preset for doing the beginning or end of input, and
   1003         //      that we shouldn't get a category from an actual text input character.
   1004         //
   1005         if (mode == RBBI_RUN) {
   1006             // look up the current character's character category, which tells us
   1007             // which column in the state table to look at.
   1008             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1009             //        not the size of the character going in, which is a UChar32.
   1010             //
   1011             UTRIE_GET16(&fData->fTrie, c, category);
   1012 
   1013             // Check the dictionary bit in the character's category.
   1014             //    Counter is only used by dictionary based iterators (subclasses).
   1015             //    Chars that need to be handled by a dictionary have a flag bit set
   1016             //    in their category values.
   1017             //
   1018             if ((category & 0x4000) != 0)  {
   1019                 fDictionaryCharCount++;
   1020                 //  And off the dictionary flag bit.
   1021                 category &= ~0x4000;
   1022             }
   1023         }
   1024 
   1025         #ifdef RBBI_DEBUG
   1026             if (fTrace) {
   1027                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
   1028                 if (0x20<=c && c<0x7f) {
   1029                     RBBIDebugPrintf("\"%c\"  ", c);
   1030                 } else {
   1031                     RBBIDebugPrintf("%5x  ", c);
   1032                 }
   1033                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1034             }
   1035         #endif
   1036 
   1037         // State Transition - move machine to its next state
   1038         //
   1039         state = row->fNextState[category];
   1040         row = (RBBIStateTableRow *)
   1041             // (statetable->fTableData + (statetable->fRowLen * state));
   1042             (tableData + tableRowLen * state);
   1043 
   1044 
   1045         if (row->fAccepting == -1) {
   1046             // Match found, common case.
   1047             if (mode != RBBI_START) {
   1048                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1049             }
   1050             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
   1051         }
   1052 
   1053         if (row->fLookAhead != 0) {
   1054             if (lookaheadStatus != 0
   1055                 && row->fAccepting == lookaheadStatus) {
   1056                 // Lookahead match is completed.
   1057                 result               = lookaheadResult;
   1058                 fLastRuleStatusIndex = lookaheadTagIdx;
   1059                 lookaheadStatus      = 0;
   1060                 // TODO:  make a standalone hard break in a rule work.
   1061                 if (lookAheadHardBreak) {
   1062                     UTEXT_SETNATIVEINDEX(fText, result);
   1063                     return result;
   1064                 }
   1065                 // Look-ahead completed, but other rules may match further.  Continue on
   1066                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1067                 goto continueOn;
   1068             }
   1069 
   1070             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1071             lookaheadResult = r;
   1072             lookaheadStatus = row->fLookAhead;
   1073             lookaheadTagIdx = row->fTagIdx;
   1074             goto continueOn;
   1075         }
   1076 
   1077 
   1078         if (row->fAccepting != 0) {
   1079             // Because this is an accepting state, any in-progress look-ahead match
   1080             //   is no longer relavant.  Clear out the pending lookahead status.
   1081             lookaheadStatus = 0;           // clear out any pending look-ahead match.
   1082         }
   1083 
   1084 continueOn:
   1085         if (state == STOP_STATE) {
   1086             // This is the normal exit from the lookup state machine.
   1087             // We have advanced through the string until it is certain that no
   1088             //   longer match is possible, no matter what characters follow.
   1089             break;
   1090         }
   1091 
   1092         // Advance to the next character.
   1093         // If this is a beginning-of-input loop iteration, don't advance
   1094         //    the input position.  The next iteration will be processing the
   1095         //    first real input character.
   1096         if (mode == RBBI_RUN) {
   1097             c = UTEXT_NEXT32(fText);
   1098         } else {
   1099             if (mode == RBBI_START) {
   1100                 mode = RBBI_RUN;
   1101             }
   1102         }
   1103 
   1104 
   1105     }
   1106 
   1107     // The state machine is done.  Check whether it found a match...
   1108 
   1109     // If the iterator failed to advance in the match engine, force it ahead by one.
   1110     //   (This really indicates a defect in the break rules.  They should always match
   1111     //    at least one character.)
   1112     if (result == initialPosition) {
   1113         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1114         UTEXT_NEXT32(fText);
   1115         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1116     }
   1117 
   1118     // Leave the iterator at our result position.
   1119     UTEXT_SETNATIVEINDEX(fText, result);
   1120     #ifdef RBBI_DEBUG
   1121         if (fTrace) {
   1122             RBBIDebugPrintf("result = %d\n\n", result);
   1123         }
   1124     #endif
   1125     return result;
   1126 }
   1127 
   1128 
   1129 
   1130 //-----------------------------------------------------------------------------------
   1131 //
   1132 //  handlePrevious()
   1133 //
   1134 //      Iterate backwards, according to the logic of the reverse rules.
   1135 //      This version handles the exact style backwards rules.
   1136 //
   1137 //      The logic of this function is very similar to handleNext(), above.
   1138 //
   1139 //-----------------------------------------------------------------------------------
   1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
   1141     int32_t             state;
   1142     int16_t             category        = 0;
   1143     RBBIRunMode         mode;
   1144     RBBIStateTableRow  *row;
   1145     UChar32             c;
   1146     int32_t             lookaheadStatus = 0;
   1147     int32_t             result          = 0;
   1148     int32_t             initialPosition = 0;
   1149     int32_t             lookaheadResult = 0;
   1150     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
   1151 
   1152     #ifdef RBBI_DEBUG
   1153         if (fTrace) {
   1154             RBBIDebugPuts("Handle Previous   pos   char  state category");
   1155         }
   1156     #endif
   1157 
   1158     // handlePrevious() never gets the rule status.
   1159     // Flag the status as invalid; if the user ever asks for status, we will need
   1160     // to back up, then re-find the break position using handleNext(), which does
   1161     // get the status value.
   1162     fLastStatusIndexValid = FALSE;
   1163     fLastRuleStatusIndex = 0;
   1164 
   1165     // if we're already at the start of the text, return DONE.
   1166     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
   1167         return BreakIterator::DONE;
   1168     }
   1169 
   1170     //  Set up the starting char.
   1171     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1172     result          = initialPosition;
   1173     c               = UTEXT_PREVIOUS32(fText);
   1174 
   1175     //  Set the initial state for the state machine
   1176     state = START_STATE;
   1177     row = (RBBIStateTableRow *)
   1178             (statetable->fTableData + (statetable->fRowLen * state));
   1179     category = 3;
   1180     mode     = RBBI_RUN;
   1181     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
   1182         category = 2;
   1183         mode     = RBBI_START;
   1184     }
   1185 
   1186 
   1187     // loop until we reach the start of the text or transition to state 0
   1188     //
   1189     for (;;) {
   1190         if (c == U_SENTINEL) {
   1191             // Reached end of input string.
   1192             if (mode == RBBI_END) {
   1193                 // We have already run the loop one last time with the
   1194                 //   character set to the psueudo {eof} value.  Now it is time
   1195                 //   to unconditionally bail out.
   1196                 if (lookaheadResult < result) {
   1197                     // We ran off the end of the string with a pending look-ahead match.
   1198                     // Treat this as if the look-ahead condition had been met, and return
   1199                     //  the match at the / position from the look-ahead rule.
   1200                     result               = lookaheadResult;
   1201                     lookaheadStatus = 0;
   1202                 } else if (result == initialPosition) {
   1203                     // Ran off start, no match found.
   1204                     // move one index one (towards the start, since we are doing a previous())
   1205                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1206                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
   1207                 }
   1208                 break;
   1209             }
   1210             // Run the loop one last time with the fake end-of-input character category.
   1211             mode = RBBI_END;
   1212             category = 1;
   1213         }
   1214 
   1215         //
   1216         // Get the char category.  An incoming category of 1 or 2 means that
   1217         //      we are preset for doing the beginning or end of input, and
   1218         //      that we shouldn't get a category from an actual text input character.
   1219         //
   1220         if (mode == RBBI_RUN) {
   1221             // look up the current character's character category, which tells us
   1222             // which column in the state table to look at.
   1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
   1224             //        not the size of the character going in, which is a UChar32.
   1225             //
   1226             UTRIE_GET16(&fData->fTrie, c, category);
   1227 
   1228             // Check the dictionary bit in the character's category.
   1229             //    Counter is only used by dictionary based iterators (subclasses).
   1230             //    Chars that need to be handled by a dictionary have a flag bit set
   1231             //    in their category values.
   1232             //
   1233             if ((category & 0x4000) != 0)  {
   1234                 fDictionaryCharCount++;
   1235                 //  And off the dictionary flag bit.
   1236                 category &= ~0x4000;
   1237             }
   1238         }
   1239 
   1240         #ifdef RBBI_DEBUG
   1241             if (fTrace) {
   1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
   1243                 if (0x20<=c && c<0x7f) {
   1244                     RBBIDebugPrintf("\"%c\"  ", c);
   1245                 } else {
   1246                     RBBIDebugPrintf("%5x  ", c);
   1247                 }
   1248                 RBBIDebugPrintf("%3d  %3d\n", state, category);
   1249             }
   1250         #endif
   1251 
   1252         // State Transition - move machine to its next state
   1253         //
   1254         state = row->fNextState[category];
   1255         row = (RBBIStateTableRow *)
   1256             (statetable->fTableData + (statetable->fRowLen * state));
   1257 
   1258         if (row->fAccepting == -1) {
   1259             // Match found, common case.
   1260             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1261         }
   1262 
   1263         if (row->fLookAhead != 0) {
   1264             if (lookaheadStatus != 0
   1265                 && row->fAccepting == lookaheadStatus) {
   1266                 // Lookahead match is completed.
   1267                 result               = lookaheadResult;
   1268                 lookaheadStatus      = 0;
   1269                 // TODO:  make a standalone hard break in a rule work.
   1270                 if (lookAheadHardBreak) {
   1271                     UTEXT_SETNATIVEINDEX(fText, result);
   1272                     return result;
   1273                 }
   1274                 // Look-ahead completed, but other rules may match further.  Continue on
   1275                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
   1276                 goto continueOn;
   1277             }
   1278 
   1279             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1280             lookaheadResult = r;
   1281             lookaheadStatus = row->fLookAhead;
   1282             goto continueOn;
   1283         }
   1284 
   1285 
   1286         if (row->fAccepting != 0) {
   1287             // Because this is an accepting state, any in-progress look-ahead match
   1288             //   is no longer relavant.  Clear out the pending lookahead status.
   1289             lookaheadStatus = 0;
   1290         }
   1291 
   1292 continueOn:
   1293         if (state == STOP_STATE) {
   1294             // This is the normal exit from the lookup state machine.
   1295             // We have advanced through the string until it is certain that no
   1296             //   longer match is possible, no matter what characters follow.
   1297             break;
   1298         }
   1299 
   1300         // Move (backwards) to the next character to process.
   1301         // If this is a beginning-of-input loop iteration, don't advance
   1302         //    the input position.  The next iteration will be processing the
   1303         //    first real input character.
   1304         if (mode == RBBI_RUN) {
   1305             c = UTEXT_PREVIOUS32(fText);
   1306         } else {
   1307             if (mode == RBBI_START) {
   1308                 mode = RBBI_RUN;
   1309             }
   1310         }
   1311     }
   1312 
   1313     // The state machine is done.  Check whether it found a match...
   1314 
   1315     // If the iterator failed to advance in the match engine, force it ahead by one.
   1316     //   (This really indicates a defect in the break rules.  They should always match
   1317     //    at least one character.)
   1318     if (result == initialPosition) {
   1319         UTEXT_SETNATIVEINDEX(fText, initialPosition);
   1320         UTEXT_PREVIOUS32(fText);
   1321         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1322     }
   1323 
   1324     // Leave the iterator at our result position.
   1325     UTEXT_SETNATIVEINDEX(fText, result);
   1326     #ifdef RBBI_DEBUG
   1327         if (fTrace) {
   1328             RBBIDebugPrintf("result = %d\n\n", result);
   1329         }
   1330     #endif
   1331     return result;
   1332 }
   1333 
   1334 
   1335 void
   1336 RuleBasedBreakIterator::reset()
   1337 {
   1338     if (fCachedBreakPositions) {
   1339         uprv_free(fCachedBreakPositions);
   1340     }
   1341     fCachedBreakPositions = NULL;
   1342     fNumCachedBreakPositions = 0;
   1343     fDictionaryCharCount = 0;
   1344     fPositionInCache = 0;
   1345 }
   1346 
   1347 
   1348 
   1349 //-------------------------------------------------------------------------------
   1350 //
   1351 //   getRuleStatus()   Return the break rule tag associated with the current
   1352 //                     iterator position.  If the iterator arrived at its current
   1353 //                     position by iterating forwards, the value will have been
   1354 //                     cached by the handleNext() function.
   1355 //
   1356 //                     If no cached status value is available, the status is
   1357 //                     found by doing a previous() followed by a next(), which
   1358 //                     leaves the iterator where it started, and computes the
   1359 //                     status while doing the next().
   1360 //
   1361 //-------------------------------------------------------------------------------
   1362 void RuleBasedBreakIterator::makeRuleStatusValid() {
   1363     if (fLastStatusIndexValid == FALSE) {
   1364         //  No cached status is available.
   1365         if (fText == NULL || current() == 0) {
   1366             //  At start of text, or there is no text.  Status is always zero.
   1367             fLastRuleStatusIndex = 0;
   1368             fLastStatusIndexValid = TRUE;
   1369         } else {
   1370             //  Not at start of text.  Find status the tedious way.
   1371             int32_t pa = current();
   1372             previous();
   1373             if (fNumCachedBreakPositions > 0) {
   1374                 reset();                // Blow off the dictionary cache
   1375             }
   1376             int32_t pb = next();
   1377             if (pa != pb) {
   1378                 // note: the if (pa != pb) test is here only to eliminate warnings for
   1379                 //       unused local variables on gcc.  Logically, it isn't needed.
   1380                 U_ASSERT(pa == pb);
   1381             }
   1382         }
   1383     }
   1384     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
   1385 }
   1386 
   1387 
   1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1389     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1390     nonConstThis->makeRuleStatusValid();
   1391 
   1392     // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1393     //                                                 (the number of status values.)
   1394     //   This function returns the last (largest) of the array of status values.
   1395     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
   1396     int32_t  tagVal = fData->fRuleStatusTable[idx];
   1397 
   1398     return tagVal;
   1399 }
   1400 
   1401 
   1402 
   1403 
   1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1405              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
   1406 {
   1407     if (U_FAILURE(status)) {
   1408         return 0;
   1409     }
   1410 
   1411     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
   1412     nonConstThis->makeRuleStatusValid();
   1413     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
   1414     int32_t  numValsToCopy = numVals;
   1415     if (numVals > capacity) {
   1416         status = U_BUFFER_OVERFLOW_ERROR;
   1417         numValsToCopy = capacity;
   1418     }
   1419     int i;
   1420     for (i=0; i<numValsToCopy; i++) {
   1421         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
   1422     }
   1423     return numVals;
   1424 }
   1425 
   1426 
   1427 
   1428 //-------------------------------------------------------------------------------
   1429 //
   1430 //   getBinaryRules        Access to the compiled form of the rules,
   1431 //                         for use by build system tools that save the data
   1432 //                         for standard iterator types.
   1433 //
   1434 //-------------------------------------------------------------------------------
   1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1436     const uint8_t  *retPtr = NULL;
   1437     length = 0;
   1438 
   1439     if (fData != NULL) {
   1440         retPtr = (const uint8_t *)fData->fHeader;
   1441         length = fData->fHeader->fLength;
   1442     }
   1443     return retPtr;
   1444 }
   1445 
   1446 
   1447 
   1448 
   1449 //-------------------------------------------------------------------------------
   1450 //
   1451 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
   1452 //                    Saving one heap allocation isn't worth the trouble.
   1453 //                    Cloning shouldn't be done in tight loops, and
   1454 //                    making the clone copy involves other heap operations anyway.
   1455 //                    And the application code for correctly dealing with buffer
   1456 //                    size problems and the eventual object destruction is ugly.
   1457 //
   1458 //-------------------------------------------------------------------------------
   1459 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
   1460                                    int32_t &bufferSize,
   1461                                    UErrorCode &status)
   1462 {
   1463     if (U_FAILURE(status)){
   1464         return NULL;
   1465     }
   1466 
   1467     //
   1468     //  If user buffer size is zero this is a preflight operation to
   1469     //    obtain the needed buffer size, allowing for worst case misalignment.
   1470     //
   1471     if (bufferSize == 0) {
   1472         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
   1473         return NULL;
   1474     }
   1475 
   1476 
   1477     //
   1478     //  Check the alignment and size of the user supplied buffer.
   1479     //  Allocate heap memory if the user supplied memory is insufficient.
   1480     //
   1481     char    *buf   = (char *)stackBuffer;
   1482     uint32_t s      = bufferSize;
   1483 
   1484     if (stackBuffer == NULL) {
   1485         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
   1486     }
   1487     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
   1488         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
   1489         s   -= offsetUp;
   1490         buf += offsetUp;
   1491     }
   1492     if (s < sizeof(RuleBasedBreakIterator)) {
   1493         // Not enough room in the caller-supplied buffer.
   1494         // Do a plain-vanilla heap based clone and return that, along with
   1495         //   a warning that the clone was allocated.
   1496         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
   1497         if (clonedBI == 0) {
   1498             status = U_MEMORY_ALLOCATION_ERROR;
   1499         } else {
   1500             status = U_SAFECLONE_ALLOCATED_WARNING;
   1501         }
   1502         return clonedBI;
   1503     }
   1504 
   1505     //
   1506     //  Clone the source BI into the caller-supplied buffer.
   1507     //    TODO:  using an overloaded operator new to directly initialize the
   1508     //           copy in the user's buffer would be better, but it doesn't seem
   1509     //           to get along with namespaces.  Investigate why.
   1510     //
   1511     //           The memcpy is only safe with an empty (default constructed)
   1512     //           break iterator.  Use on others can screw up reference counts
   1513     //           to data.  memcpy-ing objects is not really a good idea...
   1514     //
   1515     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
   1516     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
   1517     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
   1518     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
   1519     *clone = *this;               // clone = the real BI we want.
   1520     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
   1521 
   1522     return clone;
   1523 }
   1524 
   1525 
   1526 //-------------------------------------------------------------------------------
   1527 //
   1528 //  isDictionaryChar      Return true if the category lookup for this char
   1529 //                        indicates that it is in the set of dictionary lookup
   1530 //                        chars.
   1531 //
   1532 //                        This function is intended for use by dictionary based
   1533 //                        break iterators.
   1534 //
   1535 //-------------------------------------------------------------------------------
   1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
   1537     if (fData == NULL) {
   1538         return FALSE;
   1539     }
   1540     uint16_t category;
   1541     UTRIE_GET16(&fData->fTrie, c, category);
   1542     return (category & 0x4000) != 0;
   1543 }*/
   1544 
   1545 
   1546 //-------------------------------------------------------------------------------
   1547 //
   1548 //  checkDictionary       This function handles all processing of characters in
   1549 //                        the "dictionary" set. It will determine the appropriate
   1550 //                        course of action, and possibly set up a cache in the
   1551 //                        process.
   1552 //
   1553 //-------------------------------------------------------------------------------
   1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
   1555                             int32_t endPos,
   1556                             UBool reverse) {
   1557     // Reset the old break cache first.
   1558     uint32_t dictionaryCount = fDictionaryCharCount;
   1559     reset();
   1560 
   1561     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
   1562         return (reverse ? startPos : endPos);
   1563     }
   1564 
   1565     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
   1566     //      because native indexes are different from UTF-16 indexes.
   1567     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
   1568     //      It wont give the right breaks, but it's better than a crash.
   1569     //
   1570     //      Check the type of the UText by checking its pFuncs field, which
   1571     //      is UText's function dispatch table.  It will be the same for all
   1572     //      UTF-8 UTexts and different for any other UText type.
   1573     //
   1574     //      We have no other type of UText available with non-UTF-16 native indexing.
   1575     //      This whole check will go away once the dictionary code is fixed.
   1576     static const void *utext_utf8Funcs;
   1577     if (utext_utf8Funcs == NULL) {
   1578         // Cache the UTF-8 UText function pointer value.
   1579         UErrorCode status = U_ZERO_ERROR;
   1580         UText tempUText = UTEXT_INITIALIZER;
   1581         utext_openUTF8(&tempUText, NULL, 0, &status);
   1582         utext_utf8Funcs = tempUText.pFuncs;
   1583         utext_close(&tempUText);
   1584     }
   1585     if (fText->pFuncs == utext_utf8Funcs) {
   1586         return (reverse ? startPos : endPos);
   1587     }
   1588 
   1589     // Starting from the starting point, scan towards the proposed result,
   1590     // looking for the first dictionary character (which may be the one
   1591     // we're on, if we're starting in the middle of a range).
   1592     utext_setNativeIndex(fText, reverse ? endPos : startPos);
   1593     if (reverse) {
   1594         UTEXT_PREVIOUS32(fText);
   1595     }
   1596 
   1597     int32_t rangeStart = startPos;
   1598     int32_t rangeEnd = endPos;
   1599 
   1600     uint16_t    category;
   1601     int32_t     current;
   1602     UErrorCode  status = U_ZERO_ERROR;
   1603     UStack      breaks(status);
   1604     int32_t     foundBreakCount = 0;
   1605     UChar32     c = utext_current32(fText);
   1606 
   1607     UTRIE_GET16(&fData->fTrie, c, category);
   1608 
   1609     // Is the character we're starting on a dictionary character? If so, we
   1610     // need to back up to include the entire run; otherwise the results of
   1611     // the break algorithm will differ depending on where we start. Since
   1612     // the result is cached and there is typically a non-dictionary break
   1613     // within a small number of words, there should be little performance impact.
   1614     if (category & 0x4000) {
   1615         if (reverse) {
   1616             do {
   1617                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
   1618                 c = utext_current32(fText);
   1619                 UTRIE_GET16(&fData->fTrie, c, category);
   1620             } while (c != U_SENTINEL && (category & 0x4000));
   1621             // Back up to the last dictionary character
   1622             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1623             if (c == U_SENTINEL) {
   1624                 // c = fText->last32();
   1625                 //   TODO:  why was this if needed?
   1626                 c = UTEXT_PREVIOUS32(fText);
   1627             }
   1628             else {
   1629                 c = UTEXT_PREVIOUS32(fText);
   1630             }
   1631         }
   1632         else {
   1633             do {
   1634                 c = UTEXT_PREVIOUS32(fText);
   1635                 UTRIE_GET16(&fData->fTrie, c, category);
   1636             }
   1637             while (c != U_SENTINEL && (category & 0x4000));
   1638             // Back up to the last dictionary character
   1639             if (c == U_SENTINEL) {
   1640                 // c = fText->first32();
   1641                 c = utext_current32(fText);
   1642             }
   1643             else {
   1644                 utext_next32(fText);
   1645                 c = utext_current32(fText);
   1646             }
   1647             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
   1648         }
   1649         UTRIE_GET16(&fData->fTrie, c, category);
   1650     }
   1651 
   1652     // Loop through the text, looking for ranges of dictionary characters.
   1653     // For each span, find the appropriate break engine, and ask it to find
   1654     // any breaks within the span.
   1655     // Note: we always do this in the forward direction, so that the break
   1656     // cache is built in the right order.
   1657     if (reverse) {
   1658         utext_setNativeIndex(fText, rangeStart);
   1659         c = utext_current32(fText);
   1660         UTRIE_GET16(&fData->fTrie, c, category);
   1661     }
   1662     while(U_SUCCESS(status)) {
   1663         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
   1664             utext_next32(fText);           // TODO:  tweak for post-increment operation
   1665             c = utext_current32(fText);
   1666             UTRIE_GET16(&fData->fTrie, c, category);
   1667         }
   1668         if (current >= rangeEnd) {
   1669             break;
   1670         }
   1671 
   1672         // We now have a dictionary character. Get the appropriate language object
   1673         // to deal with it.
   1674         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
   1675 
   1676         // Ask the language object if there are any breaks. It will leave the text
   1677         // pointer on the other side of its range, ready to search for the next one.
   1678         if (lbe != NULL) {
   1679             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
   1680         }
   1681 
   1682         // Reload the loop variables for the next go-round
   1683         c = utext_current32(fText);
   1684         UTRIE_GET16(&fData->fTrie, c, category);
   1685     }
   1686 
   1687     // If we found breaks, build a new break cache. The first and last entries must
   1688     // be the original starting and ending position.
   1689     if (foundBreakCount > 0) {
   1690         int32_t totalBreaks = foundBreakCount;
   1691         if (startPos < breaks.elementAti(0)) {
   1692             totalBreaks += 1;
   1693         }
   1694         if (endPos > breaks.peeki()) {
   1695             totalBreaks += 1;
   1696         }
   1697         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
   1698         if (fCachedBreakPositions != NULL) {
   1699             int32_t out = 0;
   1700             fNumCachedBreakPositions = totalBreaks;
   1701             if (startPos < breaks.elementAti(0)) {
   1702                 fCachedBreakPositions[out++] = startPos;
   1703             }
   1704             for (int32_t i = 0; i < foundBreakCount; ++i) {
   1705                 fCachedBreakPositions[out++] = breaks.elementAti(i);
   1706             }
   1707             if (endPos > fCachedBreakPositions[out-1]) {
   1708                 fCachedBreakPositions[out] = endPos;
   1709             }
   1710             // If there are breaks, then by definition, we are replacing the original
   1711             // proposed break by one of the breaks we found. Use following() and
   1712             // preceding() to do the work. They should never recurse in this case.
   1713             if (reverse) {
   1714                 return preceding(endPos - 1);
   1715             }
   1716             else {
   1717                 return following(startPos);
   1718             }
   1719         }
   1720         // If the allocation failed, just fall through to the "no breaks found" case.
   1721     }
   1722 
   1723     // If we get here, there were no language-based breaks. Set the text pointer
   1724     // to the original proposed break.
   1725     utext_setNativeIndex(fText, reverse ? startPos : endPos);
   1726     return (reverse ? startPos : endPos);
   1727 }
   1728 
   1729 U_NAMESPACE_END
   1730 
   1731 // defined in ucln_cmn.h
   1732 
   1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
   1734 
   1735 /**
   1736  * Release all static memory held by breakiterator.
   1737  */
   1738 U_CDECL_BEGIN
   1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
   1740     if (gLanguageBreakFactories) {
   1741         delete gLanguageBreakFactories;
   1742         gLanguageBreakFactories = NULL;
   1743     }
   1744     return TRUE;
   1745 }
   1746 U_CDECL_END
   1747 
   1748 U_CDECL_BEGIN
   1749 static void U_CALLCONV _deleteFactory(void *obj) {
   1750     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
   1751 }
   1752 U_CDECL_END
   1753 U_NAMESPACE_BEGIN
   1754 
   1755 static const LanguageBreakEngine*
   1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
   1757 {
   1758     UBool       needsInit;
   1759     UErrorCode  status = U_ZERO_ERROR;
   1760     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
   1761 
   1762     if (needsInit) {
   1763         UStack  *factories = new UStack(_deleteFactory, NULL, status);
   1764         if (factories != NULL && U_SUCCESS(status)) {
   1765             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
   1766             factories->push(builtIn, status);
   1767 #ifdef U_LOCAL_SERVICE_HOOK
   1768             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1769             if (extra != NULL) {
   1770                 factories->push(extra, status);
   1771             }
   1772 #endif
   1773         }
   1774         umtx_lock(NULL);
   1775         if (gLanguageBreakFactories == NULL) {
   1776             gLanguageBreakFactories = factories;
   1777             factories = NULL;
   1778             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
   1779         }
   1780         umtx_unlock(NULL);
   1781         delete factories;
   1782     }
   1783 
   1784     if (gLanguageBreakFactories == NULL) {
   1785         return NULL;
   1786     }
   1787 
   1788     int32_t i = gLanguageBreakFactories->size();
   1789     const LanguageBreakEngine *lbe = NULL;
   1790     while (--i >= 0) {
   1791         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
   1792         lbe = factory->getEngineFor(c, breakType);
   1793         if (lbe != NULL) {
   1794             break;
   1795         }
   1796     }
   1797     return lbe;
   1798 }
   1799 
   1800 
   1801 //-------------------------------------------------------------------------------
   1802 //
   1803 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1804 //                          the characer c.
   1805 //
   1806 //-------------------------------------------------------------------------------
   1807 const LanguageBreakEngine *
   1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
   1809     const LanguageBreakEngine *lbe = NULL;
   1810     UErrorCode status = U_ZERO_ERROR;
   1811 
   1812     if (fLanguageBreakEngines == NULL) {
   1813         fLanguageBreakEngines = new UStack(status);
   1814         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
   1815             delete fLanguageBreakEngines;
   1816             fLanguageBreakEngines = 0;
   1817             return NULL;
   1818         }
   1819     }
   1820 
   1821     int32_t i = fLanguageBreakEngines->size();
   1822     while (--i >= 0) {
   1823         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
   1824         if (lbe->handles(c, fBreakType)) {
   1825             return lbe;
   1826         }
   1827     }
   1828 
   1829     // No existing dictionary took the character. See if a factory wants to
   1830     // give us a new LanguageBreakEngine for this character.
   1831     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
   1832 
   1833     // If we got one, use it and push it on our stack.
   1834     if (lbe != NULL) {
   1835         fLanguageBreakEngines->push((void *)lbe, status);
   1836         // Even if we can't remember it, we can keep looking it up, so
   1837         // return it even if the push fails.
   1838         return lbe;
   1839     }
   1840 
   1841     // No engine is forthcoming for this character. Add it to the
   1842     // reject set. Create the reject break engine if needed.
   1843     if (fUnhandledBreakEngine == NULL) {
   1844         fUnhandledBreakEngine = new UnhandledEngine(status);
   1845         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
   1846             status = U_MEMORY_ALLOCATION_ERROR;
   1847         }
   1848         // Put it last so that scripts for which we have an engine get tried
   1849         // first.
   1850         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1851         // If we can't insert it, or creation failed, get rid of it
   1852         if (U_FAILURE(status)) {
   1853             delete fUnhandledBreakEngine;
   1854             fUnhandledBreakEngine = 0;
   1855             return NULL;
   1856         }
   1857     }
   1858 
   1859     // Tell the reject engine about the character; at its discretion, it may
   1860     // add more than just the one character.
   1861     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
   1862 
   1863     return fUnhandledBreakEngine;
   1864 }
   1865 
   1866 
   1867 
   1868 /*int32_t RuleBasedBreakIterator::getBreakType() const {
   1869     return fBreakType;
   1870 }*/
   1871 
   1872 void RuleBasedBreakIterator::setBreakType(int32_t type) {
   1873     fBreakType = type;
   1874     reset();
   1875 }
   1876 
   1877 U_NAMESPACE_END
   1878 
   1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1880