Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // file: rbbi_cache.h
      5 //
      6 #ifndef RBBI_CACHE_H
      7 #define RBBI_CACHE_H
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_BREAK_ITERATION
     12 
     13 #include "unicode/rbbi.h"
     14 #include "unicode/uobject.h"
     15 
     16 #include "uvectr32.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
     21  *                 Dictionary boundaries are moved first to this cache, then from here
     22  *                 to the main BreakCache, where they may inter-leave with non-dictionary
     23  *                 boundaries. The public BreakIterator API always fetches directly
     24  *                 from the main BreakCache, not from here.
     25  *
     26  *                 In common situations, the number of boundaries in a single dictionary run
     27  *                 should be quite small, it will be terminated by punctuation, spaces,
     28  *                 or any other non-dictionary characters. The main BreakCache may end
     29  *                 up with boundaries from multiple dictionary based runs.
     30  *
     31  *                 The boundaries are stored in a simple ArrayList (vector), with the
     32  *                 assumption that they will be accessed sequentially.
     33  */
     34 class RuleBasedBreakIterator::DictionaryCache: public UMemory {
     35   public:
     36      DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
     37      ~DictionaryCache();
     38 
     39      void reset();
     40 
     41      UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
     42      UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
     43 
     44     /**
     45      * Populate the cache with the dictionary based boundaries within a region of text.
     46      * @param startPos  The start position of a range of text
     47      * @param endPos    The end position of a range of text
     48      * @param firstRuleStatus The rule status index that applies to the break at startPos
     49      * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
     50      * @internal
     51      */
     52     void populateDictionary(int32_t startPos, int32_t endPos,
     53                          int32_t firstRuleStatus, int32_t otherRuleStatus);
     54 
     55 
     56 
     57     RuleBasedBreakIterator *fBI;
     58 
     59     UVector32           fBreaks;                // A vector containing the boundaries.
     60     int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
     61                                                 //    or preceding(). Optimizes sequential access.
     62     int32_t             fStart;                 // Text position of first boundary in cache.
     63     int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the
     64                                                 //    text segment being handled by the dictionary.
     65     int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary.
     66     int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
     67 };
     68 
     69 
     70 /*
     71  * class BreakCache
     72  *
     73  * Cache of break boundary positions and rule status values.
     74  * Break iterator API functions, next(), previous(), etc., will use cached results
     75  * when possible, and otherwise cache new results as they are obtained.
     76  *
     77  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
     78  *
     79  * The cache is implemented as a single circular buffer.
     80  */
     81 
     82 /*
     83  * size of the circular cache buffer.
     84  */
     85 
     86 class RuleBasedBreakIterator::BreakCache: public UMemory {
     87   public:
     88                 BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
     89     virtual     ~BreakCache();
     90     void        reset(int32_t pos = 0, int32_t ruleStatus = 0);
     91     void        next() {    if (fBufIdx == fEndBufIdx) {
     92                                 nextOL();
     93                             } else {
     94                                 fBufIdx = modChunkSize(fBufIdx + 1);
     95                                 fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
     96                                 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
     97                             }
     98                 };
     99 
    100 
    101     void        nextOL();
    102     void        previous(UErrorCode &status);
    103 
    104     // Move the iteration state to the position following the startPosition.
    105     // Input position must be pinned to the input length.
    106     void        following(int32_t startPosition, UErrorCode &status);
    107 
    108     void        preceding(int32_t startPosition, UErrorCode &status);
    109 
    110     /*
    111      * Update the state of the public BreakIterator (fBI) to reflect the
    112      * current state of the break iterator cache (this).
    113      */
    114     int32_t     current();
    115 
    116     /**
    117      * Add boundaries to the cache near the specified position.
    118      * The given position need not be a boundary itself.
    119      * The input position must be within the range of the text, and
    120      * on a code point boundary.
    121      * If the requested position is a break boundary, leave the iteration
    122      * position on it.
    123      * If the requested position is not a boundary, leave the iteration
    124      * position on the preceding boundary and include both the
    125      * preceding and following boundaries in the cache.
    126      * Additional boundaries, either preceding or following, may be added
    127      * to the cache as a side effect.
    128      *
    129      * Return FALSE if the operation failed.
    130      */
    131     UBool populateNear(int32_t position, UErrorCode &status);
    132 
    133     /**
    134      *  Add boundary(s) to the cache following the current last boundary.
    135      *  Return FALSE if at the end of the text, and no more boundaries can be added.
    136      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
    137      */
    138     UBool populateFollowing();
    139 
    140     /**
    141      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
    142      *  Leave the iteration position on the first added boundary.
    143      *  Return false if no boundaries could be added (if at the start of the text.)
    144      */
    145     UBool populatePreceding(UErrorCode &status);
    146 
    147     enum UpdatePositionValues {
    148         RetainCachePosition = 0,
    149         UpdateCachePosition = 1
    150     };
    151 
    152     /*
    153      * Add the boundary following the current position.
    154      * The current position can be left as it was, or changed to the newly added boundary,
    155      * as specified by the update parameter.
    156      */
    157     void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
    158 
    159 
    160     /*
    161      * Add the boundary preceding the current position.
    162      * The current position can be left as it was, or changed to the newly added boundary,
    163      * as specified by the update parameter.
    164      */
    165     bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
    166 
    167     /**
    168      *  Set the cache position to the specified position, or, if the position
    169      *  falls between to cached boundaries, to the preceding boundary.
    170      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
    171      *  The startPosition must be on a code point boundary.
    172      *
    173      *  Return TRUE if successful, FALSE if the specified position is after
    174      *  the last cached boundary or before the first.
    175      */
    176     UBool                   seek(int32_t startPosition);
    177 
    178     void dumpCache();
    179 
    180   private:
    181     static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
    182 
    183     static constexpr int32_t CACHE_SIZE = 128;
    184     static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
    185 
    186     RuleBasedBreakIterator *fBI;
    187     int32_t                 fStartBufIdx;
    188     int32_t                 fEndBufIdx;    // inclusive
    189 
    190     int32_t                 fTextIdx;
    191     int32_t                 fBufIdx;
    192 
    193     int32_t                 fBoundaries[CACHE_SIZE];
    194     uint16_t                fStatuses[CACHE_SIZE];
    195 
    196     UVector32               fSideBuffer;
    197 };
    198 
    199 U_NAMESPACE_END
    200 
    201 #endif // #if !UCONFIG_NO_BREAK_ITERATION
    202 
    203 #endif // RBBI_CACHE_H
    204