Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 ******************************************************************************
      6 * Copyright (C) 1996-2015, International Business Machines Corporation and
      7 * others. All Rights Reserved.
      8 ******************************************************************************
      9 */
     10 
     11 package android.icu.impl;
     12 
     13 import java.util.NoSuchElementException;
     14 
     15 import android.icu.lang.UCharacter;
     16 import android.icu.text.UTF16;
     17 import android.icu.util.RangeValueIterator;
     18 
     19 /**
     20  * <p>Class enabling iteration of the values in a Trie.</p>
     21  *
     22  * <p>2015-sep-03 TODO: Only used in test code, move there.
     23  *
     24  * <p>Result of each iteration contains the interval of codepoints that have
     25  * the same value type and the value type itself.</p>
     26  * <p>The comparison of each codepoint value is done via extract(), which the
     27  * default implementation is to return the value as it is.</p>
     28  * <p>Method extract() can be overwritten to perform manipulations on
     29  * codepoint values in order to perform specialized comparison.</p>
     30  * <p>TrieIterator is designed to be a generic iterator for the CharTrie
     31  * and the IntTrie, hence to accommodate both types of data, the return
     32  * result will be in terms of int (32 bit) values.</p>
     33  * <p>See android.icu.text.UCharacterTypeIterator for examples of use.</p>
     34  * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
     35  * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
     36  * sense, the caller will have to pass a object with a callback function
     37  * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
     38  * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
     39  * codepoints with the same value as determined by
     40  * UTrieEnumValue(const void *context, uint32_t value). for each range,
     41  * utrie_enum calls the callback function to perform a task. In this way,
     42  * icu4c performs the iteration within utrie_enum.
     43  * To follow the JDK model, icu4j is slightly different from icu4c.
     44  * Instead of requesting the caller to implement an object for a callback.
     45  * The caller will have to implement a subclass of TrieIterator, fleshing out
     46  * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
     47  * the caller will have to code his own iteration and flesh out the task
     48  * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
     49  * </p>
     50  * <p>There are basically 3 usage scenarios for porting:</p>
     51  * <p>1) UTrieEnumValue is the only implemented callback then just implement a
     52  * subclass of TrieIterator and override the extract(int) method. The
     53  * extract(int) method is analogus to UTrieEnumValue callback.
     54  * </p>
     55  * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
     56  * a subclass of TrieIterator, override the extract method and iterate, e.g
     57  * </p>
     58  * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
     59  *               set);<br>
     60  * In Java :<br>
     61  * <pre>
     62  * class TrieIteratorImpl extends TrieIterator{
     63  *     public TrieIteratorImpl(Trie data){
     64  *         super(data);
     65  *     }
     66  *     public int extract(int value){
     67  *         // port the implementation of _enumPropertyStartsValue here
     68  *     }
     69  * }
     70  * ....
     71  * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
     72  * while(fcdIter.next(result)) {
     73  *     // port the implementation of _enumPropertyStartsRange
     74  * }
     75  * </pre>
     76  * </p>
     77  * <p>3) UTrieEnumRange is the only implemented callback then just implement
     78  * the while loop, when utrie_enum is called
     79  * <pre>
     80  * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
     81  * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
     82  * while(fcdIter.next(result)){
     83  *     set.add(result.start);
     84  * }
     85  * </p>
     86  * @author synwee
     87  * @see android.icu.impl.Trie
     88  * @hide Only a subset of ICU is exposed in Android
     89  */
     90 public class TrieIterator implements RangeValueIterator
     91 
     92 {
     93     // public constructor ---------------------------------------------
     94 
     95     /**
     96     * TrieEnumeration constructor
     97     * @param trie to be used
     98     * @exception IllegalArgumentException throw when argument is null.
     99     */
    100     public TrieIterator(Trie trie)
    101     {
    102         if (trie == null) {
    103             throw new IllegalArgumentException(
    104                                           "Argument trie cannot be null");
    105         }
    106         m_trie_             = trie;
    107         // synwee: check that extract belongs to the child class
    108         m_initialValue_     = extract(m_trie_.getInitialValue());
    109         reset();
    110     }
    111 
    112     // public methods -------------------------------------------------
    113 
    114     /**
    115     * <p>Returns true if we are not at the end of the iteration, false
    116     * otherwise.</p>
    117     * <p>The next set of codepoints with the same value type will be
    118     * calculated during this call and returned in the arguement element.</p>
    119     * @param element return result
    120     * @return true if we are not at the end of the iteration, false otherwise.
    121     * @exception NoSuchElementException - if no more elements exist.
    122     * @see android.icu.util.RangeValueIterator.Element
    123     */
    124     @Override
    125     public final boolean next(Element element)
    126     {
    127         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
    128             return false;
    129         }
    130         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
    131             calculateNextBMPElement(element)) {
    132             return true;
    133         }
    134         calculateNextSupplementaryElement(element);
    135         return true;
    136     }
    137 
    138     /**
    139     * Resets the iterator to the beginning of the iteration
    140     */
    141     @Override
    142     public final void reset()
    143     {
    144         m_currentCodepoint_ = 0;
    145         m_nextCodepoint_    = 0;
    146         m_nextIndex_        = 0;
    147         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
    148         if (m_nextBlock_ == m_trie_.m_dataOffset_) {
    149             m_nextValue_ = m_initialValue_;
    150         }
    151         else {
    152             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
    153         }
    154         m_nextBlockIndex_ = 0;
    155         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
    156     }
    157 
    158     // protected methods ----------------------------------------------
    159 
    160     /**
    161     * Called by next() to extracts a 32 bit value from a trie value
    162     * used for comparison.
    163     * This method is to be overwritten if special manipulation is to be done
    164     * to retrieve a relevant comparison.
    165     * The default function is to return the value as it is.
    166     * @param value a value from the trie
    167     * @return extracted value
    168     */
    169     protected int extract(int value)
    170     {
    171         return value;
    172     }
    173 
    174     // private methods ------------------------------------------------
    175 
    176     /**
    177     * Set the result values
    178     * @param element return result object
    179     * @param start codepoint of range
    180     * @param limit (end + 1) codepoint of range
    181     * @param value common value of range
    182     */
    183     private final void setResult(Element element, int start, int limit,
    184                                  int value)
    185     {
    186         element.start = start;
    187         element.limit = limit;
    188         element.value = value;
    189     }
    190 
    191     /**
    192     * Finding the next element.
    193     * This method is called just before returning the result of
    194     * next().
    195     * We always store the next element before it is requested.
    196     * In the case that we have to continue calculations into the
    197     * supplementary planes, a false will be returned.
    198     * @param element return result object
    199     * @return true if the next range is found, false if we have to proceed to
    200     *         the supplementary range.
    201     */
    202     private final boolean calculateNextBMPElement(Element element)
    203     {
    204         int currentValue    = m_nextValue_;
    205         m_currentCodepoint_ = m_nextCodepoint_;
    206         m_nextCodepoint_ ++;
    207         m_nextBlockIndex_ ++;
    208         if (!checkBlockDetail(currentValue)) {
    209             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    210                       currentValue);
    211             return true;
    212         }
    213         // synwee check that next block index == 0 here
    214         // enumerate BMP - the main loop enumerates data blocks
    215         while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
    216             // because of the way the character is split to form the index
    217             // the lead surrogate and trail surrogate can not be in the
    218             // mid of a block
    219             if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
    220                 // skip lead surrogate code units,
    221                 // go to lead surrogate codepoints
    222                 m_nextIndex_ = BMP_INDEX_LENGTH_;
    223             }
    224             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
    225                 // go back to regular BMP code points
    226                 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
    227             } else {
    228                 m_nextIndex_ ++;
    229             }
    230 
    231             m_nextBlockIndex_ = 0;
    232             if (!checkBlock(currentValue)) {
    233                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    234                           currentValue);
    235                 return true;
    236             }
    237         }
    238         m_nextCodepoint_ --;   // step one back since this value has not been
    239         m_nextBlockIndex_ --;  // retrieved yet.
    240         return false;
    241     }
    242 
    243     /**
    244     * Finds the next supplementary element.
    245     * For each entry in the trie, the value to be delivered is passed through
    246     * extract().
    247     * We always store the next element before it is requested.
    248     * Called after calculateNextBMP() completes its round of BMP characters.
    249     * There is a slight difference in the usage of m_currentCodepoint_
    250     * here as compared to calculateNextBMP(). Though both represents the
    251     * lower bound of the next element, in calculateNextBMP() it gets set
    252     * at the start of any loop, where-else, in calculateNextSupplementary()
    253     * since m_currentCodepoint_ already contains the lower bound of the
    254     * next element (passed down from calculateNextBMP()), we keep it till
    255     * the end before resetting it to the new value.
    256     * Note, if there are no more iterations, it will never get to here.
    257     * Blocked out by next().
    258     * @param element return result object
    259     */
    260     private final void calculateNextSupplementaryElement(Element element)
    261     {
    262         int currentValue = m_nextValue_;
    263         m_nextCodepoint_ ++;
    264         m_nextBlockIndex_ ++;
    265 
    266         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
    267                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
    268             // this piece is only called when we are in the middle of a lead
    269             // surrogate block
    270             if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
    271                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    272                           currentValue);
    273                 m_currentCodepoint_ = m_nextCodepoint_;
    274                 return;
    275             }
    276             // we have cleared one block
    277             m_nextIndex_ ++;
    278             m_nextTrailIndexOffset_ ++;
    279             if (!checkTrailBlock(currentValue)) {
    280                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    281                           currentValue);
    282                 m_currentCodepoint_ = m_nextCodepoint_;
    283                 return;
    284             }
    285         }
    286         int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
    287         // enumerate supplementary code points
    288         while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
    289             // lead surrogate access
    290             final int leadBlock =
    291                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
    292                                                    Trie.INDEX_STAGE_2_SHIFT_;
    293             if (leadBlock == m_trie_.m_dataOffset_) {
    294                 // no entries for a whole block of lead surrogates
    295                 if (currentValue != m_initialValue_) {
    296                     m_nextValue_      = m_initialValue_;
    297                     m_nextBlock_      = leadBlock;  // == m_trie_.m_dataOffset_
    298                     m_nextBlockIndex_ = 0;
    299                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    300                               currentValue);
    301                     m_currentCodepoint_ = m_nextCodepoint_;
    302                     return;
    303                 }
    304 
    305                 nextLead += DATA_BLOCK_LENGTH_;
    306                 // number of total affected supplementary codepoints in one
    307                 // block
    308                 // this is not a simple addition of
    309                 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
    310                 // that we might have moved some of the codepoints
    311                 m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
    312                 continue;
    313             }
    314             if (m_trie_.m_dataManipulate_ == null) {
    315                 throw new NullPointerException(
    316                             "The field DataManipulate in this Trie is null");
    317             }
    318             // enumerate trail surrogates for this lead surrogate
    319             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
    320                                m_trie_.getValue(leadBlock +
    321                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
    322             if (m_nextIndex_ <= 0) {
    323                 // no data for this lead surrogate
    324                 if (currentValue != m_initialValue_) {
    325                     m_nextValue_      = m_initialValue_;
    326                     m_nextBlock_      = m_trie_.m_dataOffset_;
    327                     m_nextBlockIndex_ = 0;
    328                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    329                               currentValue);
    330                     m_currentCodepoint_ = m_nextCodepoint_;
    331                     return;
    332                 }
    333                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
    334             } else {
    335                 m_nextTrailIndexOffset_ = 0;
    336                 if (!checkTrailBlock(currentValue)) {
    337                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
    338                               currentValue);
    339                     m_currentCodepoint_ = m_nextCodepoint_;
    340                     return;
    341                 }
    342             }
    343             nextLead ++;
    344          }
    345 
    346          // deliver last range
    347          setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
    348                    currentValue);
    349     }
    350 
    351     /**
    352     * Internal block value calculations
    353     * Performs calculations on a data block to find codepoints in m_nextBlock_
    354     * after the index m_nextBlockIndex_ that has the same value.
    355     * Note m_*_ variables at this point is the next codepoint whose value
    356     * has not been calculated.
    357     * But when returned with false, it will be the last codepoint whose
    358     * value has been calculated.
    359     * @param currentValue the value which other codepoints are tested against
    360     * @return true if the whole block has the same value as currentValue or if
    361     *              the whole block has been calculated, false otherwise.
    362     */
    363     private final boolean checkBlockDetail(int currentValue)
    364     {
    365         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
    366             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
    367                                                     m_nextBlockIndex_));
    368             if (m_nextValue_ != currentValue) {
    369                 return false;
    370             }
    371             ++ m_nextBlockIndex_;
    372             ++ m_nextCodepoint_;
    373         }
    374         return true;
    375     }
    376 
    377     /**
    378     * Internal block value calculations
    379     * Performs calculations on a data block to find codepoints in m_nextBlock_
    380     * that has the same value.
    381     * Will call checkBlockDetail() if highlevel check fails.
    382     * Note m_*_ variables at this point is the next codepoint whose value
    383     * has not been calculated.
    384     * @param currentBlock the initial block containing all currentValue
    385     * @param currentValue the value which other codepoints are tested against
    386     * @return true if the whole block has the same value as currentValue or if
    387     *              the whole block has been calculated, false otherwise.
    388     */
    389     private final boolean checkBlock(int currentValue)
    390     {
    391         int currentBlock = m_nextBlock_;
    392         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
    393                                                   Trie.INDEX_STAGE_2_SHIFT_;
    394         if (m_nextBlock_ == currentBlock &&
    395             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
    396             // the block is the same as the previous one, filled with
    397             // currentValue
    398             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
    399         }
    400         else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
    401             // this is the all-initial-value block
    402             if (currentValue != m_initialValue_) {
    403                 m_nextValue_      = m_initialValue_;
    404                 m_nextBlockIndex_ = 0;
    405                 return false;
    406             }
    407             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
    408         }
    409         else {
    410             if (!checkBlockDetail(currentValue)) {
    411                 return false;
    412             }
    413         }
    414         return true;
    415     }
    416 
    417     /**
    418     * Internal block value calculations
    419     * Performs calculations on multiple data blocks for a set of trail
    420     * surrogates to find codepoints in m_nextBlock_ that has the same value.
    421     * Will call checkBlock() for internal block checks.
    422     * Note m_*_ variables at this point is the next codepoint whose value
    423     * has not been calculated.
    424     * @param currentValue the value which other codepoints are tested against
    425     * @return true if the whole block has the same value as currentValue or if
    426     *              the whole block has been calculated, false otherwise.
    427     */
    428     private final boolean checkTrailBlock(int currentValue)
    429     {
    430         // enumerate code points for this lead surrogate
    431         while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
    432         {
    433             // if we ever reach here, we are at the start of a new block
    434             m_nextBlockIndex_ = 0;
    435             // copy of most of the body of the BMP loop
    436             if (!checkBlock(currentValue)) {
    437                 return false;
    438             }
    439             m_nextTrailIndexOffset_ ++;
    440             m_nextIndex_ ++;
    441         }
    442         return true;
    443     }
    444 
    445     /**
    446     * Checks if we are beginning at the start of a initial block.
    447     * If we are then the rest of the codepoints in this initial block
    448     * has the same values.
    449     * We increment m_nextCodepoint_ and relevant data members if so.
    450     * This is used only in for the supplementary codepoints because
    451     * the offset to the trail indexes could be 0.
    452     * @return true if we are at the start of a initial block.
    453     */
    454     private final boolean checkNullNextTrailIndex()
    455     {
    456         if (m_nextIndex_ <= 0) {
    457             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
    458             int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
    459             int leadBlock =
    460                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
    461                                                    Trie.INDEX_STAGE_2_SHIFT_;
    462             if (m_trie_.m_dataManipulate_ == null) {
    463                 throw new NullPointerException(
    464                             "The field DataManipulate in this Trie is null");
    465             }
    466             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
    467                                m_trie_.getValue(leadBlock +
    468                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
    469             m_nextIndex_ --;
    470             m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
    471             return true;
    472         }
    473         return false;
    474     }
    475 
    476     // private data members --------------------------------------------
    477 
    478     /**
    479     * Size of the stage 1 BMP indexes
    480     */
    481     private static final int BMP_INDEX_LENGTH_ =
    482                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
    483     /**
    484     * Lead surrogate minimum value
    485     */
    486     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
    487     /**
    488     * Trail surrogate minimum value
    489     */
    490     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
    491     /*
    492     * Trail surrogate maximum value
    493     */
    494     //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
    495     /**
    496     * Number of trail surrogate
    497     */
    498     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
    499     /**
    500     * Number of stage 1 indexes for supplementary calculations that maps to
    501     * each lead surrogate character.
    502     * See second pass into getRawOffset for the trail surrogate character.
    503     * 10 for significant number of bits for trail surrogates, 5 for what we
    504     * discard during shifting.
    505     */
    506     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
    507                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
    508     /**
    509     * Number of data values in a stage 2 (data array) block.
    510     */
    511     private static final int DATA_BLOCK_LENGTH_ =
    512                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
    513 //    /**
    514 //    * Number of codepoints in a stage 2 block
    515 //    */
    516 //    private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
    517 //                                                     DATA_BLOCK_LENGTH_ << 10;
    518     /**
    519     * Trie instance
    520     */
    521     private Trie m_trie_;
    522     /**
    523     * Initial value for trie values
    524     */
    525     private int m_initialValue_;
    526     /**
    527     * Next element results and data.
    528     */
    529     private int m_currentCodepoint_;
    530     private int m_nextCodepoint_;
    531     private int m_nextValue_;
    532     private int m_nextIndex_;
    533     private int m_nextBlock_;
    534     private int m_nextBlockIndex_;
    535     private int m_nextTrailIndexOffset_;
    536 }
    537