Home | History | Annotate | Download | only in unicode
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1997-2014, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  ******************************************************************************
      6  */
      7 
      8 /**
      9  * \file
     10  * \brief C++ API: Collation Element Iterator.
     11  */
     12 
     13 /**
     14 * File coleitr.h
     15 *
     16 * Created by: Helena Shih
     17 *
     18 * Modification History:
     19 *
     20 *  Date       Name        Description
     21 *
     22 *  8/18/97    helena      Added internal API documentation.
     23 * 08/03/98    erm         Synched with 1.2 version CollationElementIterator.java
     24 * 12/10/99    aliu        Ported Thai collation support from Java.
     25 * 01/25/01    swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)
     26 * 02/19/01    swquek      Removed CollationElementsIterator() since it is
     27 *                         private constructor and no calls are made to it
     28 * 2012-2014   markus      Rewritten in C++ again.
     29 */
     30 
     31 #ifndef COLEITR_H
     32 #define COLEITR_H
     33 
     34 #include "unicode/utypes.h"
     35 
     36 #if !UCONFIG_NO_COLLATION
     37 
     38 #include "unicode/unistr.h"
     39 #include "unicode/uobject.h"
     40 
     41 struct UCollationElements;  // pointer type converted to/from CollationElementIterator* by the @internal helpers
     42 struct UHashtable;  // used by the private max-expansion helpers of CollationElementIterator
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 struct CollationData;  // input to CollationElementIterator::computeMaxExpansions()
     47 
     48 class CollationIterator;  // type of the owned iter_ member
     49 class RuleBasedCollator;  // friend of CollationElementIterator; aliased by rbc_
     50 class UCollationPCE;  // friend of CollationElementIterator
     51 class UVector32;  // type of the offsets_ member
     52 
     53 /**
     54 * The CollationElementIterator class is used as an iterator to walk through
     55 * each character of an international string. Use the iterator to return the
     56 * ordering priority of the positioned character. The ordering priority of a
     57 * character, which we refer to as a key, defines how a character is collated in
     58 * the given collation object.
     59 * For example, consider the following in Slovak and in traditional Spanish collation:
     60 * <pre>
     61 *        "ca" -> the first key is key('c') and second key is key('a').
     62 *        "cha" -> the first key is key('ch') and second key is key('a').</pre>
     63 * And in German phonebook collation,
     64 * <pre> \htmlonly       "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
     65 *        the third key is key('b'). \endhtmlonly </pre>
     66 * The key of a character is an integer composed of primary order(short),
     67 * secondary order(char), and tertiary order(char). Java strictly defines the
     68 * size and signedness of its primitive data types. Therefore, the static
     69 * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
     70 * int32_t to ensure the correctness of the key value.
     71 * <p>Example of the iterator usage: (without error checking)
     72 * <pre>
     73 * \code
     74 *   void CollationElementIterator_Example()
     75 *   {
     76 *       UnicodeString str = "This is a test";
     77 *       UErrorCode success = U_ZERO_ERROR;
     78 *       RuleBasedCollator* rbc =
     79 *           (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
     80 *       CollationElementIterator* c =
     81 *           rbc->createCollationElementIterator( str );
     82 *       int32_t order = c->next(success);
     83 *       c->reset();
     84 *       order = c->previous(success);
     85 *       delete c;
     86 *       delete rbc;
     87 *   }
     88 * \endcode
     89 * </pre>
     90 * <p>
     91 * The method next() returns the collation order of the next character based on
     92 * the comparison level of the collator. The method previous() returns the
     93 * collation order of the previous character based on the comparison level of
     94 * the collator. The Collation Element Iterator moves only in one direction
     95 * between calls to reset(), setOffset(), or setText(). That is, next()
     96 * and previous() cannot be interleaved. Whenever previous() is to be called after
     97 * next() or vice versa, reset(), setOffset() or setText() has to be called first
     98 * to reset the status, shifting pointers to either the end or the start of
     99 * the string (reset() or setText()), or the specified position (setOffset()).
    100 * Hence at the next call of next() or previous(), the first or last collation order,
    101 * or collation order at the specified position will be returned. If a change of
    102 * direction is done without one of these calls, the result is undefined.
    103 * <p>
    104 * The result of a forward iterate (next()) and reversed result of the backward
    105 * iterate (previous()) on the same string are equivalent, if collation orders
    106 * with the value 0 are ignored.
    107 * The collation order of a character depends on the comparison level of the collator.  A collation order
    108 * consists of primary order, secondary order and tertiary order.  The data
    109 * type of the collation order is <strong>int32_t</strong>.
    110 *
    111 * Note, CollationElementIterator should not be subclassed.
    112 * @see     Collator
    113 * @see     RuleBasedCollator
    114 * @version 1.8 Jan 16 2001
    115 */
    116 class U_I18N_API CollationElementIterator : public UObject {
    117 public:
    118 
    119     // CollationElementIterator public data member ------------------------------
    120 
    121     enum {
    122         /**
    123          * NULLORDER is returned when an error has occurred or the end/start of the string is reached
    124          * @stable ICU 2.0
    125          */
    126         NULLORDER = (int32_t)0xffffffff
    127     };
    128 
    129     // CollationElementIterator public constructor/destructor -------------------
    130 
    131     /**
    132     * Copy constructor.
    133     *
    134     * @param other    the object to be copied from
    135     * @stable ICU 2.0
    136     */
    137     CollationElementIterator(const CollationElementIterator& other);
    138 
    139     /**
    140     * Destructor
    141     * @stable ICU 2.0
    142     */
    143     virtual ~CollationElementIterator();
    144 
    145     // CollationElementIterator public methods ----------------------------------
    146 
    147     /**
    148     * Returns true if "other" is the same as "this"
    149     *
    150     * @param other    the object to be compared
    151     * @return         true if "other" is the same as "this"
    152     * @stable ICU 2.0
    153     */
    154     UBool operator==(const CollationElementIterator& other) const;
    155 
    156     /**
    157     * Returns true if "other" is not the same as "this".
    158     *
    159     * @param other    the object to be compared
    160     * @return         true if "other" is not the same as "this"
    161     * @stable ICU 2.0
    162     */
    163     UBool operator!=(const CollationElementIterator& other) const;
    164 
    165     /**
    166     * Resets the cursor to the beginning of the string.
    167     * @stable ICU 2.0
    168     */
    169     void reset(void);
    170 
    171     /**
    172     * Gets the ordering priority of the next character in the string.
    173     * @param status the error code status.
    174     * @return the next character's ordering; otherwise NULLORDER if an
    175     *         error has occurred or if the end of string has been reached
    176     * @stable ICU 2.0
    177     */
    178     int32_t next(UErrorCode& status);
    179 
    180     /**
    181     * Get the ordering priority of the previous collation element in the string.
    182     * @param status the error code status.
    183     * @return the previous element's ordering; otherwise NULLORDER if an
    184     *         error has occurred or if the start of string has been reached
    185     * @stable ICU 2.0
    186     */
    187     int32_t previous(UErrorCode& status);
    188 
    189     /**
    190     * Gets the primary order of a collation order.
    191     * @param order the collation order
    192     * @return the primary order of a collation order.
    193     * @stable ICU 2.0
    194     */
    195     static inline int32_t primaryOrder(int32_t order);
    196 
    197     /**
    198     * Gets the secondary order of a collation order.
    199     * @param order the collation order
    200     * @return the secondary order of a collation order.
    201     * @stable ICU 2.0
    202     */
    203     static inline int32_t secondaryOrder(int32_t order);
    204 
    205     /**
    206     * Gets the tertiary order of a collation order.
    207     * @param order the collation order
    208     * @return the tertiary order of a collation order.
    209     * @stable ICU 2.0
    210     */
    211     static inline int32_t tertiaryOrder(int32_t order);
    212 
    213     /**
    214     * Return the maximum length of any expansion sequences that end with the
    215     * specified comparison order.
    216     * @param order a collation order returned by previous or next.
    217     * @return maximum size of the expansion sequences ending with the collation
    218     *         element or 1 if collation element does not occur at the end of any
    219     *         expansion sequence
    220     * @stable ICU 2.0
    221     */
    222     int32_t getMaxExpansion(int32_t order) const;
    223 
    224     /**
    225     * Gets the comparison order in the desired strength, ignoring other differences.
    226     * @param order The order value
    227     * @return the comparison order in the desired strength
    228     * @stable ICU 2.0
    229     */
    230     int32_t strengthOrder(int32_t order) const;
    231 
    232     /**
    233     * Sets the source string.
    234     * @param str the source string.
    235     * @param status the error code status.
    236     * @stable ICU 2.0
    237     */
    238     void setText(const UnicodeString& str, UErrorCode& status);
    239 
    240     /**
    241     * Sets the source text from a character iterator.
    242     * @param str the source character iterator.
    243     * @param status the error code status.
    244     * @stable ICU 2.0
    245     */
    246     void setText(CharacterIterator& str, UErrorCode& status);
    247 
    248     /**
    249     * Checks if a comparison order is ignorable.
    250     * @param order the collation order.
    251     * @return TRUE if a character is ignorable, FALSE otherwise.
    252     * @stable ICU 2.0
    253     */
    254     static inline UBool isIgnorable(int32_t order);
    255 
    256     /**
    257     * Gets the offset of the currently processed character in the source string.
    258     * @return the offset of the character.
    259     * @stable ICU 2.0
    260     */
    261     int32_t getOffset(void) const;
    262 
    263     /**
    264     * Sets the offset of the currently processed character in the source string.
    265     * The following call to next() or previous() then starts from that position.
    266     * @param newOffset the new offset.
    267     * @param status the error code status.
    268     * @stable ICU 2.0
    269     */
    270     void setOffset(int32_t newOffset, UErrorCode& status);
    271 
    272     /**
    273     * ICU "poor man's RTTI", returns a UClassID for the actual class.
    274     *
    275     * @stable ICU 2.2
    276     */
    277     virtual UClassID getDynamicClassID() const;
    278 
    279     /**
    280     * ICU "poor man's RTTI", returns a UClassID for this class.
    281     *
    282     * @stable ICU 2.2
    283     */
    284     static UClassID U_EXPORT2 getStaticClassID();
    285 
    286 #ifndef U_HIDE_INTERNAL_API
    287     /** @internal Reinterprets a UCollationElements pointer as a CollationElementIterator pointer. */
    288     static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
    289         return reinterpret_cast<CollationElementIterator *>(uc);
    290     }
    291     /** @internal Const overload of the pointer reinterpretation above. */
    292     static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
    293         return reinterpret_cast<const CollationElementIterator *>(uc);
    294     }
    295     /** @internal Reinterprets this object's address as a UCollationElements pointer. */
    296     inline UCollationElements *toUCollationElements() {
    297         return reinterpret_cast<UCollationElements *>(this);
    298     }
    299     /** @internal Const overload of the pointer reinterpretation above. */
    300     inline const UCollationElements *toUCollationElements() const {
    301         return reinterpret_cast<const UCollationElements *>(this);
    302     }
    303 #endif  // U_HIDE_INTERNAL_API
    304 
    305 private:
    306     friend class RuleBasedCollator;
    307     friend class UCollationPCE;
    308 
    309     /**
    310     * CollationElementIterator constructor. This takes the source string and the
    311     * collation object. The cursor will walk through the source string based on the
    312     * predefined collation rules. If the source string is empty, NULLORDER will
    313     * be returned on the calls to next().
    314     * @param sourceText    the source string.
    315     * @param order         the collation object.
    316     * @param status        the error code status.
    317     */
    318     CollationElementIterator(const UnicodeString& sourceText,
    319         const RuleBasedCollator* order, UErrorCode& status);
    320     // Note: The constructors should take settings & tailoring, not a collator,
    321     // to avoid circular dependencies.
    322     // However, for operator==() we would need to be able to compare tailoring data for equality
    323     // without making CollationData or CollationTailoring depend on TailoredSet.
    324     // (See the implementation of RuleBasedCollator::operator==().)
    325     // That might require creating an intermediate class that would be used
    326     // by both CollationElementIterator and RuleBasedCollator
    327     // but only contain the part of RBC== related to data and rules.
    328 
    329     /**
    330     * CollationElementIterator constructor. This takes the source text as a
    331     * character iterator and the collation object.  The cursor will walk through the
    332     * source text based on the predefined collation rules.  If the source text is
    333     * empty, NULLORDER will be returned on the calls to next().
    334     * @param sourceText    the source character iterator.
    335     * @param order         the collation object.
    336     * @param status        the error code status.
    337     */
    338     CollationElementIterator(const CharacterIterator& sourceText,
    339         const RuleBasedCollator* order, UErrorCode& status);
    340 
    341     /**
    342     * Assignment operator (private: usable only by this class and its friends)
    343     *
    344     * @param other    the object to be copied
    345     */
    346     const CollationElementIterator&
    347         operator=(const CollationElementIterator& other);
    348 
    349     CollationElementIterator(); // default constructor not implemented
    350 
    351     /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
    352     inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
    353 
    354     static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
    355 
    356     static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
    357 
    358     // CollationElementIterator private data members ----------------------------
    359 
    360     CollationIterator *iter_;  // owned
    361     const RuleBasedCollator *rbc_;  // aliased
    362     uint32_t otherHalf_;  // NOTE(review): undocumented here; presumably a pending half of a split collation element - confirm in the implementation
    363     /**
    364      * <0: backwards; 0: just after reset() (previous() begins from end);
    365      * 1: just after setOffset(); >1: forward
    366      */
    367     int8_t dir_;
    368     /**
    369      * Stores offsets from expansions and from unsafe-backwards iteration,
    370      * so that getOffset() returns intermediate offsets for the CEs
    371      * that are consistent with forward iteration.
    372      */
    373     UVector32 *offsets_;
    374 
    375     UnicodeString string_;  // NOTE(review): presumably the text being iterated (see setText()) - confirm
    376 };
    377 
    378 // CollationElementIterator inline method definitions --------------------------
    379 
    380 inline int32_t CollationElementIterator::primaryOrder(int32_t order) {
    381     const int32_t upperHalf = order >> 16;  // move bits 31..16 down to 15..0
    382     return upperHalf & 0xffff;              // keep exactly those 16 bits
    383 }
    384 
    385 inline int32_t CollationElementIterator::secondaryOrder(int32_t order) {
    386     const int32_t shifted = order >> 8;  // bring bits 15..8 down to 7..0
    387     return shifted & 0xff;               // keep only that byte
    388 }
    389 
    390 inline int32_t CollationElementIterator::tertiaryOrder(int32_t order) {
    391     const int32_t lowByteMask = 0xff;  // selects bits 7..0
    392     return order & lowByteMask;
    393 }
    394 
    395 inline UBool CollationElementIterator::isIgnorable(int32_t order) {
    396     const uint32_t upperHalfMask = 0xffff0000;  // bits 31..16, the bits primaryOrder() extracts
    397     return (order & upperHalfMask) == 0;        // ignorable when all of those bits are zero
    398 }
    399 
    400 U_NAMESPACE_END
    401 
    402 #endif /* #if !UCONFIG_NO_COLLATION */
    403 
    404 #endif
    405