Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2014 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  08/13/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 #ifndef USRCHIMP_H
     10 #define USRCHIMP_H
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/normalizer2.h"
     17 #include "unicode/ucol.h"
     18 #include "unicode/ucoleitr.h"
     19 #include "unicode/ubrk.h"
     20 
     21 /* mask off anything but primary order (the upper 16 bits of the CE) */
     22 #define UCOL_PRIMARYORDERMASK 0xffff0000
     23 /* mask off anything but secondary order (bits 8-15 of the CE) */
     24 #define UCOL_SECONDARYORDERMASK 0x0000ff00
     25 /* mask off anything but tertiary order (the low 8 bits of the CE) */
     26 #define UCOL_TERTIARYORDERMASK 0x000000ff
     27 /* primary order shift: bit position where the primary weight starts */
     28 #define UCOL_PRIMARYORDERSHIFT 16
     29 /* secondary order shift: bit position where the secondary weight starts */
     30 #define UCOL_SECONDARYORDERSHIFT 8
     31 
     32 #define UCOL_IGNORABLE 0 /* NOTE(review): presumably the value of an ignorable weight/CE -- confirm against usage */
     33 
     34 /* get weights from a CE; each macro yields the weight shifted down to bit 0 (tertiary needs no shift) */
     35 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
     36 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
     37 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
     38 
     39 #define UCOL_CONTINUATION_MARKER 0xC0
     40 /* non-zero only when both bits of UCOL_CONTINUATION_MARKER (0xC0) are set in CE */
     41 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
     42 
     43 /**
     44  * This indicates an error has occurred during processing or there are no more CEs
     45  * to be returned.
     46  */
     47 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
     48 
     49 U_NAMESPACE_BEGIN
     50 
     51 class CollationElementIterator;
     52 class Collator;
     53 
     54 struct PCEI  // One entry of a PCEBuffer: a 64-bit CE value plus a low/high pair of int32_t indices.
     55 {
     56     uint64_t ce;    // CE value; same type as the `ce` argument of PCEBuffer::put()
     57     int32_t  low;   // NOTE(review): presumably the ixLow given to PCEBuffer::put() -- confirm in the implementation
     58     int32_t  high;  // NOTE(review): presumably the ixHigh given to PCEBuffer::put() -- confirm in the implementation
     59 };
     60 
     61 struct PCEBuffer  // Buffer of PCEI entries with built-in storage for 16 of them.
     62 {
     63     PCEI    defaultBuffer[16];  // built-in storage (16 entries)
     64     PCEI   *buffer;             // NOTE(review): presumably the storage in use (defaultBuffer or a larger allocation) -- confirm
     65     int32_t bufferIndex;        // NOTE(review): presumably the number of entries currently held -- confirm
     66     int32_t bufferSize;         // NOTE(review): presumably the capacity of `buffer`, in entries -- confirm
     67 
     68     PCEBuffer();
     69     ~PCEBuffer();
     70 
     71     void  reset();              // NOTE(review): presumably discards all buffered entries -- confirm
     72     UBool empty() const;        // const query; NOTE(review): presumably TRUE when no entries are buffered -- confirm
     73     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);  // takes one CE value together with its low/high index pair
     74     const PCEI *get();          // returns a read-only entry pointer; NOTE(review): behavior on an empty buffer is not visible here
     75 };
     76 
     77 class UCollationPCE : public UMemory {  // Yields 64-bit "processed" collation elements; see nextProcessed()/previousProcessed().
     78 private:
     79     PCEBuffer          pceBuffer;   // buffer of PCEI entries owned by this object
     80     CollationElementIterator *cei;  // NOTE(review): presumably the iterator the CEs are read from -- confirm
     81     UCollationStrength strength;    // NOTE(review): presumably taken from the collator in init(const Collator &) -- confirm
     82     UBool              toShift;     // NOTE(review): presumably reflects the collator's alternate-handling setting -- confirm
     83     UBool              isShifted;   // NOTE(review): meaning not visible in this header -- see the implementation
     84     uint32_t           variableTop; // NOTE(review): presumably taken from the collator in init(const Collator &) -- confirm
     85 
     86 public:
     87     UCollationPCE(UCollationElements *elems);
     88     UCollationPCE(CollationElementIterator *iter);
     89     ~UCollationPCE();
     90 
     91     void init(UCollationElements *elems);
     92     void init(CollationElementIterator *iter);
     93 
     94     /**
     95      * Get the processed ordering priority of the next collation element in the text.
     96      * A single character may contain more than one collation element.
     97      *
     98      * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
     99      * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
    100      * @param status A pointer to a UErrorCode to receive any errors.
    101      * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
    102      *         if an error has occurred or if the end of string has been reached
    103      */
    104     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    105     /**
    106      * Get the processed ordering priority of the previous collation element in the text.
    107      * A single character may contain more than one collation element.
    108      *
    109      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
    110      * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
    111      * @param status A pointer to a UErrorCode to receive any errors. Notably
    112      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
    113      *               buffer has been exhausted.
    114      * @return The previous collation element's ordering, otherwise returns
    115      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
    116      *         string has been reached.
    117      */
    118     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    119 
    120 private:
    121     void init(const Collator &coll);  // shared setup from the collator; NOTE(review): presumably called by the public init() overloads -- confirm
    122     uint64_t processCE(uint32_t ce);  // takes a 32-bit CE and returns a 64-bit value; the mapping lives in the implementation file
    123 };
    124 
    125 U_NAMESPACE_END
    126 
    126 #define INITIAL_ARRAY_SIZE_       256 /* element count of the fixed-size buffers in UPattern and UStringSearch */
    127 #define MAX_TABLE_SIZE_           257 /* element count of UPattern::shift and UPattern::backShift */
    129 
    129 struct USearch {  // Search-side state referenced by UStringSearch::search: text, options, break iterators, last match.
    130     // required since collation element iterator does not have a getText API
    131     const UChar              *text;
    132           int32_t             textLength; // exact length
    133           UBool               isOverlap;           // NOTE(review): presumably whether overlapping matches are allowed -- confirm
    134           UBool               isCanonicalMatch;    // NOTE(review): presumably selects canonical rather than exact matching -- confirm
    135           int16_t             elementComparisonType; // NOTE(review): valid values are not visible in this header
    136           UBreakIterator     *internalBreakIter;  // internal character break iterator
    137           UBreakIterator     *breakIter;           // NOTE(review): presumably the caller-supplied break iterator, may be NULL -- confirm
    138     // value USEARCH_DONE is the default value
    139     // if we are not at the start of the text or the end of the text,
    140     // depending on the iteration direction and matchedIndex is USEARCH_DONE
    141     // it means that we can't find any more matches in that particular direction
    142           int32_t             matchedIndex;
    143           int32_t             matchedLength;       // NOTE(review): presumably the length of the match at matchedIndex -- confirm
    144           UBool               isForwardSearching;  // NOTE(review): presumably TRUE while iterating forward -- confirm
    145           UBool               reset;               // NOTE(review): meaning not visible in this header -- see the implementation
    146 };
    148 
    148 struct UPattern {  // Pattern-side state embedded in UStringSearch: pattern text, its CE arrays and shift tables.
    149     const UChar              *text;
    150           int32_t             textLength; // exact length
    151           // length required for backwards ce comparison
    152           int32_t             cesLength;
    153           int32_t            *ces;          // NOTE(review): presumably points at cesBuffer unless more room is needed -- confirm
    154           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];  // fixed-size storage for 32-bit CEs
    155           int32_t             pcesLength;   // NOTE(review): presumably the number of entries in pces -- confirm
    156           int64_t            *pces;         // NOTE(review): presumably points at pcesBuffer unless more room is needed -- confirm
    157           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_]; // fixed-size storage for 64-bit CEs
    158           UBool               hasPrefixAccents;  // NOTE(review): presumably TRUE when the pattern starts with accents -- confirm
    159           UBool               hasSuffixAccents;  // NOTE(review): presumably TRUE when the pattern ends with accents -- confirm
    160           int16_t             defaultShiftSize;  // NOTE(review): presumably the shift used when no table entry applies -- confirm
    161           int16_t             shift[MAX_TABLE_SIZE_];      // NOTE(review): presumably the forward-search shift table -- confirm
    162           int16_t             backShift[MAX_TABLE_SIZE_];  // NOTE(review): presumably the backward-search shift table -- confirm
    163 };
    165 
    165 struct UStringSearch {  // Top-level string search object: search state, pattern, collator and iterators.
    166     struct USearch            *search;
    167     struct UPattern            pattern;
    168     const  UCollator          *collator;
    169     const  icu::Normalizer2   *nfd;      // NOTE(review): presumably the NFD normalizer instance -- confirm
    170     // positions within the collation element iterator are used to determine
    171     // if we are at the start of the text.
    172            UCollationElements *textIter;
    173            icu::UCollationPCE *textProcessedIter;  // NOTE(review): presumably yields processed CEs for the searched text -- confirm
    174     // utility collation element iterator, used throughout the program for
    175     // temporary iteration.
    176            UCollationElements *utilIter;
    177            UBool               ownCollator;  // NOTE(review): presumably TRUE when this object must release `collator` -- confirm
    178            UCollationStrength  strength;
    179            uint32_t            ceMask;       // NOTE(review): presumably a CE mask derived from `strength` -- confirm
    180            uint32_t            variableTop;
    181            UBool               toShift;
    182            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];  // fixed-size UChar buffer (INITIAL_ARRAY_SIZE_ units)
    183            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];  // fixed-size UChar buffer (INITIAL_ARRAY_SIZE_ units)
    184 };
    186 
    187 /**
    188 * Exact matches without checking for the ends for extra accents.
    189 * The match after the position within the collation element iterator is to be
    190 * found.
    191 * After a match is found the offset in the collation element iterator will be
    192 * shifted to the start of the match.
    193 * Implementation note:
    194 * For tertiary we can't use the collator->tertiaryMask, that is a
    195 * preprocessed mask that takes into account case options. since we are only
    196 * concerned with exact matches, we don't need that.
    197 * Alternate handling - since only the 16 most significant digits is only used,
    198 * we can safely do a compare without masking if the ce is a variable, we mask
    199 * and get only the primary values no shifting to quartenary is required since
    200 * all primary values less than variabletop will need to be masked off anyway.
    201 * If the end character is composite and the pattern ce does not match the text
    202 * ce, we skip it until we find a match in the end composite character or when
    203 * it has passed the character. This is so that we can match pattern "a" with
    204 * the text "\u00e6"
    205 * @param strsrch string search data
    206 * @param status error status if any
    207 * @return TRUE if an exact match is found, FALSE otherwise
    208 */
    209 U_CFUNC
    210 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
    211 
    212 /**
    213 * Canonical matches.
    214 * According to the definition, matches found here will include the whole span
    215 * of beginning and ending accents if it overlaps that region.
    216 * @param strsrch string search data
    217 * @param status error status if any
    218 * @return TRUE if a canonical match is found, FALSE otherwise
    219 */
    220 U_CFUNC
    221 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
    222 
    223 /**
    224 * Gets the previous match.
    225 * Comments follows from handleNextExact
    226 * @param strsrch string search data
    227 * @param status error status if any
    228 * @return True if a exact math is found, FALSE otherwise.
    229 */
    230 U_CFUNC
    231 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
    232 
    233 /**
    234 * Canonical matches.
    235 * According to the definition, matches found here will include the whole span
    236 * of beginning and ending accents if it overlaps that region.
    237 * @param strsrch string search data
    238 * @param status error status if any
    239 * @return TRUE if a canonical match is found, FALSE otherwise
    240 */
    241 U_CFUNC
    242 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
    243                                       UErrorCode    *status);
    244 
    245 #endif /* #if !UCONFIG_NO_COLLATION */
    246 
    247 #endif
    248