Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  08/13/2001   synwee      Creation.
      9 **********************************************************************
     10 */
     11 #ifndef USRCHIMP_H
     12 #define USRCHIMP_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/normalizer2.h"
     19 #include "unicode/ucol.h"
     20 #include "unicode/ucoleitr.h"
     21 #include "unicode/ubrk.h"
     22 
     23 /* mask off anything but primary order (the upper 16 bits of a 32-bit value) */
     24 #define UCOL_PRIMARYORDERMASK 0xffff0000
     25 /* mask off anything but secondary order (bits 15..8) */
     26 #define UCOL_SECONDARYORDERMASK 0x0000ff00
     27 /* mask off anything but tertiary order (bits 7..0) */
     28 #define UCOL_TERTIARYORDERMASK 0x000000ff
     29 /* primary order shift: number of bits below the primary field */
     30 #define UCOL_PRIMARYORDERSHIFT 16
     31 /* secondary order shift: number of bits below the secondary field */
     32 #define UCOL_SECONDARYORDERSHIFT 8
     33 
     34 #define UCOL_IGNORABLE 0 /* NOTE(review): presumably the value of an ignorable CE/weight - confirm at use sites */
     35 
     36 /* get weights from a CE: each macro isolates one field and shifts it down to bit 0 */
     37 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
     38 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
     39 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
     40 
     41 #define UCOL_CONTINUATION_MARKER 0xC0 /* bit pattern tested by isContinuation() */
     42 /* evaluates to true only when both bits of UCOL_CONTINUATION_MARKER are set in CE */
     43 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
     44 
     45 /**
     46  * This indicates an error has occurred during processing or there are no more CEs
     47  * to be returned.
     48  */
     49 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
     50 
     51 U_NAMESPACE_BEGIN
     52 
     53 class CollationElementIterator;
     54 class Collator;
     55 
     56 struct PCEI // element type of the PCEBuffer arrays: a 64-bit CE plus a low/high index pair
     57 {
     58     uint64_t ce;   // 64-bit collation element value
     59     int32_t  low;  // NOTE(review): presumably the ixLow passed to PCEBuffer::put - confirm in the implementation
     60     int32_t  high; // NOTE(review): presumably the ixHigh passed to PCEBuffer::put - confirm in the implementation
     61 };
     62 
     63 struct PCEBuffer // buffer of PCEI entries with 16 entries of built-in storage
     64 {
     65     PCEI    defaultBuffer[16]; // built-in storage for 16 entries
     66     PCEI   *buffer;            // NOTE(review): presumably defaultBuffer or a larger allocation - confirm in the implementation
     67     int32_t bufferIndex;       // NOTE(review): presumably the number of entries currently stored - confirm
     68     int32_t bufferSize;        // NOTE(review): presumably the capacity of 'buffer' in entries - confirm
     69 
     70     PCEBuffer();
     71     ~PCEBuffer();
     72 
     73     void  reset();
     74     UBool isEmpty() const;
     75     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); // takes one CE with its index pair; errorCode is an in/out status
     76     const PCEI *get(); // NOTE(review): ownership and ordering of the returned entry are defined in the implementation - confirm
     77 };
     78 
     79 class UCollationPCE : public UMemory { // yields "processed" 64-bit CEs from a collation element iterator
     80 private:
     81     PCEBuffer          pceBuffer;  // buffer of PCEI entries (see PCEBuffer)
     82     CollationElementIterator *cei; // NOTE(review): presumably the iterator supplied to the constructor/init() - confirm
     83     UCollationStrength strength;   // NOTE(review): this and the three fields below are presumably set by init(const Collator &) - confirm
     84     UBool              toShift;
     85     UBool              isShifted;
     86     uint32_t           variableTop;
     87 
     88 public:
     89     UCollationPCE(UCollationElements *elems);
     90     UCollationPCE(CollationElementIterator *iter);
     91     ~UCollationPCE();
     92 
     93     void init(UCollationElements *elems);
     94     void init(CollationElementIterator *iter);
     95 
     96     /**
     97      * Get the processed ordering priority of the next collation element in the text.
     98      * A single character may contain more than one collation element.
     99      *
    100      * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
    101      * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
    102      * @param status A pointer to a UErrorCode to receive any errors.
    103      * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
    104      *         if an error has occurred or if the end of string has been reached
    105      */
    106     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    107     /**
    108      * Get the processed ordering priority of the previous collation element in the text.
    109      * A single character may contain more than one collation element.
    110      *
    111      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
    112      * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
    113      * @param status A pointer to a UErrorCode to receive any errors. Notably
    114      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
    115      *               buffer has been exhausted.
    116      * @return The previous collation element's ordering, otherwise returns
    117      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
    118      *         string has been reached.
    119      */
    120     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    121 
    122 private:
    123     void init(const Collator &coll);
    124     uint64_t processCE(uint32_t ce); // maps a 32-bit CE to a 64-bit value
    125 };
    126 
    127 U_NAMESPACE_END
    128 
    128 #define INITIAL_ARRAY_SIZE_       256 /* element count of the fixed ces/pces and canonical accent buffers below */
    129 #define MAX_TABLE_SIZE_           257 /* element count of the UPattern shift and backShift tables */
    131 
    132 struct USearch { // search text plus match state and options for a string search
    133     // required since the collation element iterator does not have a getText API
    134     const UChar              *text;
    135           int32_t             textLength; // exact length
    136           UBool               isOverlap;
    137           UBool               isCanonicalMatch;
    138           int16_t             elementComparisonType;
    139           UBreakIterator     *internalBreakIter;  // internal character break iterator
    140           UBreakIterator     *breakIter;
    141     // The value USEARCH_DONE is the default value.
    142     // If we are not at the start of the text or the end of the text
    143     // (depending on the iteration direction) and matchedIndex is USEARCH_DONE,
    144     // it means that we can't find any more matches in that particular direction.
    145           int32_t             matchedIndex;
    146           int32_t             matchedLength;
    147           UBool               isForwardSearching;
    148           UBool               reset;
    149 };
    150 
    151 struct UPattern { // pattern text with its CE arrays and shift tables
    152     const UChar              *text;
    153           int32_t             textLength; // exact length
    154           // length required for backwards ce comparison
    155           int32_t             cesLength;
    156           int32_t            *ces;        // NOTE(review): presumably cesBuffer or a larger allocation - confirm in the implementation
    157           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];  // built-in storage for 32-bit CEs
    158           int32_t             pcesLength;
    159           int64_t            *pces;       // NOTE(review): presumably pcesBuffer or a larger allocation - confirm in the implementation
    160           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_]; // built-in storage for 64-bit processed CEs
    161           UBool               hasPrefixAccents;
    162           UBool               hasSuffixAccents;
    163           int16_t             defaultShiftSize;
    164           int16_t             shift[MAX_TABLE_SIZE_];     // shift table with MAX_TABLE_SIZE_ entries
    165           int16_t             backShift[MAX_TABLE_SIZE_]; // NOTE(review): presumably the backward-search counterpart of 'shift' - confirm
    166 };
    167 
    168 struct UStringSearch { // top-level search object passed to the usearch_handle* functions
    169     struct USearch            *search;
    170     struct UPattern            pattern;
    171     const  UCollator          *collator;
    172     const  icu::Normalizer2   *nfd;
    173     // positions within the collation element iterator are used to determine
    174     // if we are at the start of the text.
    175            UCollationElements *textIter;
    176            icu::UCollationPCE *textProcessedIter;
    177     // utility collation element iterator, used throughout the program for
    178     // temporary iteration.
    179            UCollationElements *utilIter;
    180            UBool               ownCollator; // NOTE(review): presumably whether 'collator' is owned by this object - confirm
    181            UCollationStrength  strength;
    182            uint32_t            ceMask;
    183            uint32_t            variableTop;
    184            UBool               toShift;
    185            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
    186            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
    187 };
    188 
    189 /**
    190 * Exact matches without checking the ends for extra accents.
    191 * The match after the position within the collation element iterator is to be
    192 * found.
    193 * After a match is found the offset in the collation element iterator will be
    194 * shifted to the start of the match.
    195 * Implementation note:
    196 * For tertiary we can't use the collator->tertiaryMask, that is a
    197 * preprocessed mask that takes into account case options. Since we are only
    198 * concerned with exact matches, we don't need that.
    199 * Alternate handling - since only the 16 most significant digits are used,
    200 * we can safely do a compare without masking if the ce is a variable, we mask
    201 * and get only the primary values no shifting to quaternary is required since
    202 * all primary values less than variabletop will need to be masked off anyway.
    203 * If the end character is composite and the pattern ce does not match the text
    204 * ce, we skip it until we find a match in the end composite character or when
    205 * it has passed the character. This is so that we can match pattern "a" with
    206 * the text "\u00e6"
    207 * @param strsrch string search data
    208 * @param status error status if any
    209 * @return TRUE if an exact match is found, FALSE otherwise
    210 */
    211 U_CFUNC
    212 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
    213 
    214 /**
    215 * Canonical matches, searching forward (counterpart of usearch_handlePreviousCanonical).
    216 * According to the definition, matches found here will include the whole span
    217 * of beginning and ending accents if it overlaps that region.
    218 * @param strsrch string search data
    219 * @param status error status if any
    220 * @return TRUE if a canonical match is found, FALSE otherwise
    221 */
    222 U_CFUNC
    223 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
    224 
    225 /**
    226 * Gets the previous match.
    227 * Comments follow from usearch_handleNextExact.
    228 * @param strsrch string search data
    229 * @param status error status if any
    230 * @return TRUE if an exact match is found, FALSE otherwise.
    231 */
    232 U_CFUNC
    233 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
    234 
    235 /**
    236 * Canonical matches, searching backward (counterpart of usearch_handleNextCanonical).
    237 * According to the definition, matches found here will include the whole span
    238 * of beginning and ending accents if it overlaps that region.
    239 * @param strsrch string search data
    240 * @param status error status if any
    241 * @return TRUE if a canonical match is found, FALSE otherwise
    242 */
    243 U_CFUNC
    244 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
    245                                       UErrorCode    *status);
    246 
    247 #endif /* #if !UCONFIG_NO_COLLATION */
    248 
    249 #endif
    250