Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  08/13/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 #ifndef USRCHIMP_H
     10 #define USRCHIMP_H
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/normalizer2.h"
     17 #include "unicode/ucol.h"
     18 #include "unicode/ucoleitr.h"
     19 #include "unicode/ubrk.h"
     20 
     21 #define INITIAL_ARRAY_SIZE_       256  // element count of CEBuffer, PCEBuffer and the canonical prefix/suffix accent buffers below
     22 #define MAX_TABLE_SIZE_           257  // element count of the UPattern shift[] and backShift[] tables below
     23 
     24 struct USearch {
        // Text-side search state referenced through UStringSearch::search:
        // the text, option flags, break iterators and the current match
        // position/length.
     25     // required since collation element iterator does not have a getText API
     26     const UChar              *text;
     27           int32_t             textLength; // exact length
     28           UBool               isOverlap;  // presumably: overlapping matches are allowed - confirm against the setter
     29           UBool               isCanonicalMatch;  // presumably selects the *Canonical handlers over the *Exact ones - confirm
     30           int16_t             elementComparisonType;  // NOTE(review): allowed values are not visible in this header - confirm
     31           UBreakIterator     *internalBreakIter;  // internal character break iterator
     32           UBreakIterator     *breakIter;  // presumably the caller-supplied break iterator (may be NULL) - confirm
     33     // USEARCH_DONE is the default value of matchedIndex.
     34     // If matchedIndex is USEARCH_DONE and we are not at the start of the
     35     // text or the end of the text (depending on the iteration direction),
     36     // it means that no more matches can be found in that particular direction.
     37           int32_t             matchedIndex;
     38           int32_t             matchedLength;  // length of the current match; value when there is no match is not shown here
     39           UBool               isForwardSearching;  // iteration direction referred to in the matchedIndex note above
     40           UBool               reset;  // NOTE(review): meaning not visible in this header - confirm against usearch.cpp
     41 };
     42 
     43 struct UPattern {
        // Pattern-side data embedded by value in UStringSearch::pattern: the
        // pattern text, its collation elements (32-bit CE and 64-bit PCE
        // forms), accent flags and the shift tables.
     44     const UChar              *text;
     45           int32_t             textLength; // exact length
     46           // length required for backwards ce comparison
     47           int32_t             CELength;
     48           int32_t            *CE;  // presumably points at CEBuffer unless more room is needed - TODO confirm
     49           int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];  // fixed in-struct storage for 32-bit CEs
     50           int32_t             PCELength;  // presumably the number of entries reachable through PCE - confirm
     51           int64_t            *PCE;  // presumably points at PCEBuffer unless more room is needed - TODO confirm
     52           int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];  // fixed in-struct storage for 64-bit CEs
     53           UBool               hasPrefixAccents;  // presumably: the pattern begins with accents - confirm
     54           UBool               hasSuffixAccents;  // presumably: the pattern ends with accents - confirm
     55           int16_t             defaultShiftSize;  // NOTE(review): looks like the fallback for the shift tables - confirm
     56           int16_t             shift[MAX_TABLE_SIZE_];  // presumably the forward-search shift table - confirm
     57           int16_t             backShift[MAX_TABLE_SIZE_];  // presumably the backward-search shift table - confirm
     58 };
     59 
     60 struct UStringSearch {
        // Top-level string search object passed to the usearch_handle*
        // functions declared below. Bundles the text-side state (search), the
        // pattern data, the collator, and the collation element iterators.
     61     struct USearch            *search;
     62     struct UPattern            pattern;  // held by value, unlike search
     63     const  UCollator          *collator;
     64     const  icu::Normalizer2   *nfd;  // presumably the NFD normalizer instance - confirm
     65     // positions within the collation element iterator are used to determine
     66     // if we are at the start of the text.
     67            UCollationElements *textIter;
     68     // utility collation element iterator, used throughout the program for
     69     // temporary iteration.
     70            UCollationElements *utilIter;
     71            UBool               ownCollator;  // presumably: this object must close collator itself - confirm
     72            UCollationStrength  strength;  // NOTE(review): looks like a cached collator strength - confirm
     73            uint32_t            ceMask;  // presumably the mask applied to CEs before comparison - confirm
     74            uint32_t            variableTop;  // see the "variabletop" remark in the usearch_handleNextExact comment
     75            UBool               toShift;  // NOTE(review): presumably relates to alternate (shifted) handling - confirm
     76            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];  // fixed-size UChar buffer; use not shown in this header
     77            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];  // fixed-size UChar buffer; use not shown in this header
     78 };
     79 
     80 /**
     81 * Exact matches, without checking the ends of the match for extra accents.
     82 * The match after the current position within the collation element iterator
     83 * is the one to be found.
     84 * After a match is found, the offset in the collation element iterator will be
     85 * shifted to the start of the match.
     86 * Implementation note:
     87 * For tertiary strength we can't use collator->tertiaryMask, since that is a
     88 * preprocessed mask that takes case options into account. Since we are only
     89 * concerned with exact matches, we don't need that.
     90 * Alternate handling - since only the 16 most significant digits are used,
     91 * we can safely do a compare without masking. If the ce is a variable, we mask
     92 * and get only the primary values. No shifting to quaternary is required, since
     93 * all primary values less than variabletop will need to be masked off anyway.
     94 * If the end character is composite and the pattern ce does not match the text
     95 * ce, we skip it until we find a match in the end composite character or until
     96 * we have passed the character. This is so that we can match pattern "a" with
     97 * the text "\u00e6".
     98 * @param strsrch string search data
     99 * @param status error status if any
    100 * @return TRUE if an exact match is found, FALSE otherwise
    101 */
    102 U_CFUNC
    103 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
    104 
    105 /**
    106 * Canonical matches (the "next" counterpart of usearch_handlePreviousCanonical).
    107 * According to the definition, matches found here will include the whole span
    108 * of beginning and ending accents if it overlaps that region.
    109 * @param strsrch string search data (non-const; presumably updated with the match - confirm)
    110 * @param status error status if any
    111 * @return TRUE if a canonical match is found, FALSE otherwise
    112 */
    113 U_CFUNC
    114 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
    115 
    116 /**
    117 * Gets the previous exact match.
    118 * The comments on usearch_handleNextExact apply here as well.
    119 * @param strsrch string search data
    120 * @param status error status if any
    121 * @return TRUE if an exact match is found, FALSE otherwise
    122 */
    123 U_CFUNC
    124 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
    125 
    126 /**
    127 * Canonical matches (judging by the name, for the previous match - confirm).
    128 * According to the definition, matches found here will include the whole span
    129 * of beginning and ending accents if it overlaps that region.
    130 * @param strsrch string search data (non-const; presumably updated with the match - confirm)
    131 * @param status error status if any
    132 * @return TRUE if a canonical match is found, FALSE otherwise
    133 */
    134 U_CFUNC
    135 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
    136                                       UErrorCode    *status);
    137 
    138 #endif /* #if !UCONFIG_NO_COLLATION */
    139 
    140 #endif
    141