Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationsets.h
      7 *
      8 * created on: 2013feb09
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #ifndef __COLLATIONSETS_H__
     13 #define __COLLATIONSETS_H__
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_COLLATION
     18 
     19 #include "unicode/uniset.h"
     20 #include "collation.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 struct CollationData;
     25 
     26 /**
     27  * Finds the set of characters and strings that sort differently in the tailoring
     28  * from the base data.
     29  *
     30  * Every mapping in the tailoring needs to be compared to the base,
     31  * because some mappings are copied for optimization, and
     32  * all contractions for a character are copied if any contractions for that character
     33  * are added, modified or removed.
     34  *
     35  * It might be simpler to re-parse the rule string, but:
     36  * - That would require duplicating some of the from-rules builder code.
     37  * - That would make the runtime code depend on the builder.
     38  * - That would only work if we have the rule string, and we allow users to
     39  *   omit the rule string from data files.
     40  */
     41 class TailoredSet : public UMemory {
     42 public:
     43     TailoredSet(UnicodeSet *t)
     44             : data(NULL), baseData(NULL),
     45               tailored(t),
     46               suffix(NULL),
     47               errorCode(U_ZERO_ERROR) {}
     48 
     49     void forData(const CollationData *d, UErrorCode &errorCode);
     50 
     51     /**
     52      * @return U_SUCCESS(errorCode) in C++, void in Java
     53      * @internal only public for access by callback
     54      */
     55     UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
     56 
     57 private:
     58     void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
     59     void comparePrefixes(UChar32 c, const UChar *p, const UChar *q);
     60     void compareContractions(UChar32 c, const UChar *p, const UChar *q);
     61 
     62     void addPrefixes(const CollationData *d, UChar32 c, const UChar *p);
     63     void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
     64     void addContractions(UChar32 c, const UChar *p);
     65     void addSuffix(UChar32 c, const UnicodeString &sfx);
     66     void add(UChar32 c);
     67 
     68     /** Prefixes are reversed in the data structure. */
     69     void setPrefix(const UnicodeString &pfx) {
     70         unreversedPrefix = pfx;
     71         unreversedPrefix.reverse();
     72     }
     73     void resetPrefix() {
     74         unreversedPrefix.remove();
     75     }
     76 
     77     const CollationData *data;
     78     const CollationData *baseData;
     79     UnicodeSet *tailored;
     80     UnicodeString unreversedPrefix;
     81     const UnicodeString *suffix;
     82     UErrorCode errorCode;
     83 };
     84 
     85 class ContractionsAndExpansions : public UMemory {
     86 public:
     87     class CESink : public UMemory {
     88     public:
     89         virtual ~CESink();
     90         virtual void handleCE(int64_t ce) = 0;
     91         virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
     92     };
     93 
     94     ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
     95             : data(NULL),
     96               contractions(con), expansions(exp),
     97               sink(s),
     98               addPrefixes(prefixes),
     99               checkTailored(0),
    100               suffix(NULL),
    101               errorCode(U_ZERO_ERROR) {}
    102 
    103     void forData(const CollationData *d, UErrorCode &errorCode);
    104     void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);
    105 
    106     // all following: @internal, only public for access by callback
    107 
    108     void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
    109 
    110     void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
    111     void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);
    112 
    113     void addExpansions(UChar32 start, UChar32 end);
    114     void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);
    115 
    116     /** Prefixes are reversed in the data structure. */
    117     void setPrefix(const UnicodeString &pfx) {
    118         unreversedPrefix = pfx;
    119         unreversedPrefix.reverse();
    120     }
    121     void resetPrefix() {
    122         unreversedPrefix.remove();
    123     }
    124 
    125     const CollationData *data;
    126     UnicodeSet *contractions;
    127     UnicodeSet *expansions;
    128     CESink *sink;
    129     UBool addPrefixes;
    130     int8_t checkTailored;  // -1: collected tailored  +1: exclude tailored
    131     UnicodeSet tailored;
    132     UnicodeSet ranges;
    133     UnicodeString unreversedPrefix;
    134     const UnicodeString *suffix;
    135     int64_t ces[Collation::MAX_EXPANSION_LENGTH];
    136     UErrorCode errorCode;
    137 };
    138 
    139 U_NAMESPACE_END
    140 
    141 #endif  // !UCONFIG_NO_COLLATION
    142 #endif  // __COLLATIONSETS_H__
    143