Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationsets.h
      9 *
     10 * created on: 2013feb09
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONSETS_H__
     15 #define __COLLATIONSETS_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/uniset.h"
     22 #include "collation.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 struct CollationData;
     27 
     28 /**
     29  * Finds the set of characters and strings that sort differently in the tailoring
     30  * from the base data.
     31  *
     32  * Every mapping in the tailoring needs to be compared to the base,
     33  * because some mappings are copied for optimization, and
     34  * all contractions for a character are copied if any contractions for that character
     35  * are added, modified or removed.
     36  *
     37  * It might be simpler to re-parse the rule string, but:
     38  * - That would require duplicating some of the from-rules builder code.
     39  * - That would make the runtime code depend on the builder.
     40  * - That would only work if we have the rule string, and we allow users to
     41  *   omit the rule string from data files.
     42  */
     43 class TailoredSet : public UMemory {
     44 public:
     45     TailoredSet(UnicodeSet *t)
     46             : data(NULL), baseData(NULL),
     47               tailored(t),
     48               suffix(NULL),
     49               errorCode(U_ZERO_ERROR) {}
     50 
     51     void forData(const CollationData *d, UErrorCode &errorCode);
     52 
     53     /**
     54      * @return U_SUCCESS(errorCode) in C++, void in Java
     55      * @internal only public for access by callback
     56      */
     57     UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
     58 
     59 private:
     60     void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
     61     void comparePrefixes(UChar32 c, const UChar *p, const UChar *q);
     62     void compareContractions(UChar32 c, const UChar *p, const UChar *q);
     63 
     64     void addPrefixes(const CollationData *d, UChar32 c, const UChar *p);
     65     void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
     66     void addContractions(UChar32 c, const UChar *p);
     67     void addSuffix(UChar32 c, const UnicodeString &sfx);
     68     void add(UChar32 c);
     69 
     70     /** Prefixes are reversed in the data structure. */
     71     void setPrefix(const UnicodeString &pfx) {
     72         unreversedPrefix = pfx;
     73         unreversedPrefix.reverse();
     74     }
     75     void resetPrefix() {
     76         unreversedPrefix.remove();
     77     }
     78 
     79     const CollationData *data;
     80     const CollationData *baseData;
     81     UnicodeSet *tailored;
     82     UnicodeString unreversedPrefix;
     83     const UnicodeString *suffix;
     84     UErrorCode errorCode;
     85 };
     86 
     87 class ContractionsAndExpansions : public UMemory {
     88 public:
     89     class CESink : public UMemory {
     90     public:
     91         virtual ~CESink();
     92         virtual void handleCE(int64_t ce) = 0;
     93         virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
     94     };
     95 
     96     ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
     97             : data(NULL),
     98               contractions(con), expansions(exp),
     99               sink(s),
    100               addPrefixes(prefixes),
    101               checkTailored(0),
    102               suffix(NULL),
    103               errorCode(U_ZERO_ERROR) {}
    104 
    105     void forData(const CollationData *d, UErrorCode &errorCode);
    106     void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);
    107 
    108     // all following: @internal, only public for access by callback
    109 
    110     void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
    111 
    112     void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
    113     void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);
    114 
    115     void addExpansions(UChar32 start, UChar32 end);
    116     void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);
    117 
    118     /** Prefixes are reversed in the data structure. */
    119     void setPrefix(const UnicodeString &pfx) {
    120         unreversedPrefix = pfx;
    121         unreversedPrefix.reverse();
    122     }
    123     void resetPrefix() {
    124         unreversedPrefix.remove();
    125     }
    126 
    127     const CollationData *data;
    128     UnicodeSet *contractions;
    129     UnicodeSet *expansions;
    130     CESink *sink;
    131     UBool addPrefixes;
    132     int8_t checkTailored;  // -1: collected tailored  +1: exclude tailored
    133     UnicodeSet tailored;
    134     UnicodeSet ranges;
    135     UnicodeString unreversedPrefix;
    136     const UnicodeString *suffix;
    137     int64_t ces[Collation::MAX_EXPANSION_LENGTH];
    138     UErrorCode errorCode;
    139 };
    140 
    141 U_NAMESPACE_END
    142 
    143 #endif  // !UCONFIG_NO_COLLATION
    144 #endif  // __COLLATIONSETS_H__
    145