Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationruleparser.h
      7 *
      8 * created on: 2013apr10
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #ifndef __COLLATIONRULEPARSER_H__
     13 #define __COLLATIONRULEPARSER_H__
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_COLLATION
     18 
     19 #include "unicode/ucol.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/unistr.h"
     22 
     23 struct UParseError;  // declared before U_NAMESPACE_BEGIN; used below only as UParseError *
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 struct CollationData;  // used below only as const CollationData *
     28 struct CollationTailoring;  // not referenced by the declarations visible in this header
     29 
     30 class Locale;  // not referenced by the declarations visible in this header
     31 class Normalizer2;  // used below only for const Normalizer2 & members
     32 
     33 struct CollationSettings;  // used below as CollationSettings & and CollationSettings *
     34 
     35 class U_I18N_API CollationRuleParser : public UMemory {
            /*
             * Parser for collation rule strings.
             *
             * The nested Sink interface declares the callbacks for resets, relations,
             * contraction suppression and optimization; a Sink must be set before parsing.
             * The nested Importer interface supplies the rules needed for
             * [import locale] syntax and is optional.
             *
             * Only the inline accessors (setSink, setImporter, getErrorReason) are
             * defined in this header; all other member functions are defined elsewhere.
             */
     36 public:
     37     /** Special reset positions. */
            // The enumerators take the implicit values 0..13 in declaration order.
            // Each value is added to POS_BASE to form the second contraction character
            // (see POS_LEAD), so the order of the enumerators is significant.
     38     enum Position {
     39         FIRST_TERTIARY_IGNORABLE,
     40         LAST_TERTIARY_IGNORABLE,
     41         FIRST_SECONDARY_IGNORABLE,
     42         LAST_SECONDARY_IGNORABLE,
     43         FIRST_PRIMARY_IGNORABLE,
     44         LAST_PRIMARY_IGNORABLE,
     45         FIRST_VARIABLE,
     46         LAST_VARIABLE,
     47         FIRST_REGULAR,
     48         LAST_REGULAR,
     49         FIRST_IMPLICIT,
     50         LAST_IMPLICIT,
     51         FIRST_TRAILING,
     52         LAST_TRAILING
     53     };
     54 
     55     /**
     56      * First character of contractions that encode special reset positions.
     57      * U+FFFE cannot be tailored via rule syntax.
     58      *
     59      * The second contraction character is POS_BASE + Position.
     60      */
     61     static const UChar POS_LEAD = 0xfffe;
     62     /**
     63      * Base for the second character of contractions that encode special reset positions.
     64      * Braille characters U+28xx are printable and normalization-inert.
     65      * @see POS_LEAD
     66      */
     67     static const UChar POS_BASE = 0x2800;
     68 
            /**
             * Callback interface for the elements of a rule string.
             * addReset() and addRelation() are pure virtual and must be overridden.
             * suppressContractions() and optimize() are not pure: their default
             * definitions live outside this header, so overriding them is optional.
             *
             * Every callback reports failure through the errorReason and errorCode
             * output parameters.
             */
     69     class U_I18N_API Sink : public UObject {
     70     public:
     71         virtual ~Sink();
     72         /**
     73          * Adds a reset.
     74          * strength=UCOL_IDENTICAL for &str.
     75          * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
     76          */
     77         virtual void addReset(int32_t strength, const UnicodeString &str,
     78                               const char *&errorReason, UErrorCode &errorCode) = 0;
     79         /**
     80          * Adds a relation with strength and prefix | str / extension.
     81          */
     82         virtual void addRelation(int32_t strength, const UnicodeString &prefix,
     83                                  const UnicodeString &str, const UnicodeString &extension,
     84                                  const char *&errorReason, UErrorCode &errorCode) = 0;
     85 
                /**
                 * Receives a set of characters.
                 * NOTE(review): presumably the set named by a suppress-contractions
                 * rule option -- confirm against the implementation file.
                 */
     86         virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
     87                                           UErrorCode &errorCode);
     88 
                /**
                 * Receives a set of characters.
                 * NOTE(review): presumably the set named by an optimize rule option --
                 * confirm against the implementation file.
                 */
     89         virtual void optimize(const UnicodeSet &set, const char *&errorReason,
     90                               UErrorCode &errorCode);
     91     };
     92 
            /**
             * Interface for looking up the rules of another collator,
             * needed for [import locale] syntax (see the constructor comment).
             */
     93     class U_I18N_API Importer : public UObject {
     94     public:
     95         virtual ~Importer();
                /**
                 * Returns a pointer to the rule string for localeID and collationType.
                 * Failure is reported through errorReason and errorCode.
                 * NOTE(review): ownership of the returned string and the meaning of a
                 * NULL return are not specified in this header -- confirm with implementers.
                 */
     96         virtual const UnicodeString *getRules(
     97                 const char *localeID, const char *collationType,
     98                 const char *&errorReason, UErrorCode &errorCode) = 0;
     99     };
    100 
    101     /**
    102      * Constructor.
    103      * The Sink must be set before parsing.
    104      * The Importer can be set, otherwise [import locale] syntax is not supported.
    105      */
    106     CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
    107     ~CollationRuleParser();
    108 
    109     /**
    110      * Sets the pointer to a Sink object.
    111      * The pointer is aliased: Pointer copy without cloning or taking ownership.
    112      */
    113     void setSink(Sink *sinkAlias) {
    114         sink = sinkAlias;
    115     }
    116 
    117     /**
    118      * Sets the pointer to an Importer object.
    119      * The pointer is aliased: Pointer copy without cloning or taking ownership.
    120      */
    121     void setImporter(Importer *importerAlias) {
    122         importer = importerAlias;
    123     }
    124 
            /**
             * Parses ruleString.
             * As the parameter names indicate, outSettings and outParseError are outputs,
             * and errorCode carries the error status.
             * NOTE(review): whether outParseError may be NULL is not visible in this
             * header -- confirm against the implementation file.
             */
    125     void parse(const UnicodeString &ruleString,
    126                CollationSettings &outSettings,
    127                UParseError *outParseError,
    128                UErrorCode &errorCode);
    129 
            /** Returns the current value of the errorReason member. */
    130     const char *getErrorReason() const { return errorReason; }
    131 
    132     /**
    133      * Gets a script or reorder code from its string representation.
    134      * @return the script/reorder code, or
    135      * -1==UCOL_REORDER_CODE_DEFAULT, or
    136      * -2 if not recognized
    137      */
    138     static int32_t getReorderCode(const char *word);
    139 
    140 private:
    141     /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    142     static const int32_t STRENGTH_MASK = 0xf;
            // STRENGTH_MASK covers bits 0..3, STARRED_FLAG is bit 4, OFFSET_SHIFT is 8.
            // NOTE(review): presumably these describe how parseRelationOperator() packs
            // strength, a "starred" marker and an offset into its int32_t result --
            // confirm against the implementation file.
    143     static const int32_t STARRED_FLAG = 0x10;
    144     static const int32_t OFFSET_SHIFT = 8;
    145 
            // Internal parsing steps; all are defined outside this header.
            // Functions that take an int32_t i and return int32_t follow the same
            // shape as parseSpecialPosition(), which is documented to return the rule
            // index after the parsed item.
            // NOTE(review): that the others return an index too is an assumption.
    146     void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
    147     void parseRuleChain(UErrorCode &errorCode);
    148     int32_t parseResetAndPosition(UErrorCode &errorCode);
    149     int32_t parseRelationOperator(UErrorCode &errorCode);
    150     void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
    151     void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
    152     int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
    153     int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
    154 
    155     /**
    156      * Sets str to a contraction of U+FFFE and (U+2800 + Position).
    157      * @return rule index after the special reset position
    158      */
    159     int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
    160     void parseSetting(UErrorCode &errorCode);
    161     void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
            /**
             * Maps the string s to a UColAttributeValue.
             * NOTE(review): the accepted strings and the result for unrecognized
             * input are not visible in this header.
             */
    162     static UColAttributeValue getOnOffValue(const UnicodeString &s);
    163 
    164     int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
    165     int32_t readWords(int32_t i, UnicodeString &raw) const;
    166     int32_t skipComment(int32_t i) const;
    167 
            // Error reporting helpers. setParseError() takes the reason text and the
            // error code to update.
            // NOTE(review): presumably it also stores the reason in errorReason and
            // fills parseError via setErrorContext() -- confirm.
    168     void setParseError(const char *reason, UErrorCode &errorCode);
    169     void setErrorContext();
    170 
    171     /**
    172      * ASCII [:P:] and [:S:]:
    173      * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
    174      */
    175     static UBool isSyntaxChar(UChar32 c);
    176     int32_t skipWhiteSpace(int32_t i) const;
    177 
            // Two normalizer references. As reference members they must be bound in the
            // constructor's initializer list, in this declaration order (nfd, then nfc).
    178     const Normalizer2 &nfd, &nfc;
    179 
            // Parser state. All of the pointers below are raw; the destructor's
            // behaviour toward them is defined outside this header.
            // NOTE(review): rules, settings and parseError presumably alias the
            // arguments of the public parse() for the duration of a call -- confirm.
    180     const UnicodeString *rules;
            // baseData is a const pointer: it cannot be reseated after construction.
    181     const CollationData *const baseData;
    182     CollationSettings *settings;
    183     UParseError *parseError;
            // Value returned by getErrorReason().
    184     const char *errorReason;
    185 
            // Aliased (not owned) pointers stored by setSink() and setImporter().
    186     Sink *sink;
    187     Importer *importer;
    188 
            // NOTE(review): presumably the current index into *rules -- confirm.
    189     int32_t ruleIndex;
    190 };
    191 
    192 U_NAMESPACE_END
    193 
    194 #endif  // !UCONFIG_NO_COLLATION
    195 #endif  // __COLLATIONRULEPARSER_H__
    196