Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *
      7 * identifier_info.h
      8 *
      9 * created on: 2013 Jan 7
     10 * created by: Andy Heninger
     11 */
     12 
     13 #ifndef __IDENTIFIER_INFO_H__
     14 #define __IDENTIFIER_INFO_H__
     15 
     16 #include "unicode/utypes.h"
     17 
     18 #include "unicode/uniset.h"
     19 #include "unicode/uspoof.h"
     20 #include "uhash.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 class ScriptSet;
     25 
     26 // TODO(andy): review consistency of reference vs pointer arguments to the functions.
     27 
     28 /**
     29  * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
     30  * then setIdentifier. Available methods include:
     31  * <ol>
     32  * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
     33  * each of these.
     34  * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
     35  * either Katakana or Hiragana.
     36  * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
     37  * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
     38  * the identifier.
     39  * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
     40  * </ol>
     41  *
     42  * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
     43  */
     44 class U_I18N_API IdentifierInfo : public UMemory {
     45 
     46   public:
     47     /**
     48      * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     49      * @internal
     50      */
     51     IdentifierInfo(UErrorCode &status);
     52 
     53     /**
     54       * Destructor
     55       */
     56     virtual ~IdentifierInfo();
     57 
     58   private:
     59     /* Disallow copying for now. Can be added if there's a need. */
     60     IdentifierInfo(const IdentifierInfo &other);
     61 
     62   public:
     63 
     64     /**
     65      * Set the identifier profile: the characters that are to be allowed in the identifier.
     66      *
     67      * @param identifierProfile the characters that are to be allowed in the identifier
     68      * @return this
     69      * @internal
     70      */
     71     IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
     72 
     73     /**
     74      * Get the identifier profile: the characters that are to be allowed in the identifier.
     75      *
     76      * @return The characters that are to be allowed in the identifier.
     77      * @internal
     78      */
     79     const UnicodeSet &getIdentifierProfile() const;
     80 
     81 
     82     /**
     83      * Set an identifier to analyze. Afterwards, call methods like getScripts()
     84      *
     85      * @param identifier the identifier to analyze
     86      * @param status Errorcode, set if errors occur.
     87      * @return this
     88      * @internal
     89      */
     90     IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
     91 
     92 
     93     /**
     94      * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     95      * and must not be deleted by the caller.
     96      *
     97      * @return the identifier that was analyzed.
     98      * @internal
     99      */
    100     const UnicodeString *getIdentifier() const;
    101 
    102 
    103     /**
    104      * Get the scripts found in the identifiers.
    105      *
    106      * @return the set of explicit scripts.
    107      * @internal
    108      */
    109     const ScriptSet *getScripts() const;
    110 
    111     /**
    112      * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
    113      * the set consisting of those scripts will be returned.
    114      *
    115      * @return a uhash, with each key being of type (ScriptSet *).
    116      *         This is a set, not a map, so the value stored in the uhash is not relevant.
    117      *         (It is, in fact, 1).
    118      *         Ownership of the uhash and its contents remains with the IndetifierInfo object,
    119      *         and remains valid until a new identifer is set or until the object is deleted.
    120      * @internal
    121      */
    122     const UHashtable *getAlternates() const;
    123 
    124     /**
    125      * Get the representative characters (zeros) for the numerics found in the identifier.
    126      *
    127      * @return the set of explicit scripts.
    128      * @internal
    129      */
    130     const UnicodeSet *getNumerics() const;
    131 
    132     /**
    133      * Find out which scripts are in common among the alternates.
    134      *
    135      * @return the set of scripts that are in common among the alternates.
    136      * @internal
    137      */
    138     const ScriptSet *getCommonAmongAlternates() const;
    139 
    140     /**
    141       * Get the number of scripts appearing in the identifier.
    142       *   Note: Common and Inherited scripts are omitted from the count.
    143       *   Note: Result may be high when the identifier contains characters
    144       *         with alternate scripts. The distinction between
    145       *         0, 1 and > 1 will remain valid, however.
    146       * @return the number of scripts.
    147       */
    148     int32_t getScriptCount() const;
    149 
    150 #if !UCONFIG_NO_NORMALIZATION
    151 
    152     /**
    153      * Find the "tightest" restriction level that the identifier satisfies.
    154      *
    155      * @return the restriction level.
    156      * @internal
    157      */
    158     URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
    159 
    160 #endif /*!UCONFIG_NO_NORMALIZATION */
    161 
    162     UnicodeString toString() const;
    163 
    164     /**
    165      * Produce a readable string of alternates.
    166      *
    167      * @param alternates a UHashtable of UScriptSets.
    168      *        Keys only, no meaningful values in the UHash.
    169      * @return display form
    170      * @internal
    171      */
    172     static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
    173 
    174     /**
    175      * Static memory cleanup function.
    176      * @internal
    177      */
    178     static UBool      cleanup();
    179   private:
    180 
    181     IdentifierInfo  & clear();
    182     UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
    183 
    184     UnicodeString     *fIdentifier;
    185     ScriptSet         *fRequiredScripts;
    186     UHashtable        *fScriptSetSet;
    187     ScriptSet         *fCommonAmongAlternates;
    188     UnicodeSet        *fNumerics;
    189     UnicodeSet        *fIdentifierProfile;
    190 
    191     static UnicodeSet *ASCII;
    192     static ScriptSet  *JAPANESE;
    193     static ScriptSet  *CHINESE;
    194     static ScriptSet  *KOREAN;
    195     static ScriptSet  *CONFUSABLE_WITH_LATIN;
    196 
    197 
    198 
    199 };
    200 
    201 U_NAMESPACE_END
    202 
    203 #endif // __IDENTIFIER_INFO_H__
    204 
    205