/*
*******************************************************************************
*
*   Copyright (C) 2009-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/
     16 
     17 #ifndef __NORMALIZER2_H__
     18 #define __NORMALIZER2_H__
     19 
     20 /**
     21  * \file
     22  * \brief C++ API: New API for Unicode Normalization.
     23  */
     24 
     25 #include "unicode/utypes.h"
     26 
     27 #if !UCONFIG_NO_NORMALIZATION
     28 
     29 #include "unicode/uniset.h"
     30 #include "unicode/unistr.h"
     31 #include "unicode/unorm2.h"
     32 
     33 U_NAMESPACE_BEGIN
     34 
     35 /**
     36  * Unicode normalization functionality for standard Unicode normalization or
     37  * for using custom mapping tables.
     38  * All instances of this class are unmodifiable/immutable.
     39  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
     40  * The Normalizer2 class is not intended for public subclassing.
     41  *
     42  * The primary functions are to produce a normalized string and to detect whether
     43  * a string is already normalized.
     44  * The most commonly used normalization forms are those defined in
     45  * http://www.unicode.org/unicode/reports/tr15/
     46  * However, this API supports additional normalization forms for specialized purposes.
     47  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
     48  * and can be used in implementations of UTS #46.
     49  *
     50  * Not only are the standard compose and decompose modes supplied,
     51  * but additional modes are provided as documented in the Mode enum.
     52  *
     53  * Some of the functions in this class identify normalization boundaries.
     54  * At a normalization boundary, the portions of the string
     55  * before it and starting from it do not interact and can be handled independently.
     56  *
     57  * The spanQuickCheckYes() stops at a normalization boundary.
     58  * When the goal is a normalized string, then the text before the boundary
     59  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
     60  *
     61  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
     62  * a character is guaranteed to be at a normalization boundary,
     63  * regardless of context.
     64  * This is used for moving from one normalization boundary to the next
     65  * or preceding boundary, and for performing iterative normalization.
     66  *
     67  * Iterative normalization is useful when only a small portion of a
     68  * longer string needs to be processed.
     69  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
     70  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
     71  * (to process only the substring for which sort key bytes are computed).
     72  *
     73  * The set of normalization boundaries returned by these functions may not be
     74  * complete: There may be more boundaries that could be returned.
     75  * Different functions may return different boundaries.
     76  * @stable ICU 4.4
     77  */
     78 class U_COMMON_API Normalizer2 : public UObject {
     79 public:
     80     /**
     81      * Returns a Normalizer2 instance which uses the specified data file
     82      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
     83      * and which composes or decomposes text according to the specified mode.
     84      * Returns an unmodifiable singleton instance. Do not delete it.
     85      *
     86      * Use packageName=NULL for data files that are part of ICU's own data.
     87      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
     88      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
     89      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
     90      *
     91      * @param packageName NULL for ICU built-in data, otherwise application data package name
     92      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
     93      * @param mode normalization mode (compose or decompose etc.)
     94      * @param errorCode Standard ICU error code. Its input value must
     95      *                  pass the U_SUCCESS() test, or else the function returns
     96      *                  immediately. Check for U_FAILURE() on output or use with
     97      *                  function chaining. (See User Guide for details.)
     98      * @return the requested Normalizer2, if successful
     99      * @stable ICU 4.4
    100      */
    101     static const Normalizer2 *
    102     getInstance(const char *packageName,
    103                 const char *name,
    104                 UNormalization2Mode mode,
    105                 UErrorCode &errorCode);
    106 
    107     /**
    108      * Returns the normalized form of the source string.
    109      * @param src source string
    110      * @param errorCode Standard ICU error code. Its input value must
    111      *                  pass the U_SUCCESS() test, or else the function returns
    112      *                  immediately. Check for U_FAILURE() on output or use with
    113      *                  function chaining. (See User Guide for details.)
    114      * @return normalized src
    115      * @stable ICU 4.4
    116      */
    117     UnicodeString
    118     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
    119         UnicodeString result;
    120         normalize(src, result, errorCode);
    121         return result;
    122     }
    123     /**
    124      * Writes the normalized form of the source string to the destination string
    125      * (replacing its contents) and returns the destination string.
    126      * The source and destination strings must be different objects.
    127      * @param src source string
    128      * @param dest destination string; its contents is replaced with normalized src
    129      * @param errorCode Standard ICU error code. Its input value must
    130      *                  pass the U_SUCCESS() test, or else the function returns
    131      *                  immediately. Check for U_FAILURE() on output or use with
    132      *                  function chaining. (See User Guide for details.)
    133      * @return dest
    134      * @stable ICU 4.4
    135      */
    136     virtual UnicodeString &
    137     normalize(const UnicodeString &src,
    138               UnicodeString &dest,
    139               UErrorCode &errorCode) const = 0;
    140     /**
    141      * Appends the normalized form of the second string to the first string
    142      * (merging them at the boundary) and returns the first string.
    143      * The result is normalized if the first string was normalized.
    144      * The first and second strings must be different objects.
    145      * @param first string, should be normalized
    146      * @param second string, will be normalized
    147      * @param errorCode Standard ICU error code. Its input value must
    148      *                  pass the U_SUCCESS() test, or else the function returns
    149      *                  immediately. Check for U_FAILURE() on output or use with
    150      *                  function chaining. (See User Guide for details.)
    151      * @return first
    152      * @stable ICU 4.4
    153      */
    154     virtual UnicodeString &
    155     normalizeSecondAndAppend(UnicodeString &first,
    156                              const UnicodeString &second,
    157                              UErrorCode &errorCode) const = 0;
    158     /**
    159      * Appends the second string to the first string
    160      * (merging them at the boundary) and returns the first string.
    161      * The result is normalized if both the strings were normalized.
    162      * The first and second strings must be different objects.
    163      * @param first string, should be normalized
    164      * @param second string, should be normalized
    165      * @param errorCode Standard ICU error code. Its input value must
    166      *                  pass the U_SUCCESS() test, or else the function returns
    167      *                  immediately. Check for U_FAILURE() on output or use with
    168      *                  function chaining. (See User Guide for details.)
    169      * @return first
    170      * @stable ICU 4.4
    171      */
    172     virtual UnicodeString &
    173     append(UnicodeString &first,
    174            const UnicodeString &second,
    175            UErrorCode &errorCode) const = 0;
    176 
    177     /**
    178      * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
    179      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
    180      * This function is independent of the mode of the Normalizer2.
    181      * @param c code point
    182      * @param decomposition String object which will be set to c's
    183      *                      decomposition mapping, if there is one.
    184      * @return TRUE if c has a decomposition, otherwise FALSE
    185      * @draft ICU 4.6
    186      */
    187     virtual UBool
    188     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
    189 
    190     /**
    191      * Tests if the string is normalized.
    192      * Internally, in cases where the quickCheck() method would return "maybe"
    193      * (which is only possible for the two COMPOSE modes) this method
    194      * resolves to "yes" or "no" to provide a definitive result,
    195      * at the cost of doing more work in those cases.
    196      * @param s input string
    197      * @param errorCode Standard ICU error code. Its input value must
    198      *                  pass the U_SUCCESS() test, or else the function returns
    199      *                  immediately. Check for U_FAILURE() on output or use with
    200      *                  function chaining. (See User Guide for details.)
    201      * @return TRUE if s is normalized
    202      * @stable ICU 4.4
    203      */
    204     virtual UBool
    205     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    206 
    207     /**
    208      * Tests if the string is normalized.
    209      * For the two COMPOSE modes, the result could be "maybe" in cases that
    210      * would take a little more work to resolve definitively.
    211      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
    212      * combination of quick check + normalization, to avoid
    213      * re-checking the "yes" prefix.
    214      * @param s input string
    215      * @param errorCode Standard ICU error code. Its input value must
    216      *                  pass the U_SUCCESS() test, or else the function returns
    217      *                  immediately. Check for U_FAILURE() on output or use with
    218      *                  function chaining. (See User Guide for details.)
    219      * @return UNormalizationCheckResult
    220      * @stable ICU 4.4
    221      */
    222     virtual UNormalizationCheckResult
    223     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    224 
    225     /**
    226      * Returns the end of the normalized substring of the input string.
    227      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
    228      * the substring <code>UnicodeString(s, 0, end)</code>
    229      * will pass the quick check with a "yes" result.
    230      *
    231      * The returned end index is usually one or more characters before the
    232      * "no" or "maybe" character: The end index is at a normalization boundary.
    233      * (See the class documentation for more about normalization boundaries.)
    234      *
    235      * When the goal is a normalized string and most input strings are expected
    236      * to be normalized already, then call this method,
    237      * and if it returns a prefix shorter than the input string,
    238      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
    239      * @param s input string
    240      * @param errorCode Standard ICU error code. Its input value must
    241      *                  pass the U_SUCCESS() test, or else the function returns
    242      *                  immediately. Check for U_FAILURE() on output or use with
    243      *                  function chaining. (See User Guide for details.)
    244      * @return "yes" span end index
    245      * @stable ICU 4.4
    246      */
    247     virtual int32_t
    248     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    249 
    250     /**
    251      * Tests if the character always has a normalization boundary before it,
    252      * regardless of context.
    253      * If true, then the character does not normalization-interact with
    254      * preceding characters.
    255      * In other words, a string containing this character can be normalized
    256      * by processing portions before this character and starting from this
    257      * character independently.
    258      * This is used for iterative normalization. See the class documentation for details.
    259      * @param c character to test
    260      * @return TRUE if c has a normalization boundary before it
    261      * @stable ICU 4.4
    262      */
    263     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
    264 
    265     /**
    266      * Tests if the character always has a normalization boundary after it,
    267      * regardless of context.
    268      * If true, then the character does not normalization-interact with
    269      * following characters.
    270      * In other words, a string containing this character can be normalized
    271      * by processing portions up to this character and after this
    272      * character independently.
    273      * This is used for iterative normalization. See the class documentation for details.
    274      * Note that this operation may be significantly slower than hasBoundaryBefore().
    275      * @param c character to test
    276      * @return TRUE if c has a normalization boundary after it
    277      * @stable ICU 4.4
    278      */
    279     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
    280 
    281     /**
    282      * Tests if the character is normalization-inert.
    283      * If true, then the character does not change, nor normalization-interact with
    284      * preceding or following characters.
    285      * In other words, a string containing this character can be normalized
    286      * by processing portions before this character and after this
    287      * character independently.
    288      * This is used for iterative normalization. See the class documentation for details.
    289      * Note that this operation may be significantly slower than hasBoundaryBefore().
    290      * @param c character to test
    291      * @return TRUE if c is normalization-inert
    292      * @stable ICU 4.4
    293      */
    294     virtual UBool isInert(UChar32 c) const = 0;
    295 
    296 private:
    297     // No ICU "poor man's RTTI" for this class nor its subclasses.
    298     virtual UClassID getDynamicClassID() const;
    299 };
    300 
    301 /**
    302  * Normalization filtered by a UnicodeSet.
    303  * Normalizes portions of the text contained in the filter set and leaves
    304  * portions not contained in the filter set unchanged.
    305  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
    306  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
    307  * This class implements all of (and only) the Normalizer2 API.
    308  * An instance of this class is unmodifiable/immutable but is constructed and
    309  * must be destructed by the owner.
    310  * @stable ICU 4.4
    311  */
    312 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
    313 public:
    314     /**
    315      * Constructs a filtered normalizer wrapping any Normalizer2 instance
    316      * and a filter set.
    317      * Both are aliased and must not be modified or deleted while this object
    318      * is used.
    319      * The filter set should be frozen; otherwise the performance will suffer greatly.
    320      * @param n2 wrapped Normalizer2 instance
    321      * @param filterSet UnicodeSet which determines the characters to be normalized
    322      * @stable ICU 4.4
    323      */
    324     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
    325             norm2(n2), set(filterSet) {}
    326 
    327     /**
    328      * Writes the normalized form of the source string to the destination string
    329      * (replacing its contents) and returns the destination string.
    330      * The source and destination strings must be different objects.
    331      * @param src source string
    332      * @param dest destination string; its contents is replaced with normalized src
    333      * @param errorCode Standard ICU error code. Its input value must
    334      *                  pass the U_SUCCESS() test, or else the function returns
    335      *                  immediately. Check for U_FAILURE() on output or use with
    336      *                  function chaining. (See User Guide for details.)
    337      * @return dest
    338      * @stable ICU 4.4
    339      */
    340     virtual UnicodeString &
    341     normalize(const UnicodeString &src,
    342               UnicodeString &dest,
    343               UErrorCode &errorCode) const;
    344     /**
    345      * Appends the normalized form of the second string to the first string
    346      * (merging them at the boundary) and returns the first string.
    347      * The result is normalized if the first string was normalized.
    348      * The first and second strings must be different objects.
    349      * @param first string, should be normalized
    350      * @param second string, will be normalized
    351      * @param errorCode Standard ICU error code. Its input value must
    352      *                  pass the U_SUCCESS() test, or else the function returns
    353      *                  immediately. Check for U_FAILURE() on output or use with
    354      *                  function chaining. (See User Guide for details.)
    355      * @return first
    356      * @stable ICU 4.4
    357      */
    358     virtual UnicodeString &
    359     normalizeSecondAndAppend(UnicodeString &first,
    360                              const UnicodeString &second,
    361                              UErrorCode &errorCode) const;
    362     /**
    363      * Appends the second string to the first string
    364      * (merging them at the boundary) and returns the first string.
    365      * The result is normalized if both the strings were normalized.
    366      * The first and second strings must be different objects.
    367      * @param first string, should be normalized
    368      * @param second string, should be normalized
    369      * @param errorCode Standard ICU error code. Its input value must
    370      *                  pass the U_SUCCESS() test, or else the function returns
    371      *                  immediately. Check for U_FAILURE() on output or use with
    372      *                  function chaining. (See User Guide for details.)
    373      * @return first
    374      * @stable ICU 4.4
    375      */
    376     virtual UnicodeString &
    377     append(UnicodeString &first,
    378            const UnicodeString &second,
    379            UErrorCode &errorCode) const;
    380 
    381     /**
    382      * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
    383      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
    384      * This function is independent of the mode of the Normalizer2.
    385      * @param c code point
    386      * @param decomposition String object which will be set to c's
    387      *                      decomposition mapping, if there is one.
    388      * @return TRUE if c has a decomposition, otherwise FALSE
    389      * @draft ICU 4.6
    390      */
    391     virtual UBool
    392     getDecomposition(UChar32 c, UnicodeString &decomposition) const;
    393 
    394     /**
    395      * Tests if the string is normalized.
    396      * For details see the Normalizer2 base class documentation.
    397      * @param s input string
    398      * @param errorCode Standard ICU error code. Its input value must
    399      *                  pass the U_SUCCESS() test, or else the function returns
    400      *                  immediately. Check for U_FAILURE() on output or use with
    401      *                  function chaining. (See User Guide for details.)
    402      * @return TRUE if s is normalized
    403      * @stable ICU 4.4
    404      */
    405     virtual UBool
    406     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
    407     /**
    408      * Tests if the string is normalized.
    409      * For details see the Normalizer2 base class documentation.
    410      * @param s input string
    411      * @param errorCode Standard ICU error code. Its input value must
    412      *                  pass the U_SUCCESS() test, or else the function returns
    413      *                  immediately. Check for U_FAILURE() on output or use with
    414      *                  function chaining. (See User Guide for details.)
    415      * @return UNormalizationCheckResult
    416      * @stable ICU 4.4
    417      */
    418     virtual UNormalizationCheckResult
    419     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
    420     /**
    421      * Returns the end of the normalized substring of the input string.
    422      * For details see the Normalizer2 base class documentation.
    423      * @param s input string
    424      * @param errorCode Standard ICU error code. Its input value must
    425      *                  pass the U_SUCCESS() test, or else the function returns
    426      *                  immediately. Check for U_FAILURE() on output or use with
    427      *                  function chaining. (See User Guide for details.)
    428      * @return "yes" span end index
    429      * @stable ICU 4.4
    430      */
    431     virtual int32_t
    432     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
    433 
    434     /**
    435      * Tests if the character always has a normalization boundary before it,
    436      * regardless of context.
    437      * For details see the Normalizer2 base class documentation.
    438      * @param c character to test
    439      * @return TRUE if c has a normalization boundary before it
    440      * @stable ICU 4.4
    441      */
    442     virtual UBool hasBoundaryBefore(UChar32 c) const;
    443 
    444     /**
    445      * Tests if the character always has a normalization boundary after it,
    446      * regardless of context.
    447      * For details see the Normalizer2 base class documentation.
    448      * @param c character to test
    449      * @return TRUE if c has a normalization boundary after it
    450      * @stable ICU 4.4
    451      */
    452     virtual UBool hasBoundaryAfter(UChar32 c) const;
    453 
    454     /**
    455      * Tests if the character is normalization-inert.
    456      * For details see the Normalizer2 base class documentation.
    457      * @param c character to test
    458      * @return TRUE if c is normalization-inert
    459      * @stable ICU 4.4
    460      */
    461     virtual UBool isInert(UChar32 c) const;
    462 private:
    463     UnicodeString &
    464     normalize(const UnicodeString &src,
    465               UnicodeString &dest,
    466               USetSpanCondition spanCondition,
    467               UErrorCode &errorCode) const;
    468 
    469     UnicodeString &
    470     normalizeSecondAndAppend(UnicodeString &first,
    471                              const UnicodeString &second,
    472                              UBool doNormalize,
    473                              UErrorCode &errorCode) const;
    474 
    475     const Normalizer2 &norm2;
    476     const UnicodeSet &set;
    477 };
    478 
    479 U_NAMESPACE_END
    480 
    481 #endif  // !UCONFIG_NO_NORMALIZATION
    482 #endif  // __NORMALIZER2_H__
    483