Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  normalizer2.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov22
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __NORMALIZER2_H__
     18 #define __NORMALIZER2_H__
     19 
     20 /**
     21  * \file
     22  * \brief C++ API: New API for Unicode Normalization.
     23  */
     24 
     25 #include "unicode/utypes.h"
     26 
     27 #if !UCONFIG_NO_NORMALIZATION
     28 
     29 #include "unicode/uniset.h"
     30 #include "unicode/unistr.h"
     31 #include "unicode/unorm2.h"
     32 
     33 U_NAMESPACE_BEGIN
     34 
     35 /**
     36  * Unicode normalization functionality for standard Unicode normalization or
     37  * for using custom mapping tables.
     38  * All instances of this class are unmodifiable/immutable.
     39  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
     40  *
     41  * The primary functions are to produce a normalized string and to detect whether
     42  * a string is already normalized.
     43  * The most commonly used normalization forms are those defined in
     44  * http://www.unicode.org/unicode/reports/tr15/
     45  * However, this API supports additional normalization forms for specialized purposes.
     46  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
     47  * and can be used in implementations of UTS #46.
     48  *
     49  * Not only are the standard compose and decompose modes supplied,
     50  * but additional modes are provided as documented in the Mode enum.
     51  *
     52  * Some of the functions in this class identify normalization boundaries.
     53  * At a normalization boundary, the portions of the string
     54  * before it and starting from it do not interact and can be handled independently.
     55  *
     56  * The spanQuickCheckYes() stops at a normalization boundary.
     57  * When the goal is a normalized string, then the text before the boundary
     58  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
     59  *
     60  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
     61  * a character is guaranteed to be at a normalization boundary,
     62  * regardless of context.
     63  * This is used for moving from one normalization boundary to the next
     64  * or preceding boundary, and for performing iterative normalization.
     65  *
     66  * Iterative normalization is useful when only a small portion of a
     67  * longer string needs to be processed.
     68  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
     69  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
     70  * (to process only the substring for which sort key bytes are computed).
     71  *
     72  * The set of normalization boundaries returned by these functions may not be
     73  * complete: There may be more boundaries that could be returned.
     74  * Different functions may return different boundaries.
     75  * @draft ICU 4.4
     76  */
     77 class U_COMMON_API Normalizer2 : public UObject {
     78 public:
     79     /**
     80      * Returns a Normalizer2 instance which uses the specified data file
     81      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
     82      * and which composes or decomposes text according to the specified mode.
     83      * Returns an unmodifiable singleton instance. Do not delete it.
     84      *
     85      * Use packageName=NULL for data files that are part of ICU's own data.
     86      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
     87      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
     88      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
     89      *
     90      * @param packageName NULL for ICU built-in data, otherwise application data package name
     91      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
     92      * @param mode normalization mode (compose or decompose etc.)
     93      * @param errorCode Standard ICU error code. Its input value must
     94      *                  pass the U_SUCCESS() test, or else the function returns
     95      *                  immediately. Check for U_FAILURE() on output or use with
     96      *                  function chaining. (See User Guide for details.)
     97      * @return the requested Normalizer2, if successful
     98      * @draft ICU 4.4
     99      */
    100     static const Normalizer2 *
    101     getInstance(const char *packageName,
    102                 const char *name,
    103                 UNormalization2Mode mode,
    104                 UErrorCode &errorCode);
    105 
    106     /**
    107      * Returns the normalized form of the source string.
    108      * @param src source string
    109      * @param errorCode Standard ICU error code. Its input value must
    110      *                  pass the U_SUCCESS() test, or else the function returns
    111      *                  immediately. Check for U_FAILURE() on output or use with
    112      *                  function chaining. (See User Guide for details.)
    113      * @return normalized src
    114      * @draft ICU 4.4
    115      */
    116     UnicodeString
    117     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
    118         UnicodeString result;
    119         normalize(src, result, errorCode);
    120         return result;
    121     }
    122     /**
    123      * Writes the normalized form of the source string to the destination string
    124      * (replacing its contents) and returns the destination string.
    125      * The source and destination strings must be different objects.
    126      * @param src source string
    127      * @param dest destination string; its contents is replaced with normalized src
    128      * @param errorCode Standard ICU error code. Its input value must
    129      *                  pass the U_SUCCESS() test, or else the function returns
    130      *                  immediately. Check for U_FAILURE() on output or use with
    131      *                  function chaining. (See User Guide for details.)
    132      * @return dest
    133      * @draft ICU 4.4
    134      */
    135     virtual UnicodeString &
    136     normalize(const UnicodeString &src,
    137               UnicodeString &dest,
    138               UErrorCode &errorCode) const = 0;
    139     /**
    140      * Appends the normalized form of the second string to the first string
    141      * (merging them at the boundary) and returns the first string.
    142      * The result is normalized if the first string was normalized.
    143      * The first and second strings must be different objects.
    144      * @param first string, should be normalized
    145      * @param second string, will be normalized
    146      * @param errorCode Standard ICU error code. Its input value must
    147      *                  pass the U_SUCCESS() test, or else the function returns
    148      *                  immediately. Check for U_FAILURE() on output or use with
    149      *                  function chaining. (See User Guide for details.)
    150      * @return first
    151      * @draft ICU 4.4
    152      */
    153     virtual UnicodeString &
    154     normalizeSecondAndAppend(UnicodeString &first,
    155                              const UnicodeString &second,
    156                              UErrorCode &errorCode) const = 0;
    157     /**
    158      * Appends the second string to the first string
    159      * (merging them at the boundary) and returns the first string.
    160      * The result is normalized if both the strings were normalized.
    161      * The first and second strings must be different objects.
    162      * @param first string, should be normalized
    163      * @param second string, should be normalized
    164      * @param errorCode Standard ICU error code. Its input value must
    165      *                  pass the U_SUCCESS() test, or else the function returns
    166      *                  immediately. Check for U_FAILURE() on output or use with
    167      *                  function chaining. (See User Guide for details.)
    168      * @return first
    169      * @draft ICU 4.4
    170      */
    171     virtual UnicodeString &
    172     append(UnicodeString &first,
    173            const UnicodeString &second,
    174            UErrorCode &errorCode) const = 0;
    175 
    176     /**
    177      * Tests if the string is normalized.
    178      * Internally, in cases where the quickCheck() method would return "maybe"
    179      * (which is only possible for the two COMPOSE modes) this method
    180      * resolves to "yes" or "no" to provide a definitive result,
    181      * at the cost of doing more work in those cases.
    182      * @param s input string
    183      * @param errorCode Standard ICU error code. Its input value must
    184      *                  pass the U_SUCCESS() test, or else the function returns
    185      *                  immediately. Check for U_FAILURE() on output or use with
    186      *                  function chaining. (See User Guide for details.)
    187      * @return TRUE if s is normalized
    188      * @draft ICU 4.4
    189      */
    190     virtual UBool
    191     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    192 
    193     /**
    194      * Tests if the string is normalized.
    195      * For the two COMPOSE modes, the result could be "maybe" in cases that
    196      * would take a little more work to resolve definitively.
    197      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
    198      * combination of quick check + normalization, to avoid
    199      * re-checking the "yes" prefix.
    200      * @param s input string
    201      * @param errorCode Standard ICU error code. Its input value must
    202      *                  pass the U_SUCCESS() test, or else the function returns
    203      *                  immediately. Check for U_FAILURE() on output or use with
    204      *                  function chaining. (See User Guide for details.)
    205      * @return UNormalizationCheckResult
    206      * @draft ICU 4.4
    207      */
    208     virtual UNormalizationCheckResult
    209     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    210 
    211     /**
    212      * Returns the end of the normalized substring of the input string.
    213      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
    214      * the substring <code>UnicodeString(s, 0, end)</code>
    215      * will pass the quick check with a "yes" result.
    216      *
    217      * The returned end index is usually one or more characters before the
    218      * "no" or "maybe" character: The end index is at a normalization boundary.
    219      * (See the class documentation for more about normalization boundaries.)
    220      *
    221      * When the goal is a normalized string and most input strings are expected
    222      * to be normalized already, then call this method,
    223      * and if it returns a prefix shorter than the input string,
    224      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
    225      * @param s input string
    226      * @param errorCode Standard ICU error code. Its input value must
    227      *                  pass the U_SUCCESS() test, or else the function returns
    228      *                  immediately. Check for U_FAILURE() on output or use with
    229      *                  function chaining. (See User Guide for details.)
    230      * @return "yes" span end index
    231      * @draft ICU 4.4
    232      */
    233     virtual int32_t
    234     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    235 
    236     /**
    237      * Tests if the character always has a normalization boundary before it,
    238      * regardless of context.
    239      * If true, then the character does not normalization-interact with
    240      * preceding characters.
    241      * In other words, a string containing this character can be normalized
    242      * by processing portions before this character and starting from this
    243      * character independently.
    244      * This is used for iterative normalization. See the class documentation for details.
    245      * @param c character to test
    246      * @return TRUE if c has a normalization boundary before it
    247      * @draft ICU 4.4
    248      */
    249     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
    250 
    251     /**
    252      * Tests if the character always has a normalization boundary after it,
    253      * regardless of context.
    254      * If true, then the character does not normalization-interact with
    255      * following characters.
    256      * In other words, a string containing this character can be normalized
    257      * by processing portions up to this character and after this
    258      * character independently.
    259      * This is used for iterative normalization. See the class documentation for details.
    260      * Note that this operation may be significantly slower than hasBoundaryBefore().
    261      * @param c character to test
    262      * @return TRUE if c has a normalization boundary after it
    263      * @draft ICU 4.4
    264      */
    265     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
    266 
    267     /**
    268      * Tests if the character is normalization-inert.
    269      * If true, then the character does not change, nor normalization-interact with
    270      * preceding or following characters.
    271      * In other words, a string containing this character can be normalized
    272      * by processing portions before this character and after this
    273      * character independently.
    274      * This is used for iterative normalization. See the class documentation for details.
    275      * Note that this operation may be significantly slower than hasBoundaryBefore().
    276      * @param c character to test
    277      * @return TRUE if c is normalization-inert
    278      * @draft ICU 4.4
    279      */
    280     virtual UBool isInert(UChar32 c) const = 0;
    281 
    282     /**
    283      * ICU "poor man's RTTI", returns a UClassID for this class.
    284      * @returns a UClassID for this class.
    285      * @draft ICU 4.4
    286      */
    287     static UClassID U_EXPORT2 getStaticClassID();
    288 
    289     /**
    290      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    291      * @return a UClassID for the actual class.
    292      * @draft ICU 4.4
    293      */
    294     virtual UClassID getDynamicClassID() const = 0;
    295 };
    296 
    297 /**
    298  * Normalization filtered by a UnicodeSet.
    299  * Normalizes portions of the text contained in the filter set and leaves
    300  * portions not contained in the filter set unchanged.
    301  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
    302  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
    303  * This class implements all of (and only) the Normalizer2 API.
    304  * An instance of this class is unmodifiable/immutable but is constructed and
    305  * must be destructed by the owner.
    306  * @draft ICU 4.4
    307  */
    308 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
    309 public:
    310     /**
    311      * Constructs a filtered normalizer wrapping any Normalizer2 instance
    312      * and a filter set.
    313      * Both are aliased and must not be modified or deleted while this object
    314      * is used.
    315      * The filter set should be frozen; otherwise the performance will suffer greatly.
    316      * @param n2 wrapped Normalizer2 instance
    317      * @param filterSet UnicodeSet which determines the characters to be normalized
    318      * @draft ICU 4.4
    319      */
    320     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
    321             norm2(n2), set(filterSet) {}
    322 
    323     /**
    324      * Writes the normalized form of the source string to the destination string
    325      * (replacing its contents) and returns the destination string.
    326      * The source and destination strings must be different objects.
    327      * @param src source string
    328      * @param dest destination string; its contents is replaced with normalized src
    329      * @param errorCode Standard ICU error code. Its input value must
    330      *                  pass the U_SUCCESS() test, or else the function returns
    331      *                  immediately. Check for U_FAILURE() on output or use with
    332      *                  function chaining. (See User Guide for details.)
    333      * @return dest
    334      * @draft ICU 4.4
    335      */
    336     virtual UnicodeString &
    337     normalize(const UnicodeString &src,
    338               UnicodeString &dest,
    339               UErrorCode &errorCode) const;
    340     /**
    341      * Appends the normalized form of the second string to the first string
    342      * (merging them at the boundary) and returns the first string.
    343      * The result is normalized if the first string was normalized.
    344      * The first and second strings must be different objects.
    345      * @param first string, should be normalized
    346      * @param second string, will be normalized
    347      * @param errorCode Standard ICU error code. Its input value must
    348      *                  pass the U_SUCCESS() test, or else the function returns
    349      *                  immediately. Check for U_FAILURE() on output or use with
    350      *                  function chaining. (See User Guide for details.)
    351      * @return first
    352      * @draft ICU 4.4
    353      */
    354     virtual UnicodeString &
    355     normalizeSecondAndAppend(UnicodeString &first,
    356                              const UnicodeString &second,
    357                              UErrorCode &errorCode) const;
    358     /**
    359      * Appends the second string to the first string
    360      * (merging them at the boundary) and returns the first string.
    361      * The result is normalized if both the strings were normalized.
    362      * The first and second strings must be different objects.
    363      * @param first string, should be normalized
    364      * @param second string, should be normalized
    365      * @param errorCode Standard ICU error code. Its input value must
    366      *                  pass the U_SUCCESS() test, or else the function returns
    367      *                  immediately. Check for U_FAILURE() on output or use with
    368      *                  function chaining. (See User Guide for details.)
    369      * @return first
    370      * @draft ICU 4.4
    371      */
    372     virtual UnicodeString &
    373     append(UnicodeString &first,
    374            const UnicodeString &second,
    375            UErrorCode &errorCode) const;
    376 
    377     /**
    378      * Tests if the string is normalized.
    379      * For details see the Normalizer2 base class documentation.
    380      * @param s input string
    381      * @param errorCode Standard ICU error code. Its input value must
    382      *                  pass the U_SUCCESS() test, or else the function returns
    383      *                  immediately. Check for U_FAILURE() on output or use with
    384      *                  function chaining. (See User Guide for details.)
    385      * @return TRUE if s is normalized
    386      * @draft ICU 4.4
    387      */
    388     virtual UBool
    389     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
    390     /**
    391      * Tests if the string is normalized.
    392      * For details see the Normalizer2 base class documentation.
    393      * @param s input string
    394      * @param errorCode Standard ICU error code. Its input value must
    395      *                  pass the U_SUCCESS() test, or else the function returns
    396      *                  immediately. Check for U_FAILURE() on output or use with
    397      *                  function chaining. (See User Guide for details.)
    398      * @return UNormalizationCheckResult
    399      * @draft ICU 4.4
    400      */
    401     virtual UNormalizationCheckResult
    402     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
    403     /**
    404      * Returns the end of the normalized substring of the input string.
    405      * For details see the Normalizer2 base class documentation.
    406      * @param s input string
    407      * @param errorCode Standard ICU error code. Its input value must
    408      *                  pass the U_SUCCESS() test, or else the function returns
    409      *                  immediately. Check for U_FAILURE() on output or use with
    410      *                  function chaining. (See User Guide for details.)
    411      * @return "yes" span end index
    412      * @draft ICU 4.4
    413      */
    414     virtual int32_t
    415     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
    416 
    417     /**
    418      * Tests if the character always has a normalization boundary before it,
    419      * regardless of context.
    420      * For details see the Normalizer2 base class documentation.
    421      * @param c character to test
    422      * @return TRUE if c has a normalization boundary before it
    423      * @draft ICU 4.4
    424      */
    425     virtual UBool hasBoundaryBefore(UChar32 c) const;
    426 
    427     /**
    428      * Tests if the character always has a normalization boundary after it,
    429      * regardless of context.
    430      * For details see the Normalizer2 base class documentation.
    431      * @param c character to test
    432      * @return TRUE if c has a normalization boundary after it
    433      * @draft ICU 4.4
    434      */
    435     virtual UBool hasBoundaryAfter(UChar32 c) const;
    436 
    437     /**
    438      * Tests if the character is normalization-inert.
    439      * For details see the Normalizer2 base class documentation.
    440      * @param c character to test
    441      * @return TRUE if c is normalization-inert
    442      * @draft ICU 4.4
    443      */
    444     virtual UBool isInert(UChar32 c) const;
    445 
    446     /**
    447      * ICU "poor man's RTTI", returns a UClassID for this class.
    448      * @returns a UClassID for this class.
    449      * @draft ICU 4.4
    450      */
    451     static UClassID U_EXPORT2 getStaticClassID();
    452 
    453     /**
    454      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    455      * @return a UClassID for the actual class.
    456      * @draft ICU 4.4
    457      */
    458     virtual UClassID getDynamicClassID() const;
    459 private:
    460     UnicodeString &
    461     normalize(const UnicodeString &src,
    462               UnicodeString &dest,
    463               USetSpanCondition spanCondition,
    464               UErrorCode &errorCode) const;
    465 
    466     UnicodeString &
    467     normalizeSecondAndAppend(UnicodeString &first,
    468                              const UnicodeString &second,
    469                              UBool doNormalize,
    470                              UErrorCode &errorCode) const;
    471 
    472     const Normalizer2 &norm2;
    473     const UnicodeSet &set;
    474 };
    475 
    476 U_NAMESPACE_END
    477 
    478 #endif  // !UCONFIG_NO_NORMALIZATION
    479 #endif  // __NORMALIZER2_H__
    480