Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  *   Copyright (C) 2009-2016, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 
     11 package android.icu.text;
     12 
     13 import java.io.IOException;
     14 import java.io.InputStream;
     15 import java.nio.ByteBuffer;
     16 
     17 import android.icu.impl.ICUBinary;
     18 import android.icu.impl.Norm2AllModes;
     19 import android.icu.util.ICUUncheckedIOException;
     20 
     21 /**
     22  * Unicode normalization functionality for standard Unicode normalization or
     23  * for using custom mapping tables.
     24  * All instances of this class are unmodifiable/immutable.
     25  * The Normalizer2 class is not intended for public subclassing.
     26  * <p>
     27  * The primary functions are to produce a normalized string and to detect whether
     28  * a string is already normalized.
     29  * The most commonly used normalization forms are those defined in
     30  * http://www.unicode.org/unicode/reports/tr15/
     31  * However, this API supports additional normalization forms for specialized purposes.
 * For example, NFKC_Casefold is provided via getInstance(null, "nfkc_cf", Mode.COMPOSE)
 * and can be used in implementations of UTS #46.
     34  * <p>
     35  * Not only are the standard compose and decompose modes supplied,
     36  * but additional modes are provided as documented in the Mode enum.
     37  * <p>
     38  * Some of the functions in this class identify normalization boundaries.
     39  * At a normalization boundary, the portions of the string
     40  * before it and starting from it do not interact and can be handled independently.
     41  * <p>
     42  * The spanQuickCheckYes() stops at a normalization boundary.
     43  * When the goal is a normalized string, then the text before the boundary
     44  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
     45  * <p>
     46  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
     47  * a character is guaranteed to be at a normalization boundary,
     48  * regardless of context.
     49  * This is used for moving from one normalization boundary to the next
     50  * or preceding boundary, and for performing iterative normalization.
     51  * <p>
     52  * Iterative normalization is useful when only a small portion of a
     53  * longer string needs to be processed.
     54  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
     55  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
     56  * (to process only the substring for which sort key bytes are computed).
     57  * <p>
     58  * The set of normalization boundaries returned by these functions may not be
     59  * complete: There may be more boundaries that could be returned.
     60  * Different functions may return different boundaries.
     61  * @author Markus W. Scherer
     62  */
     63 public abstract class Normalizer2 {
    /**
     * Constants for normalization modes.
     * A mode is passed to {@code getInstance(InputStream, String, Mode)} to choose
     * which kind of normalizer is returned for a given set of mapping data.
     * For details about standard Unicode normalization forms
     * and about the algorithms which are also used with custom mapping tables
     * see http://www.unicode.org/unicode/reports/tr15/
     */
    public enum Mode {
        /**
         * Decomposition followed by composition.
         * <ul>
         * <li>Same as standard NFC when using an "nfc" instance.
         * <li>Same as standard NFKC when using an "nfkc" instance.
         * </ul>
         * For details about standard Unicode normalization forms
         * see http://www.unicode.org/unicode/reports/tr15/
         */
        COMPOSE,
        /**
         * Map, and reorder canonically.
         * <ul>
         * <li>Same as standard NFD when using an "nfc" instance.
         * <li>Same as standard NFKD when using an "nfkc" instance.
         * </ul>
         * For details about standard Unicode normalization forms
         * see http://www.unicode.org/unicode/reports/tr15/
         */
        DECOMPOSE,
        /**
         * "Fast C or D" form.
         * If a string is in this form, then further decomposition <i>without reordering</i>
         * would yield the same form as {@code DECOMPOSE}.
         * Text in "Fast C or D" form can be processed efficiently with data tables
         * that are "canonically closed", that is, that provide equivalent data for
         * equivalent text, without having to be fully normalized.<br>
         * Not a standard Unicode normalization form.<br>
         * Not a unique form: Different FCD strings can be canonically equivalent.<br>
         * For details see http://www.unicode.org/notes/tn5/#FCD
         */
        FCD,
        /**
         * Compose only contiguously.
         * Also known as "FCC" or "Fast C Contiguous".
         * The result will often but not always be in NFC.
         * The result will conform to FCD which is useful for processing.<br>
         * Not a standard Unicode normalization form.<br>
         * For details see http://www.unicode.org/notes/tn5/#FCC
         */
        COMPOSE_CONTIGUOUS
    };
    109 
    110     /**
    111      * Returns a Normalizer2 instance for Unicode NFC normalization.
    112      * Same as getInstance(null, "nfc", Mode.COMPOSE).
    113      * Returns an unmodifiable singleton instance.
    114      * @return the requested Normalizer2, if successful
    115      */
    116     public static Normalizer2 getNFCInstance() {
    117         return Norm2AllModes.getNFCInstance().comp;
    118     }
    119 
    120     /**
    121      * Returns a Normalizer2 instance for Unicode NFD normalization.
    122      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
    123      * Returns an unmodifiable singleton instance.
    124      * @return the requested Normalizer2, if successful
    125      */
    126     public static Normalizer2 getNFDInstance() {
    127         return Norm2AllModes.getNFCInstance().decomp;
    128     }
    129 
    130     /**
    131      * Returns a Normalizer2 instance for Unicode NFKC normalization.
    132      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
    133      * Returns an unmodifiable singleton instance.
    134      * @return the requested Normalizer2, if successful
    135      */
    136     public static Normalizer2 getNFKCInstance() {
    137         return Norm2AllModes.getNFKCInstance().comp;
    138     }
    139 
    140     /**
    141      * Returns a Normalizer2 instance for Unicode NFKD normalization.
    142      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
    143      * Returns an unmodifiable singleton instance.
    144      * @return the requested Normalizer2, if successful
    145      */
    146     public static Normalizer2 getNFKDInstance() {
    147         return Norm2AllModes.getNFKCInstance().decomp;
    148     }
    149 
    150     /**
    151      * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
    152      * Same as getInstance(null, "nfkc_cf", Mode.COMPOSE).
    153      * Returns an unmodifiable singleton instance.
    154      * @return the requested Normalizer2, if successful
    155      */
    156     public static Normalizer2 getNFKCCasefoldInstance() {
    157         return Norm2AllModes.getNFKC_CFInstance().comp;
    158     }
    159 
    160     /**
    161      * Returns a Normalizer2 instance which uses the specified data file
    162      * (an ICU data file if data=null, or else custom binary data)
    163      * and which composes or decomposes text according to the specified mode.
    164      * Returns an unmodifiable singleton instance.
    165      * <ul>
    166      * <li>Use data=null for data files that are part of ICU's own data.
    167      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.
    168      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.
    169      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
    170      * </ul>
    171      * If data!=null, then the binary data is read once and cached using the provided
    172      * name as the key.
    173      * If you know or expect the data to be cached already, you can use data!=null
    174      * for non-ICU data as well.
    175      * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}.
    176      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data
    177      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
    178      * @param mode normalization mode (compose or decompose etc.)
    179      * @return the requested Normalizer2, if successful
    180      */
    181     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {
    182         // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer.
    183         ByteBuffer bytes = null;
    184         if (data != null) {
    185             try {
    186                 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data);
    187             } catch (IOException e) {
    188                 throw new ICUUncheckedIOException(e);
    189             }
    190         }
    191         Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name);
    192         switch(mode) {
    193         case COMPOSE: return all2Modes.comp;
    194         case DECOMPOSE: return all2Modes.decomp;
    195         case FCD: return all2Modes.fcd;
    196         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;
    197         default: return null;  // will not occur
    198         }
    199     }
    200 
    201     /**
    202      * Returns the normalized form of the source string.
    203      * @param src source string
    204      * @return normalized src
    205      */
    206     public String normalize(CharSequence src) {
    207         if(src instanceof String) {
    208             // Fastpath: Do not construct a new String if the src is a String
    209             // and is already normalized.
    210             int spanLength=spanQuickCheckYes(src);
    211             if(spanLength==src.length()) {
    212                 return (String)src;
    213             }
    214             if (spanLength != 0) {
    215                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
    216                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
    217             }
    218         }
    219         return normalize(src, new StringBuilder(src.length())).toString();
    220     }
    221 
    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     *
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized {@code src}
     * @return {@code dest}
     */
    public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
    231 
    /**
     * Writes the normalized form of the source string to the destination Appendable
     * and returns the destination Appendable.
     * Unlike the {@code StringBuilder} overload, the normalized text is appended.
     * The source and destination strings must be different objects.
     *
     * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}.
     *
     * @param src source string
     * @param dest destination Appendable; gets normalized {@code src} appended
     * @return {@code dest}
     */
    public abstract Appendable normalize(CharSequence src, Appendable dest);
    244 
    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     *
     * @param first string, should be normalized; it is modified in place
     * @param second string, will be normalized
     * @return {@code first}
     */
    public abstract StringBuilder normalizeSecondAndAppend(
            StringBuilder first, CharSequence second);
    256 
    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     *
     * @param first string, should be normalized; it is modified in place
     * @param second string, should be normalized
     * @return {@code first}
     */
    public abstract StringBuilder append(StringBuilder first, CharSequence second);
    267 
    /**
     * Gets the decomposition mapping of c.
     * Roughly equivalent to normalizing the String form of c
     * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
     * returns {@code null} if c does not have a decomposition mapping in this instance's data.
     * This function is independent of the mode of the Normalizer2.
     *
     * @param c code point
     * @return c's decomposition mapping, if any; otherwise {@code null}
     */
    public abstract String getDecomposition(int c);
    278 
    279     /**
    280      * Gets the raw decomposition mapping of c.
    281      *
    282      * <p>This is similar to the getDecomposition() method but returns the
    283      * raw decomposition mapping as specified in UnicodeData.txt or
    284      * (for custom data) in the mapping files processed by the gennorm2 tool.
    285      * By contrast, getDecomposition() returns the processed,
    286      * recursively-decomposed version of this mapping.
    287      *
    288      * <p>When used on a standard NFKC Normalizer2 instance,
    289      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
    290      *
    291      * <p>When used on a standard NFC Normalizer2 instance,
    292      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
    293      * in this case, the result contains either one or two code points (=1..4 Java chars).
    294      *
    295      * <p>This function is independent of the mode of the Normalizer2.
    296      * The default implementation returns null.
    297      * @param c code point
    298      * @return c's raw decomposition mapping, if any; otherwise null
    299      */
    300     public String getRawDecomposition(int c) { return null; }
    301 
    302     /**
    303      * Performs pairwise composition of a &amp; b and returns the composite if there is one.
    304      *
    305      * <p>Returns a composite code point c only if c has a two-way mapping to a+b.
    306      * In standard Unicode normalization, this means that
    307      * c has a canonical decomposition to a+b
    308      * and c does not have the Full_Composition_Exclusion property.
    309      *
    310      * <p>This function is independent of the mode of the Normalizer2.
    311      * The default implementation returns a negative value.
    312      * @param a A (normalization starter) code point.
    313      * @param b Another code point.
    314      * @return The non-negative composite code point if there is one; otherwise a negative value.
    315      */
    316     public int composePair(int a, int b) { return -1; }
    317 
    318     /**
    319      * Gets the combining class of c.
    320      * The default implementation returns 0
    321      * but all standard implementations return the Unicode Canonical_Combining_Class value.
    322      * @param c code point
    323      * @return c's combining class
    324      */
    325     public int getCombiningClass(int c) { return 0; }
    326 
    /**
     * Tests if the string is normalized.
     * Internally, in cases where the {@link #quickCheck(CharSequence)} method would
     * return "maybe" (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     *
     * @param s input string
     * @return {@code true} if s is normalized
     */
    public abstract boolean isNormalized(CharSequence s);
    337 
    /**
     * Tests if the string is normalized.
     * For the two COMPOSE modes, the result could be "maybe" in cases that
     * would take a little more work to resolve definitively.
     * Use {@link #spanQuickCheckYes(CharSequence)} and
     * {@link #normalizeSecondAndAppend(StringBuilder, CharSequence)} for a faster
     * combination of quick check + normalization, to avoid
     * re-checking the "yes" prefix.
     *
     * @param s input string
     * @return the quick check result
     */
    public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);
    349 
    /**
     * Returns the end of the normalized substring of the input string.
     * In other words, with <code>end=spanQuickCheckYes(s);</code>
     * the substring <code>s.subSequence(0, end)</code>
     * will pass the quick check with a "yes" result.
     * <p>
     * The returned end index is usually one or more characters before the
     * "no" or "maybe" character: The end index is at a normalization boundary.
     * (See the class documentation for more about normalization boundaries.)
     * <p>
     * When the goal is a normalized string and most input strings are expected
     * to be normalized already, then call this method,
     * and if it returns a prefix shorter than the input string,
     * copy that prefix and use
     * {@link #normalizeSecondAndAppend(StringBuilder, CharSequence)} for the remainder.
     *
     * @param s input string
     * @return "yes" span end index
     */
    public abstract int spanQuickCheckYes(CharSequence s);
    368 
    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * preceding characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and starting from this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     *
     * @param c character (code point) to test
     * @return {@code true} if c has a normalization boundary before it
     */
    public abstract boolean hasBoundaryBefore(int c);
    382 
    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions up to this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * <p>
     * Note that this operation may be significantly slower than
     * {@link #hasBoundaryBefore(int)}.
     *
     * @param c character (code point) to test
     * @return {@code true} if c has a normalization boundary after it
     */
    public abstract boolean hasBoundaryAfter(int c);
    398 
    /**
     * Tests if the character is normalization-inert.
     * If true, then the character does not change, nor normalization-interact with
     * preceding or following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * <p>
     * Note that this operation may be significantly slower than
     * {@link #hasBoundaryBefore(int)}.
     *
     * @param c character (code point) to test
     * @return {@code true} if c is normalization-inert
     */
    public abstract boolean isInert(int c);
    413 
    /**
     * Sole constructor.  (For invocation by subclass constructors,
     * typically implicit.)
     * Takes no arguments and has an empty body.
     * @deprecated This API is ICU internal only.
     * @hide original deprecated declaration
     * @hide draft / provisional / internal are hidden on Android
     */
    @Deprecated
    protected Normalizer2() {
    }
    424 }
    425