Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2000-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 import java.nio.CharBuffer;
     11 import java.text.CharacterIterator;
     12 
     13 import com.ibm.icu.impl.Norm2AllModes;
     14 import com.ibm.icu.impl.Normalizer2Impl;
     15 import com.ibm.icu.impl.UCaseProps;
     16 import com.ibm.icu.lang.UCharacter;
     17 import com.ibm.icu.util.ICUCloneNotSupportedException;
     18 
     19 /**
     20  * Old Unicode normalization API.
     21  *
     22  * <p>This API has been replaced by the {@link Normalizer2} class and is only available
     23  * for backward compatibility. This class simply delegates to the Normalizer2 class.
     24  * There are two exceptions: The new API does not provide a replacement for
     25  * <code>QuickCheckResult</code> and <code>compare()</code>.
     26  *
     27  * <p><code>normalize</code> transforms Unicode text into an equivalent composed or
     28  * decomposed form, allowing for easier sorting and searching of text.
     29  * <code>normalize</code> supports the standard normalization forms described in
     30  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     31  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
     32  *
     33  * <p>Characters with accents or other adornments can be encoded in
     34  * several different ways in Unicode.  For example, take the character A-acute.
     35  * In Unicode, this can be encoded as a single character (the
     36  * "composed" form):
     37  *
     38  * <pre>
     39  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
     40  * </pre>
     41  *
     42  * or as two separate characters (the "decomposed" form):
     43  *
     44  * <pre>
     45  *      0041    LATIN CAPITAL LETTER A
     46  *      0301    COMBINING ACUTE ACCENT
     47  * </pre>
     48  *
     49  * <p>To a user of your program, however, both of these sequences should be
     50  * treated as the same "user-level" character "A with acute accent".  When you
     51  * are searching or comparing text, you must ensure that these two sequences are
     52  * treated equivalently.  In addition, you must handle characters with more than
     53  * one accent.  Sometimes the order of a character's combining accents is
     54  * significant, while in other cases accent sequences in different orders are
     55  * really equivalent.
     56  *
     57  * <p>Similarly, the string "ffi" can be encoded as three separate letters:
     58  *
     59  * <pre>
     60  *      0066    LATIN SMALL LETTER F
     61  *      0066    LATIN SMALL LETTER F
     62  *      0069    LATIN SMALL LETTER I
     63  * </pre>
     64  *
     65  * or as the single character
     66  *
     67  * <pre>
     68  *      FB03    LATIN SMALL LIGATURE FFI
     69  * </pre>
     70  *
     71  * <p>The ffi ligature is not a distinct semantic character, and strictly speaking
     72  * it shouldn't be in Unicode at all, but it was included for compatibility
     73  * with existing character sets that already provided it.  The Unicode standard
     74  * identifies such characters by giving them "compatibility" decompositions
     75  * into the corresponding semantic characters.  When sorting and searching, you
     76  * will often want to use these mappings.
     77  *
     78  * <p><code>normalize</code> helps solve these problems by transforming text into
     79  * the canonical composed and decomposed forms as shown in the first example
     80  * above. In addition, you can have it perform compatibility decompositions so
     81  * that you can treat compatibility characters the same as their equivalents.
     82  * Finally, <code>normalize</code> rearranges accents into the proper canonical
     83  * order, so that you do not have to worry about accent rearrangement on your
     84  * own.
     85  *
     86  * <p>Form FCD, "Fast C or D", is also designed for collation.
     87  * It allows to work on strings that are not necessarily normalized
     88  * with an algorithm (like in collation) that works under "canonical closure",
     89  * i.e., it treats precomposed characters and their decomposed equivalents the
     90  * same.
     91  *
     92  * <p>It is not a normalization form because it does not provide for uniqueness of
     93  * representation. Multiple strings may be canonically equivalent (their NFDs
     94  * are identical) and may all conform to FCD without being identical themselves.
     95  *
     96  * <p>The form is defined such that the "raw decomposition", the recursive
     97  * canonical decomposition of each character, results in a string that is
     98  * canonically ordered. This means that precomposed characters are allowed for
     99  * as long as their decompositions do not need canonical reordering.
    100  *
    101  * <p>Its advantage for a process like collation is that all NFD and most NFC texts
    102  * - and many unnormalized texts - already conform to FCD and do not need to be
    103  * normalized (NFD) for such a process. The FCD quick check will return YES for
    104  * most strings in practice.
    105  *
    106  * <p>normalize(FCD) may be implemented with NFD.
    107  *
    108  * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
    109  * http://www.unicode.org/notes/tn5/#FCD
    110  *
    111  * <p>ICU collation performs either NFD or FCD normalization automatically if
    112  * normalization is turned on for the collator object. Beyond collation and
    113  * string search, normalized strings may be useful for string equivalence
    114  * comparisons, transliteration/transcription, unique representations, etc.
    115  *
    116  * <p>The W3C generally recommends to exchange texts in NFC.
    117  * Note also that most legacy character encodings use only precomposed forms and
    118  * often do not encode any combining marks by themselves. For conversion to such
    119  * character encodings the Unicode text needs to be normalized to NFC.
    120  * For more usage examples, see the Unicode Standard Annex.
    121  *
    122  * <p>Note: The Normalizer class also provides API for iterative normalization.
    123  * While the setIndex() and getIndex() refer to indices in the
    124  * underlying Unicode input text, the next() and previous() methods
    125  * iterate through characters in the normalized output.
    126  * This means that there is not necessarily a one-to-one correspondence
    127  * between characters returned by next() and previous() and the indices
    128  * passed to and returned from setIndex() and getIndex().
    129  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
    130  *
    131  * @stable ICU 2.8
    132  */
    133 public final class Normalizer implements Cloneable {
    134     // The input text and our position in it
    135     private UCharacterIterator  text;
    136     private Normalizer2         norm2;
    137     private Mode                mode;
    138     private int                 options;
    139 
    140     // The normalization buffer is the result of normalization
    141     // of the source in [currentIndex..nextIndex[ .
    142     private int                 currentIndex;
    143     private int                 nextIndex;
    144 
    145     // A buffer for holding intermediate results
    146     private StringBuilder       buffer;
    147     private int                 bufferPos;
    148 
    149     // Helper classes to defer loading of normalization data.
    150     private static final class ModeImpl {
    151         private ModeImpl(Normalizer2 n2) {
    152             normalizer2 = n2;
    153         }
    154         private final Normalizer2 normalizer2;
    155     }
    156     private static final class NFDModeImpl {
    157         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
    158     }
    159     private static final class NFKDModeImpl {
    160         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
    161     }
    162     private static final class NFCModeImpl {
    163         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
    164     }
    165     private static final class NFKCModeImpl {
    166         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
    167     }
    168     private static final class FCDModeImpl {
    169         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
    170     }
    171 
    172     private static final class Unicode32 {
    173         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
    174     }
    175     private static final class NFD32ModeImpl {
    176         private static final ModeImpl INSTANCE =
    177             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
    178                                                  Unicode32.INSTANCE));
    179     }
    180     private static final class NFKD32ModeImpl {
    181         private static final ModeImpl INSTANCE =
    182             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
    183                                                  Unicode32.INSTANCE));
    184     }
    185     private static final class NFC32ModeImpl {
    186         private static final ModeImpl INSTANCE =
    187             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
    188                                                  Unicode32.INSTANCE));
    189     }
    190     private static final class NFKC32ModeImpl {
    191         private static final ModeImpl INSTANCE =
    192             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
    193                                                  Unicode32.INSTANCE));
    194     }
    195     private static final class FCD32ModeImpl {
    196         private static final ModeImpl INSTANCE =
    197             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
    198                                                  Unicode32.INSTANCE));
    199     }
    200 
    201     /**
    202      * Options bit set value to select Unicode 3.2 normalization
    203      * (except NormalizationCorrections).
    204      * At most one Unicode version can be selected at a time.
    205      *
    206      * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
    207      */
    208     @Deprecated
    209     public static final int UNICODE_3_2=0x20;
    210 
    211     /**
    212      * Constant indicating that the end of the iteration has been reached.
    213      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
    214      *
    215      * @deprecated ICU 56
    216      */
    217     @Deprecated
    218     public static final int DONE = UCharacterIterator.DONE;
    219 
    220     /**
    221      * Constants for normalization modes.
    222      * <p>
    223      * The Mode class is not intended for public subclassing.
    224      * Only the Mode constants provided by the Normalizer class should be used,
    225      * and any fields or methods should not be called or overridden by users.
    226      *
    227      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    228      */
    229     @Deprecated
    230     public static abstract class Mode {
    231         /**
    232          * Sole constructor
    233          * @internal
    234          * @deprecated This API is ICU internal only.
    235          */
    236         @Deprecated
    237         protected Mode() {
    238         }
    239 
    240         /**
    241          * @internal
    242          * @deprecated This API is ICU internal only.
    243          */
    244         @Deprecated
    245         protected abstract Normalizer2 getNormalizer2(int options);
    246     }
    247 
    248     private static final class NONEMode extends Mode {
    249         @Override
    250         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
    251     }
    252     private static final class NFDMode extends Mode {
    253         @Override
    254         protected Normalizer2 getNormalizer2(int options) {
    255             return (options&UNICODE_3_2) != 0 ?
    256                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
    257         }
    258     }
    259     private static final class NFKDMode extends Mode {
    260         @Override
    261         protected Normalizer2 getNormalizer2(int options) {
    262             return (options&UNICODE_3_2) != 0 ?
    263                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
    264         }
    265     }
    266     private static final class NFCMode extends Mode {
    267         @Override
    268         protected Normalizer2 getNormalizer2(int options) {
    269             return (options&UNICODE_3_2) != 0 ?
    270                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
    271         }
    272     }
    273     private static final class NFKCMode extends Mode {
    274         @Override
    275         protected Normalizer2 getNormalizer2(int options) {
    276             return (options&UNICODE_3_2) != 0 ?
    277                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
    278         }
    279     }
    280     private static final class FCDMode extends Mode {
    281         @Override
    282         protected Normalizer2 getNormalizer2(int options) {
    283             return (options&UNICODE_3_2) != 0 ?
    284                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
    285         }
    286     }
    287 
    288     /**
    289      * No decomposition/composition.
    290      *
    291      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    292      */
    293     @Deprecated
    294     public static final Mode NONE = new NONEMode();
    295 
    296     /**
    297      * Canonical decomposition.
    298      *
    299      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    300      */
    301     @Deprecated
    302     public static final Mode NFD = new NFDMode();
    303 
    304     /**
    305      * Compatibility decomposition.
    306      *
    307      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    308      */
    309     @Deprecated
    310     public static final Mode NFKD = new NFKDMode();
    311 
    312     /**
    313      * Canonical decomposition followed by canonical composition.
    314      *
    315      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    316      */
    317     @Deprecated
    318     public static final Mode NFC = new NFCMode();
    319 
    320     /**
    321      * Default normalization.
    322      *
    323      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    324      */
    325     @Deprecated
    326     public static final Mode DEFAULT = NFC;
    327 
    328     /**
    329      * Compatibility decomposition followed by canonical composition.
    330      *
    331      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    332      */
    333     @Deprecated
    334     public static final Mode NFKC =new NFKCMode();
    335 
    336     /**
    337      * "Fast C or D" form.
    338      *
    339      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    340      */
    341     @Deprecated
    342     public static final Mode FCD = new FCDMode();
    343 
    344     /**
    345      * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
    346      * and the static {@link #normalize normalize} method.  This value tells
    347      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
    348      * from the underlying String or CharacterIterator.  If you have code which
    349      * requires raw text at some times and normalized text at others, you can
    350      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
    351      * than having a separate code path that bypasses <tt>Normalizer</tt>
    352      * altogether.
    353      * <p>
    354      * @see #setMode
    355      * @deprecated ICU 2.8. Use Normalizer.NONE
    356      * @see #NONE
    357      */
    358     @Deprecated
    359     public static final Mode NO_OP = NONE;
    360 
    361     /**
    362      * Canonical decomposition followed by canonical composition.  Used with the
    363      * {@link com.ibm.icu.text.Normalizer constructors} and the static
    364      * {@link #normalize normalize} method to determine the operation to be
    365      * performed.
    366      * <p>
    367      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
    368      * off, this operation produces output that is in
    369      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
    370      * Form</a>
    371      * <b>C</b>.
    372      * <p>
    373      * @see #setMode
    374      * @deprecated ICU 2.8. Use Normalizer.NFC
    375      * @see #NFC
    376      */
    377     @Deprecated
    378     public static final Mode COMPOSE = NFC;
    379 
    380     /**
    381      * Compatibility decomposition followed by canonical composition.
    382      * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
    383      * {@link #normalize normalize} method to determine the operation to be
    384      * performed.
    385      * <p>
    386      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
    387      * off, this operation produces output that is in
    388      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
    389      * Form</a>
    390      * <b>KC</b>.
    391      * <p>
    392      * @see #setMode
    393      * @deprecated ICU 2.8. Use Normalizer.NFKC
    394      * @see #NFKC
    395      */
    396     @Deprecated
    397     public static final Mode COMPOSE_COMPAT = NFKC;
    398 
    399     /**
    400      * Canonical decomposition.  This value is passed to the
    401      * {@link com.ibm.icu.text.Normalizer constructors} and the static
    402      * {@link #normalize normalize}
    403      * method to determine the operation to be performed.
    404      * <p>
    405      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
    406      * off, this operation produces output that is in
    407      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
    408      * Form</a>
    409      * <b>D</b>.
    410      * <p>
    411      * @see #setMode
    412      * @deprecated ICU 2.8. Use Normalizer.NFD
    413      * @see #NFD
    414      */
    415     @Deprecated
    416     public static final Mode DECOMP = NFD;
    417 
    418     /**
    419      * Compatibility decomposition.  This value is passed to the
    420      * {@link com.ibm.icu.text.Normalizer constructors} and the static
    421      * {@link #normalize normalize}
    422      * method to determine the operation to be performed.
    423      * <p>
    424      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
    425      * off, this operation produces output that is in
    426      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
    427      * Form</a>
    428      * <b>KD</b>.
    429      * <p>
    430      * @see #setMode
    431      * @deprecated ICU 2.8. Use Normalizer.NFKD
    432      * @see #NFKD
    433      */
    434     @Deprecated
    435     public static final Mode DECOMP_COMPAT = NFKD;
    436 
    437     /**
    438      * Option to disable Hangul/Jamo composition and decomposition.
    439      * This option applies to Korean text,
    440      * which can be represented either in the Jamo alphabet or in Hangul
    441      * characters, which are really just two or three Jamo combined
    442      * into one visual glyph.  Since Jamo takes up more storage space than
    443      * Hangul, applications that process only Hangul text may wish to turn
    444      * this option on when decomposing text.
    445      * <p>
    446      * The Unicode standard treats Hangul to Jamo conversion as a
    447      * canonical decomposition, so this option must be turned <b>off</b> if you
    448      * wish to transform strings into one of the standard
    449      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
    450      * Unicode Normalization Forms</a>.
    451      * <p>
    452      * @see #setOption
    453      * @deprecated ICU 2.8. This option is no longer supported.
    454      */
    455     @Deprecated
    456     public static final int IGNORE_HANGUL = 0x0001;
    457 
    458     /**
    459      * Result values for quickCheck().
    460      * For details see Unicode Technical Report 15.
    461      * @stable ICU 2.8
    462      */
    463     public static final class QuickCheckResult{
    464         //private int resultValue;
    465         private QuickCheckResult(int value) {
    466             //resultValue=value;
    467         }
    468     }
    469     /**
    470      * Indicates that string is not in the normalized format
    471      * @stable ICU 2.8
    472      */
    473     public static final QuickCheckResult NO = new QuickCheckResult(0);
    474 
    475     /**
    476      * Indicates that string is in the normalized format
    477      * @stable ICU 2.8
    478      */
    479     public static final QuickCheckResult YES = new QuickCheckResult(1);
    480 
    481     /**
    482      * Indicates it cannot be determined if string is in the normalized
    483      * format without further thorough checks.
    484      * @stable ICU 2.8
    485      */
    486     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
    487 
    488     /**
    489      * Option bit for compare:
    490      * Case sensitively compare the strings
    491      * @stable ICU 2.8
    492      */
    493     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
    494 
    495     /**
    496      * Option bit for compare:
    497      * Both input strings are assumed to fulfill FCD conditions.
    498      * @stable ICU 2.8
    499      */
    500     public static final int INPUT_IS_FCD    =      0x20000;
    501 
    502     /**
    503      * Option bit for compare:
    504      * Perform case-insensitive comparison.
    505      * @stable ICU 2.8
    506      */
    507     public static final int COMPARE_IGNORE_CASE  =     0x10000;
    508 
    509     /**
    510      * Option bit for compare:
    511      * Compare strings in code point order instead of code unit order.
    512      * @stable ICU 2.8
    513      */
    514     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
    515 
    516     /**
    517      * Option value for case folding:
    518      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
    519      * and dotless i appropriately for Turkic languages (tr, az).
    520      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
    521      * @stable ICU 2.8
    522      */
    523     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
    524 
    525     /**
    526      * Lowest-order bit number of compare() options bits corresponding to
    527      * normalization options bits.
    528      *
    529      * The options parameter for compare() uses most bits for
    530      * itself and for various comparison and folding flags.
    531      * The most significant bits, however, are shifted down and passed on
    532      * to the normalization implementation.
    533      * (That is, from compare(..., options, ...),
    534      * options&gt;&gt;COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
    535      * internal normalization functions.)
    536      *
    537      * @see #compare
    538      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    539      */
    540     @Deprecated
    541     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
    542 
    543     //-------------------------------------------------------------------------
    544     // Iterator constructors
    545     //-------------------------------------------------------------------------
    546 
    547     /**
    548      * Creates a new <tt>Normalizer</tt> object for iterating over the
    549      * normalized form of a given string.
    550      * <p>
    551      * The <tt>options</tt> parameter specifies which optional
    552      * <tt>Normalizer</tt> features are to be enabled for this object.
    553      * <p>
    554      * @param str  The string to be normalized.  The normalization
    555      *              will start at the beginning of the string.
    556      *
    557      * @param mode The normalization mode.
    558      *
    559      * @param opt Any optional features to be enabled.
    560      *            Currently the only available option is {@link #UNICODE_3_2}.
    561      *            If you want the default behavior corresponding to one of the
    562      *            standard Unicode Normalization Forms, use 0 for this argument.
    563      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    564      */
    565     @Deprecated
    566     public Normalizer(String str, Mode mode, int opt) {
    567         this.text = UCharacterIterator.getInstance(str);
    568         this.mode = mode;
    569         this.options=opt;
    570         norm2 = mode.getNormalizer2(opt);
    571         buffer = new StringBuilder();
    572     }
    573 
    574     /**
    575      * Creates a new <tt>Normalizer</tt> object for iterating over the
    576      * normalized form of the given text.
    577      * <p>
    578      * @param iter  The input text to be normalized.  The normalization
    579      *              will start at the beginning of the string.
    580      *
    581      * @param mode  The normalization mode.
    582      *
    583      * @param opt Any optional features to be enabled.
    584      *            Currently the only available option is {@link #UNICODE_3_2}.
    585      *            If you want the default behavior corresponding to one of the
    586      *            standard Unicode Normalization Forms, use 0 for this argument.
    587      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    588      */
    589     @Deprecated
    590     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    591         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
    592         this.mode = mode;
    593         this.options = opt;
    594         norm2 = mode.getNormalizer2(opt);
    595         buffer = new StringBuilder();
    596     }
    597 
    598     /**
    599      * Creates a new <tt>Normalizer</tt> object for iterating over the
    600      * normalized form of the given text.
    601      * <p>
    602      * @param iter  The input text to be normalized.  The normalization
    603      *              will start at the beginning of the string.
    604      *
    605      * @param mode  The normalization mode.
    606      * @param options The normalization options, ORed together (0 for no options).
    607      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    608      */
    609     @Deprecated
    610     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
    611         try {
    612             this.text     = (UCharacterIterator)iter.clone();
    613             this.mode     = mode;
    614             this.options  = options;
    615             norm2 = mode.getNormalizer2(options);
    616             buffer = new StringBuilder();
    617         } catch (CloneNotSupportedException e) {
    618             throw new ICUCloneNotSupportedException(e);
    619         }
    620     }
    621 
    622     /**
    623      * Clones this <tt>Normalizer</tt> object.  All properties of this
    624      * object are duplicated in the new object, including the cloning of any
    625      * {@link CharacterIterator} that was passed in to the constructor
    626      * or to {@link #setText(CharacterIterator) setText}.
    627      * However, the text storage underlying
    628      * the <tt>CharacterIterator</tt> is not duplicated unless the
    629      * iterator's <tt>clone</tt> method does so.
    630      *
    631      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    632      */
    633     @Deprecated
    634     @Override
    635     public Object clone() {
    636         try {
    637             Normalizer copy = (Normalizer) super.clone();
    638             copy.text = (UCharacterIterator) text.clone();
    639             copy.mode = mode;
    640             copy.options = options;
    641             copy.norm2 = norm2;
    642             copy.buffer = new StringBuilder(buffer);
    643             copy.bufferPos = bufferPos;
    644             copy.currentIndex = currentIndex;
    645             copy.nextIndex = nextIndex;
    646             return copy;
    647         }
    648         catch (CloneNotSupportedException e) {
    649             throw new ICUCloneNotSupportedException(e);
    650         }
    651     }
    652 
    653     //--------------------------------------------------------------------------
    654     // Static Utility methods
    655     //--------------------------------------------------------------------------
    656 
    657     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
    658         return (compat ? NFKC : NFC).getNormalizer2(options);
    659     }
    660     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
    661         return (compat ? NFKD : NFD).getNormalizer2(options);
    662     }
    663 
    664     /**
    665      * Compose a string.
    666      * The string will be composed according to the specified mode.
    667      * @param str        The string to compose.
    668      * @param compat     If true the string will be composed according to
    669      *                    NFKC rules and if false will be composed according to
    670      *                    NFC rules.
    671      * @return String    The composed string
    672      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    673      */
    674     @Deprecated
    675     public static String compose(String str, boolean compat) {
    676         return compose(str,compat,0);
    677     }
    678 
    679     /**
    680      * Compose a string.
    681      * The string will be composed according to the specified mode.
    682      * @param str        The string to compose.
    683      * @param compat     If true the string will be composed according to
    684      *                    NFKC rules and if false will be composed according to
    685      *                    NFC rules.
    686      * @param options    The only recognized option is UNICODE_3_2
    687      * @return String    The composed string
    688      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    689      */
    690     @Deprecated
    691     public static String compose(String str, boolean compat, int options) {
    692         return getComposeNormalizer2(compat, options).normalize(str);
    693     }
    694 
    695     /**
    696      * Compose a string.
    697      * The string will be composed to according to the specified mode.
    698      * @param source The char array to compose.
    699      * @param target A char buffer to receive the normalized text.
    700      * @param compat If true the char array will be composed according to
    701      *                NFKC rules and if false will be composed according to
    702      *                NFC rules.
    703      * @param options The normalization options, ORed together (0 for no options).
    704      * @return int   The total buffer size needed;if greater than length of
    705      *                result, the output was truncated.
    706      * @exception IndexOutOfBoundsException if target.length is less than the
    707      *             required length
    708      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    709      */
    710     @Deprecated
    711     public static int compose(char[] source,char[] target, boolean compat, int options) {
    712         return compose(source, 0, source.length, target, 0, target.length, compat, options);
    713     }
    714 
    715     /**
    716      * Compose a string.
    717      * The string will be composed to according to the specified mode.
    718      * @param src       The char array to compose.
    719      * @param srcStart  Start index of the source
    720      * @param srcLimit  Limit index of the source
    721      * @param dest      The char buffer to fill in
    722      * @param destStart Start index of the destination buffer
    723      * @param destLimit End index of the destination buffer
    724      * @param compat If true the char array will be composed according to
    725      *                NFKC rules and if false will be composed according to
    726      *                NFC rules.
    727      * @param options The normalization options, ORed together (0 for no options).
    728      * @return int   The total buffer size needed;if greater than length of
    729      *                result, the output was truncated.
    730      * @exception IndexOutOfBoundsException if target.length is less than the
    731      *             required length
    732      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    733      */
    734     @Deprecated
    735     public static int compose(char[] src,int srcStart, int srcLimit,
    736                               char[] dest,int destStart, int destLimit,
    737                               boolean compat, int options) {
    738         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    739         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    740         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
    741         return app.length();
    742     }
    743 
    744     /**
    745      * Decompose a string.
    746      * The string will be decomposed to according to the specified mode.
    747      * @param str       The string to decompose.
    748      * @param compat    If true the string will be decomposed according to NFKD
    749      *                   rules and if false will be decomposed according to NFD
    750      *                   rules.
    751      * @return String   The decomposed string
    752      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    753      */
    754     @Deprecated
    755     public static String decompose(String str, boolean compat) {
    756         return decompose(str,compat,0);
    757     }
    758 
    759     /**
    760      * Decompose a string.
    761      * The string will be decomposed to according to the specified mode.
    762      * @param str     The string to decompose.
    763      * @param compat  If true the string will be decomposed according to NFKD
    764      *                 rules and if false will be decomposed according to NFD
    765      *                 rules.
    766      * @param options The normalization options, ORed together (0 for no options).
    767      * @return String The decomposed string
    768      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    769      */
    770     @Deprecated
    771     public static String decompose(String str, boolean compat, int options) {
    772         return getDecomposeNormalizer2(compat, options).normalize(str);
    773     }
    774 
    775     /**
    776      * Decompose a string.
    777      * The string will be decomposed to according to the specified mode.
    778      * @param source The char array to decompose.
    779      * @param target A char buffer to receive the normalized text.
    780      * @param compat If true the char array will be decomposed according to NFKD
    781      *                rules and if false will be decomposed according to
    782      *                NFD rules.
    783      * @return int   The total buffer size needed;if greater than length of
    784      *                result,the output was truncated.
    785      * @param options The normalization options, ORed together (0 for no options).
    786      * @exception IndexOutOfBoundsException if the target capacity is less than
    787      *             the required length
    788      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    789      */
    790     @Deprecated
    791     public static int decompose(char[] source,char[] target, boolean compat, int options) {
    792         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
    793     }
    794 
    795     /**
    796      * Decompose a string.
    797      * The string will be decomposed to according to the specified mode.
    798      * @param src       The char array to compose.
    799      * @param srcStart  Start index of the source
    800      * @param srcLimit  Limit index of the source
    801      * @param dest      The char buffer to fill in
    802      * @param destStart Start index of the destination buffer
    803      * @param destLimit End index of the destination buffer
    804      * @param compat If true the char array will be decomposed according to NFKD
    805      *                rules and if false will be decomposed according to
    806      *                NFD rules.
    807      * @param options The normalization options, ORed together (0 for no options).
    808      * @return int   The total buffer size needed;if greater than length of
    809      *                result,the output was truncated.
    810      * @exception IndexOutOfBoundsException if the target capacity is less than
    811      *             the required length
    812      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    813      */
    814     @Deprecated
    815     public static int decompose(char[] src,int srcStart, int srcLimit,
    816                                 char[] dest,int destStart, int destLimit,
    817                                 boolean compat, int options) {
    818         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    819         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    820         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
    821         return app.length();
    822     }
    823 
    824     /**
    825      * Normalizes a <tt>String</tt> using the given normalization operation.
    826      * <p>
    827      * The <tt>options</tt> parameter specifies which optional
    828      * <tt>Normalizer</tt> features are to be enabled for this operation.
    829      * Currently the only available option is {@link #UNICODE_3_2}.
    830      * If you want the default behavior corresponding to one of the standard
    831      * Unicode Normalization Forms, use 0 for this argument.
    832      * <p>
    833      * @param str       the input string to be normalized.
    834      * @param mode      the normalization mode
    835      * @param options   the optional features to be enabled.
    836      * @return String   the normalized string
    837      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    838      */
    839     @Deprecated
    840     public static String normalize(String str, Mode mode, int options) {
    841         return mode.getNormalizer2(options).normalize(str);
    842     }
    843 
    844     /**
    845      * Normalize a string.
    846      * The string will be normalized according to the specified normalization
    847      * mode and options.
    848      * @param src        The string to normalize.
    849      * @param mode       The normalization mode; one of Normalizer.NONE,
    850      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
    851      *                    Normalizer.NFKD, Normalizer.DEFAULT
    852      * @return the normalized string
    853      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    854      */
    855     @Deprecated
    856     public static String normalize(String src,Mode mode) {
    857         return normalize(src, mode, 0);
    858     }
    859     /**
    860      * Normalize a string.
    861      * The string will be normalized according to the specified normalization
    862      * mode and options.
    863      * @param source The char array to normalize.
    864      * @param target A char buffer to receive the normalized text.
    865      * @param mode   The normalization mode; one of Normalizer.NONE,
    866      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
    867      *                Normalizer.NFKD, Normalizer.DEFAULT
    868      * @param options The normalization options, ORed together (0 for no options).
    869      * @return int   The total buffer size needed;if greater than length of
    870      *                result, the output was truncated.
    871      * @exception    IndexOutOfBoundsException if the target capacity is less
    872      *                than the required length
    873      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    874      */
    875     @Deprecated
    876     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
    877         return normalize(source,0,source.length,target,0,target.length,mode, options);
    878     }
    879 
    880     /**
    881      * Normalize a string.
    882      * The string will be normalized according to the specified normalization
    883      * mode and options.
    884      * @param src       The char array to compose.
    885      * @param srcStart  Start index of the source
    886      * @param srcLimit  Limit index of the source
    887      * @param dest      The char buffer to fill in
    888      * @param destStart Start index of the destination buffer
    889      * @param destLimit End index of the destination buffer
    890      * @param mode      The normalization mode; one of Normalizer.NONE,
    891      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
    892      *                   Normalizer.NFKD, Normalizer.DEFAULT
    893      * @param options The normalization options, ORed together (0 for no options).
    894      * @return int      The total buffer size needed;if greater than length of
    895      *                   result, the output was truncated.
    896      * @exception       IndexOutOfBoundsException if the target capacity is
    897      *                   less than the required length
    898      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    899      */
    900     @Deprecated
    901     public static int normalize(char[] src,int srcStart, int srcLimit,
    902                                 char[] dest,int destStart, int destLimit,
    903                                 Mode  mode, int options) {
    904         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    905         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    906         mode.getNormalizer2(options).normalize(srcBuffer, app);
    907         return app.length();
    908     }
    909 
    910     /**
    911      * Normalize a codepoint according to the given mode
    912      * @param char32    The input string to be normalized.
    913      * @param mode      The normalization mode
    914      * @param options   Options for use with exclusion set and tailored Normalization
    915      *                                   The only option that is currently recognized is UNICODE_3_2
    916      * @return String   The normalized string
    917      * @see #UNICODE_3_2
    918      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    919      */
    920     @Deprecated
    921     public static String normalize(int char32, Mode mode, int options) {
    922         if(mode == NFD && options == 0) {
    923             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
    924             if(decomposition == null) {
    925                 decomposition = UTF16.valueOf(char32);
    926             }
    927             return decomposition;
    928         }
    929         return normalize(UTF16.valueOf(char32), mode, options);
    930     }
    931 
    932     /**
    933      * Convenience method to normalize a codepoint according to the given mode
    934      * @param char32    The input string to be normalized.
    935      * @param mode      The normalization mode
    936      * @return String   The normalized string
    937      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    938      */
    939     @Deprecated
    940     public static String normalize(int char32, Mode mode) {
    941         return normalize(char32, mode, 0);
    942     }
    943 
    944     /**
    945      * Convenience method.
    946      *
    947      * @param source   string for determining if it is in a normalized format
    948      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
    949      *                  Normalizer.NFKC,Normalizer.NFKD)
    950      * @return         Return code to specify if the text is normalized or not
    951      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
    952      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    953      */
    954     @Deprecated
    955     public static QuickCheckResult quickCheck(String source, Mode mode) {
    956         return quickCheck(source, mode, 0);
    957     }
    958 
    959     /**
    960      * Performing quick check on a string, to quickly determine if the string is
    961      * in a particular normalization format.
    962      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
    963      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
    964      * string is in the desired normalized format, Normalizer.NO determines that
    965      * argument string is not in the desired normalized format. A
    966      * Normalizer.MAYBE result indicates that a more thorough check is required,
    967      * the user may have to put the string in its normalized form and compare
    968      * the results.
    969      *
    970      * @param source   string for determining if it is in a normalized format
    971      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
    972      *                  Normalizer.NFKC,Normalizer.NFKD)
    973      * @param options   Options for use with exclusion set and tailored Normalization
    974      *                                   The only option that is currently recognized is UNICODE_3_2
    975      * @return         Return code to specify if the text is normalized or not
    976      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
    977      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    978      */
    979     @Deprecated
    980     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
    981         return mode.getNormalizer2(options).quickCheck(source);
    982     }
    983 
    984     /**
    985      * Convenience method.
    986      *
    987      * @param source Array of characters for determining if it is in a
    988      *                normalized format
    989      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
    990      *                Normalizer.NFKC,Normalizer.NFKD)
    991      * @param options   Options for use with exclusion set and tailored Normalization
    992      *                                   The only option that is currently recognized is UNICODE_3_2
    993      * @return       Return code to specify if the text is normalized or not
    994      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
    995      * @deprecated ICU 56 Use {@link Normalizer2} instead.
    996      */
    997     @Deprecated
    998     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
    999         return quickCheck(source, 0, source.length, mode, options);
   1000     }
   1001 
   1002     /**
   1003      * Performing quick check on a string, to quickly determine if the string is
   1004      * in a particular normalization format.
   1005      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
   1006      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
   1007      * string is in the desired normalized format, Normalizer.NO determines that
   1008      * argument string is not in the desired normalized format. A
   1009      * Normalizer.MAYBE result indicates that a more thorough check is required,
   1010      * the user may have to put the string in its normalized form and compare
   1011      * the results.
   1012      *
   1013      * @param source    string for determining if it is in a normalized format
   1014      * @param start     the start index of the source
   1015      * @param limit     the limit index of the source it is equal to the length
   1016      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
   1017      *                   Normalizer.NFKC,Normalizer.NFKD)
   1018      * @param options   Options for use with exclusion set and tailored Normalization
   1019      *                                   The only option that is currently recognized is UNICODE_3_2
   1020      * @return          Return code to specify if the text is normalized or not
   1021      *                   (Normalizer.YES, Normalizer.NO or
   1022      *                   Normalizer.MAYBE)
   1023      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1024      */
   1025     @Deprecated
   1026     public static QuickCheckResult quickCheck(char[] source,int start,
   1027                                               int limit, Mode mode,int options) {
   1028         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
   1029         return mode.getNormalizer2(options).quickCheck(srcBuffer);
   1030     }
   1031 
   1032     /**
   1033      * Test if a string is in a given normalization form.
   1034      * This is semantically equivalent to source.equals(normalize(source, mode)).
   1035      *
   1036      * Unlike quickCheck(), this function returns a definitive result,
   1037      * never a "maybe".
   1038      * For NFD, NFKD, and FCD, both functions work exactly the same.
   1039      * For NFC and NFKC where quickCheck may return "maybe", this function will
   1040      * perform further tests to arrive at a true/false result.
   1041      * @param src       The input array of characters to be checked to see if
   1042      *                   it is normalized
   1043      * @param start     The strart index in the source
   1044      * @param limit     The limit index in the source
   1045      * @param mode      the normalization mode
   1046      * @param options   Options for use with exclusion set and tailored Normalization
   1047      *                                   The only option that is currently recognized is UNICODE_3_2
   1048      * @return Boolean value indicating whether the source string is in the
   1049      *         "mode" normalization form
   1050      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1051      */
   1052     @Deprecated
   1053     public static boolean isNormalized(char[] src,int start,
   1054                                        int limit, Mode mode,
   1055                                        int options) {
   1056         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
   1057         return mode.getNormalizer2(options).isNormalized(srcBuffer);
   1058     }
   1059 
   1060     /**
   1061      * Test if a string is in a given normalization form.
   1062      * This is semantically equivalent to source.equals(normalize(source, mode)).
   1063      *
   1064      * Unlike quickCheck(), this function returns a definitive result,
   1065      * never a "maybe".
   1066      * For NFD, NFKD, and FCD, both functions work exactly the same.
   1067      * For NFC and NFKC where quickCheck may return "maybe", this function will
   1068      * perform further tests to arrive at a true/false result.
   1069      * @param str       the input string to be checked to see if it is
   1070      *                   normalized
   1071      * @param mode      the normalization mode
   1072      * @param options   Options for use with exclusion set and tailored Normalization
   1073      *                  The only option that is currently recognized is UNICODE_3_2
   1074      * @see #isNormalized
   1075      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1076      */
   1077     @Deprecated
   1078     public static boolean isNormalized(String str, Mode mode, int options) {
   1079         return mode.getNormalizer2(options).isNormalized(str);
   1080     }
   1081 
   1082     /**
   1083      * Convenience Method
   1084      * @param char32    the input code point to be checked to see if it is
   1085      *                   normalized
   1086      * @param mode      the normalization mode
   1087      * @param options   Options for use with exclusion set and tailored Normalization
   1088      *                  The only option that is currently recognized is UNICODE_3_2
   1089      *
   1090      * @see #isNormalized
   1091      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1092      */
   1093     @Deprecated
   1094     public static boolean isNormalized(int char32, Mode mode,int options) {
   1095         return isNormalized(UTF16.valueOf(char32), mode, options);
   1096     }
   1097 
   1098     /**
   1099      * Compare two strings for canonical equivalence.
   1100      * Further options include case-insensitive comparison and
   1101      * code point order (as opposed to code unit order).
   1102      *
   1103      * Canonical equivalence between two strings is defined as their normalized
   1104      * forms (NFD or NFC) being identical.
   1105      * This function compares strings incrementally instead of normalizing
   1106      * (and optionally case-folding) both strings entirely,
   1107      * improving performance significantly.
   1108      *
   1109      * Bulk normalization is only necessary if the strings do not fulfill the
   1110      * FCD conditions. Only in this case, and only if the strings are relatively
   1111      * long, is memory allocated temporarily.
   1112      * For FCD strings and short non-FCD strings there is no memory allocation.
   1113      *
   1114      * Semantically, this is equivalent to
   1115      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
   1116      * where code point order and foldCase are all optional.
   1117      *
   1118      * @param s1        First source character array.
   1119      * @param s1Start   start index of source
   1120      * @param s1Limit   limit of the source
   1121      *
   1122      * @param s2        Second source character array.
   1123      * @param s2Start   start index of the source
   1124      * @param s2Limit   limit of the source
   1125      *
   1126      * @param options A bit set of options:
   1127      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
   1128      *     Case-sensitive comparison in code unit order, and the input strings
   1129      *     are quick-checked for FCD.
   1130      *
   1131      *   - INPUT_IS_FCD
   1132      *     Set if the caller knows that both s1 and s2 fulfill the FCD
   1133      *     conditions.If not set, the function will quickCheck for FCD
   1134      *     and normalize if necessary.
   1135      *
   1136      *   - COMPARE_CODE_POINT_ORDER
   1137      *     Set to choose code point order instead of code unit order
   1138      *
   1139      *   - COMPARE_IGNORE_CASE
   1140      *     Set to compare strings case-insensitively using case folding,
   1141      *     instead of case-sensitively.
   1142      *     If set, then the following case folding options are used.
   1143      *
   1144      *
   1145      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
   1146      *
   1147      * @see #normalize
   1148      * @see #FCD
   1149      * @stable ICU 2.8
   1150      */
   1151     public static int compare(char[] s1, int s1Start, int s1Limit,
   1152                               char[] s2, int s2Start, int s2Limit,
   1153                               int options) {
   1154         if( s1==null || s1Start<0 || s1Limit<0 ||
   1155             s2==null || s2Start<0 || s2Limit<0 ||
   1156             s1Limit<s1Start || s2Limit<s2Start
   1157         ) {
   1158             throw new IllegalArgumentException();
   1159         }
   1160         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
   1161                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
   1162                                options);
   1163     }
   1164 
   1165     /**
   1166      * Compare two strings for canonical equivalence.
   1167      * Further options include case-insensitive comparison and
   1168      * code point order (as opposed to code unit order).
   1169      *
   1170      * Canonical equivalence between two strings is defined as their normalized
   1171      * forms (NFD or NFC) being identical.
   1172      * This function compares strings incrementally instead of normalizing
   1173      * (and optionally case-folding) both strings entirely,
   1174      * improving performance significantly.
   1175      *
   1176      * Bulk normalization is only necessary if the strings do not fulfill the
   1177      * FCD conditions. Only in this case, and only if the strings are relatively
   1178      * long, is memory allocated temporarily.
   1179      * For FCD strings and short non-FCD strings there is no memory allocation.
   1180      *
   1181      * Semantically, this is equivalent to
   1182      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
   1183      * where code point order and foldCase are all optional.
   1184      *
   1185      * @param s1 First source string.
   1186      * @param s2 Second source string.
   1187      *
   1188      * @param options A bit set of options:
   1189      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
   1190      *     Case-sensitive comparison in code unit order, and the input strings
   1191      *     are quick-checked for FCD.
   1192      *
   1193      *   - INPUT_IS_FCD
   1194      *     Set if the caller knows that both s1 and s2 fulfill the FCD
   1195      *     conditions. If not set, the function will quickCheck for FCD
   1196      *     and normalize if necessary.
   1197      *
   1198      *   - COMPARE_CODE_POINT_ORDER
   1199      *     Set to choose code point order instead of code unit order
   1200      *
   1201      *   - COMPARE_IGNORE_CASE
   1202      *     Set to compare strings case-insensitively using case folding,
   1203      *     instead of case-sensitively.
   1204      *     If set, then the following case folding options are used.
   1205      *
   1206      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
   1207      *
   1208      * @see #normalize
   1209      * @see #FCD
   1210      * @stable ICU 2.8
   1211      */
   1212     public static int compare(String s1, String s2, int options) {
   1213         return internalCompare(s1, s2, options);
   1214     }
   1215 
   1216     /**
   1217      * Compare two strings for canonical equivalence.
   1218      * Further options include case-insensitive comparison and
   1219      * code point order (as opposed to code unit order).
   1220      * Convenience method.
   1221      *
   1222      * @param s1 First source string.
   1223      * @param s2 Second source string.
   1224      *
   1225      * @param options A bit set of options:
   1226      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
   1227      *     Case-sensitive comparison in code unit order, and the input strings
   1228      *     are quick-checked for FCD.
   1229      *
   1230      *   - INPUT_IS_FCD
   1231      *     Set if the caller knows that both s1 and s2 fulfill the FCD
   1232      *     conditions. If not set, the function will quickCheck for FCD
   1233      *     and normalize if necessary.
   1234      *
   1235      *   - COMPARE_CODE_POINT_ORDER
   1236      *     Set to choose code point order instead of code unit order
   1237      *
   1238      *   - COMPARE_IGNORE_CASE
   1239      *     Set to compare strings case-insensitively using case folding,
   1240      *     instead of case-sensitively.
   1241      *     If set, then the following case folding options are used.
   1242      *
   1243      * @return &lt;0 or 0 or &gt;0 as usual for string comparisons
   1244      *
   1245      * @see #normalize
   1246      * @see #FCD
   1247      * @stable ICU 2.8
   1248      */
   1249     public static int compare(char[] s1, char[] s2, int options) {
   1250         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
   1251     }
   1252 
   1253     /**
   1254      * Convenience method that can have faster implementation
   1255      * by not allocating buffers.
   1256      * @param char32a    the first code point to be checked against the
   1257      * @param char32b    the second code point
   1258      * @param options    A bit set of options
   1259      * @stable ICU 2.8
   1260      */
   1261     public static int compare(int char32a, int char32b, int options) {
   1262         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
   1263     }
   1264 
   1265     /**
   1266      * Convenience method that can have faster implementation
   1267      * by not allocating buffers.
   1268      * @param char32a   the first code point to be checked against
   1269      * @param str2      the second string
   1270      * @param options   A bit set of options
   1271      * @stable ICU 2.8
   1272      */
   1273     public static int compare(int char32a, String str2, int options) {
   1274         return internalCompare(UTF16.valueOf(char32a), str2, options);
   1275     }
   1276 
   1277     /* Concatenation of normalized strings --------------------------------- */
   1278     /**
   1279      * Concatenate normalized strings, making sure that the result is normalized
   1280      * as well.
   1281      *
   1282      * If both the left and the right strings are in
   1283      * the normalization form according to "mode",
   1284      * then the result will be
   1285      *
   1286      * <code>
   1287      *     dest=normalize(left+right, mode)
   1288      * </code>
   1289      *
   1290      * With the input strings already being normalized,
   1291      * this function will use next() and previous()
   1292      * to find the adjacent end pieces of the input strings.
   1293      * Only the concatenation of these end pieces will be normalized and
   1294      * then concatenated with the remaining parts of the input strings.
   1295      *
   1296      * It is allowed to have dest==left to avoid copying the entire left string.
   1297      *
   1298      * @param left Left source array, may be same as dest.
   1299      * @param leftStart start in the left array.
   1300      * @param leftLimit limit in the left array (==length)
   1301      * @param right Right source array.
   1302      * @param rightStart start in the right array.
   1303      * @param rightLimit limit in the right array (==length)
   1304      * @param dest The output buffer; can be null if destStart==destLimit==0
   1305      *              for pure preflighting.
   1306      * @param destStart start in the destination array
   1307      * @param destLimit limit in the destination array (==length)
   1308      * @param mode The normalization mode.
   1309      * @param options The normalization options, ORed together (0 for no options).
   1310      * @return Length of output (number of chars) when successful or
   1311      *          IndexOutOfBoundsException
   1312      * @exception IndexOutOfBoundsException whose message has the string
   1313      *             representation of destination capacity required.
   1314      * @see #normalize
   1315      * @see #next
   1316      * @see #previous
   1317      * @exception IndexOutOfBoundsException if target capacity is less than the
   1318      *             required length
   1319      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1320      */
   1321     @Deprecated
   1322     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
   1323                                   char[] right, int rightStart, int rightLimit,
   1324                                   char[] dest,  int destStart,  int destLimit,
   1325                                   Normalizer.Mode mode, int options) {
   1326         if(dest == null) {
   1327             throw new IllegalArgumentException();
   1328         }
   1329 
   1330         /* check for overlapping right and destination */
   1331         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
   1332             throw new IllegalArgumentException("overlapping right and dst ranges");
   1333         }
   1334 
   1335         /* allow left==dest */
   1336         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
   1337         destBuilder.append(left, leftStart, leftLimit-leftStart);
   1338         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
   1339         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
   1340         int destLength=destBuilder.length();
   1341         if(destLength<=(destLimit-destStart)) {
   1342             destBuilder.getChars(0, destLength, dest, destStart);
   1343             return destLength;
   1344         } else {
   1345             throw new IndexOutOfBoundsException(Integer.toString(destLength));
   1346         }
   1347     }
   1348 
   1349     /**
   1350      * Concatenate normalized strings, making sure that the result is normalized
   1351      * as well.
   1352      *
   1353      * If both the left and the right strings are in
   1354      * the normalization form according to "mode",
   1355      * then the result will be
   1356      *
   1357      * <code>
   1358      *     dest=normalize(left+right, mode)
   1359      * </code>
   1360      *
   1361      * For details see concatenate
   1362      *
   1363      * @param left Left source string.
   1364      * @param right Right source string.
   1365      * @param mode The normalization mode.
   1366      * @param options The normalization options, ORed together (0 for no options).
   1367      * @return result
   1368      *
   1369      * @see #concatenate
   1370      * @see #normalize
   1371      * @see #next
   1372      * @see #previous
   1373      * @see #concatenate
   1374      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1375      */
   1376     @Deprecated
   1377     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
   1378         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
   1379         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
   1380     }
   1381 
   1382     /**
   1383      * Concatenate normalized strings, making sure that the result is normalized
   1384      * as well.
   1385      *
   1386      * If both the left and the right strings are in
   1387      * the normalization form according to "mode",
   1388      * then the result will be
   1389      *
   1390      * <code>
   1391      *     dest=normalize(left+right, mode)
   1392      * </code>
   1393      *
   1394      * With the input strings already being normalized,
   1395      * this function will use next() and previous()
   1396      * to find the adjacent end pieces of the input strings.
   1397      * Only the concatenation of these end pieces will be normalized and
   1398      * then concatenated with the remaining parts of the input strings.
   1399      *
   1400      * @param left Left source string.
   1401      * @param right Right source string.
   1402      * @param mode The normalization mode.
   1403      * @param options The normalization options, ORed together (0 for no options).
   1404      * @return result
   1405      *
   1406      * @see #concatenate
   1407      * @see #normalize
   1408      * @see #next
   1409      * @see #previous
   1410      * @see #concatenate
   1411      * @deprecated ICU 56 Use {@link Normalizer2} instead.
   1412      */
   1413     @Deprecated
   1414     public static String concatenate(String left, String right, Mode mode, int options) {
   1415         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
   1416         return mode.getNormalizer2(options).append(dest, right).toString();
   1417     }
   1418 
   1419     /**
   1420      * Gets the FC_NFKC closure value.
   1421      * @param c The code point whose closure value is to be retrieved
   1422      * @param dest The char array to receive the closure value
   1423      * @return the length of the closure value; 0 if there is none
   1424      * @deprecated ICU 56
   1425      */
   1426     @Deprecated
   1427     public static int getFC_NFKC_Closure(int c,char[] dest) {
   1428         String closure=getFC_NFKC_Closure(c);
   1429         int length=closure.length();
   1430         if(length!=0 && dest!=null && length<=dest.length) {
   1431             closure.getChars(0, length, dest, 0);
   1432         }
   1433         return length;
   1434     }
   1435     /**
   1436      * Gets the FC_NFKC closure value.
   1437      * @param c The code point whose closure value is to be retrieved
   1438      * @return String representation of the closure value; "" if there is none
   1439      * @deprecated ICU 56
   1440      */
   1441     @Deprecated
   1442     public static String getFC_NFKC_Closure(int c) {
   1443         // Compute the FC_NFKC_Closure on the fly:
   1444         // We have the API for complete coverage of Unicode properties, although
   1445         // this value by itself is not useful via API.
   1446         // (What could be useful is a custom normalization table that combines
   1447         // case folding and NFKC.)
   1448         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
   1449         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
   1450         UCaseProps csp=UCaseProps.INSTANCE;
   1451         // first: b = NFKC(Fold(a))
   1452         StringBuilder folded=new StringBuilder();
   1453         int folded1Length=csp.toFullFolding(c, folded, 0);
   1454         if(folded1Length<0) {
   1455             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
   1456             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
   1457                 return "";  // c does not change at all under CaseFolding+NFKC
   1458             }
   1459             folded.appendCodePoint(c);
   1460         } else {
   1461             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
   1462                 folded.appendCodePoint(folded1Length);
   1463             }
   1464         }
   1465         String kc1=nfkc.normalize(folded);
   1466         // second: c = NFKC(Fold(b))
   1467         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
   1468         // if (c != b) add the mapping from a to c
   1469         if(kc1.equals(kc2)) {
   1470             return "";
   1471         } else {
   1472             return kc2;
   1473         }
   1474     }
   1475 
   1476     //-------------------------------------------------------------------------
   1477     // Iteration API
   1478     //-------------------------------------------------------------------------
   1479 
   1480     /**
   1481      * Return the current character in the normalized text.
   1482      * @return The codepoint as an int
   1483      * @deprecated ICU 56
   1484      */
   1485     @Deprecated
   1486     public int current() {
   1487         if(bufferPos<buffer.length() || nextNormalize()) {
   1488             return buffer.codePointAt(bufferPos);
   1489         } else {
   1490             return DONE;
   1491         }
   1492     }
   1493 
   1494     /**
   1495      * Return the next character in the normalized text and advance
   1496      * the iteration position by one.  If the end
   1497      * of the text has already been reached, {@link #DONE} is returned.
   1498      * @return The codepoint as an int
   1499      * @deprecated ICU 56
   1500      */
   1501     @Deprecated
   1502     public int next() {
   1503         if(bufferPos<buffer.length() ||  nextNormalize()) {
   1504             int c=buffer.codePointAt(bufferPos);
   1505             bufferPos+=Character.charCount(c);
   1506             return c;
   1507         } else {
   1508             return DONE;
   1509         }
   1510     }
   1511 
   1512 
   1513     /**
   1514      * Return the previous character in the normalized text and decrement
   1515      * the iteration position by one.  If the beginning
   1516      * of the text has already been reached, {@link #DONE} is returned.
   1517      * @return The codepoint as an int
   1518      * @deprecated ICU 56
   1519      */
   1520     @Deprecated
   1521     public int previous() {
   1522         if(bufferPos>0 || previousNormalize()) {
   1523             int c=buffer.codePointBefore(bufferPos);
   1524             bufferPos-=Character.charCount(c);
   1525             return c;
   1526         } else {
   1527             return DONE;
   1528         }
   1529     }
   1530 
   1531     /**
   1532      * Reset the index to the beginning of the text.
   1533      * This is equivalent to setIndexOnly(startIndex)).
   1534      * @deprecated ICU 56
   1535      */
   1536     @Deprecated
   1537     public void reset() {
   1538         text.setToStart();
   1539         currentIndex=nextIndex=0;
   1540         clearBuffer();
   1541     }
   1542 
   1543     /**
   1544      * Set the iteration position in the input text that is being normalized,
   1545      * without any immediate normalization.
   1546      * After setIndexOnly(), getIndex() will return the same index that is
   1547      * specified here.
   1548      *
   1549      * @param index the desired index in the input text.
   1550      * @deprecated ICU 56
   1551      */
   1552     @Deprecated
   1553     public void setIndexOnly(int index) {
   1554         text.setIndex(index);  // validates index
   1555         currentIndex=nextIndex=index;
   1556         clearBuffer();
   1557     }
   1558 
   1559     /**
   1560      * Set the iteration position in the input text that is being normalized
   1561      * and return the first normalized character at that position.
   1562      * <p>
   1563      * <b>Note:</b> This method sets the position in the <em>input</em> text,
   1564      * while {@link #next} and {@link #previous} iterate through characters
   1565      * in the normalized <em>output</em>.  This means that there is not
   1566      * necessarily a one-to-one correspondence between characters returned
   1567      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
   1568      * returned from <tt>setIndex</tt> and {@link #getIndex}.
   1569      * <p>
   1570      * @param index the desired index in the input text.
   1571      *
   1572      * @return   the first normalized character that is the result of iterating
   1573      *            forward starting at the given index.
   1574      *
   1575      * @throws IllegalArgumentException if the given index is less than
   1576      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
   1577      * @deprecated ICU 3.2
   1578      * @obsolete ICU 3.2
   1579      */
   1580     @Deprecated
   1581      ///CLOVER:OFF
   1582      public int setIndex(int index) {
   1583          setIndexOnly(index);
   1584          return current();
   1585      }
   1586      ///CLOVER:ON
   1587     /**
   1588      * Retrieve the index of the start of the input text. This is the begin
   1589      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
   1590      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
   1591      * @deprecated ICU 2.2. Use startIndex() instead.
   1592      * @return The codepoint as an int
   1593      * @see #startIndex
   1594      */
   1595     @Deprecated
   1596     public int getBeginIndex() {
   1597         return 0;
   1598     }
   1599 
   1600     /**
   1601      * Retrieve the index of the end of the input text.  This is the end index
   1602      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
   1603      * over which this <tt>Normalizer</tt> is iterating
   1604      * @deprecated ICU 2.2. Use endIndex() instead.
   1605      * @return The codepoint as an int
   1606      * @see #endIndex
   1607      */
   1608     @Deprecated
   1609     public int getEndIndex() {
   1610         return endIndex();
   1611     }
   1612     /**
   1613      * Return the first character in the normalized text.  This resets
   1614      * the <tt>Normalizer's</tt> position to the beginning of the text.
   1615      * @return The codepoint as an int
   1616      * @deprecated ICU 56
   1617      */
   1618     @Deprecated
   1619     public int first() {
   1620         reset();
   1621         return next();
   1622     }
   1623 
   1624     /**
   1625      * Return the last character in the normalized text.  This resets
   1626      * the <tt>Normalizer's</tt> position to be just before the
   1627      * the input text corresponding to that normalized character.
   1628      * @return The codepoint as an int
   1629      * @deprecated ICU 56
   1630      */
   1631     @Deprecated
   1632     public int last() {
   1633         text.setToLimit();
   1634         currentIndex=nextIndex=text.getIndex();
   1635         clearBuffer();
   1636         return previous();
   1637     }
   1638 
   1639     /**
   1640      * Retrieve the current iteration position in the input text that is
   1641      * being normalized.  This method is useful in applications such as
   1642      * searching, where you need to be able to determine the position in
   1643      * the input text that corresponds to a given normalized output character.
   1644      * <p>
   1645      * <b>Note:</b> This method sets the position in the <em>input</em>, while
   1646      * {@link #next} and {@link #previous} iterate through characters in the
   1647      * <em>output</em>.  This means that there is not necessarily a one-to-one
   1648      * correspondence between characters returned by <tt>next</tt> and
   1649      * <tt>previous</tt> and the indices passed to and returned from
   1650      * <tt>setIndex</tt> and {@link #getIndex}.
   1651      * @return The current iteration position
   1652      * @deprecated ICU 56
   1653      */
   1654     @Deprecated
   1655     public int getIndex() {
   1656         if(bufferPos<buffer.length()) {
   1657             return currentIndex;
   1658         } else {
   1659             return nextIndex;
   1660         }
   1661     }
   1662 
   1663     /**
   1664      * Retrieve the index of the start of the input text. This is the begin
   1665      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
   1666      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
   1667      * @return The current iteration position
   1668      * @deprecated ICU 56
   1669      */
   1670     @Deprecated
   1671     public int startIndex() {
   1672         return 0;
   1673     }
   1674 
   1675     /**
   1676      * Retrieve the index of the end of the input text.  This is the end index
   1677      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
   1678      * over which this <tt>Normalizer</tt> is iterating
   1679      * @return The current iteration position
   1680      * @deprecated ICU 56
   1681      */
   1682     @Deprecated
   1683     public int endIndex() {
   1684         return text.getLength();
   1685     }
   1686 
   1687     //-------------------------------------------------------------------------
   1688     // Iterator attributes
   1689     //-------------------------------------------------------------------------
   1690     /**
   1691      * Set the normalization mode for this object.
   1692      * <p>
   1693      * <b>Note:</b>If the normalization mode is changed while iterating
   1694      * over a string, calls to {@link #next} and {@link #previous} may
   1695      * return previously buffers characters in the old normalization mode
   1696      * until the iteration is able to re-sync at the next base character.
   1697      * It is safest to call {@link #setText setText()}, {@link #first},
   1698      * {@link #last}, etc. after calling <tt>setMode</tt>.
   1699      * <p>
   1700      * @param newMode the new mode for this <tt>Normalizer</tt>.
   1701      * The supported modes are:
   1702      * <ul>
   1703      *  <li>{@link #NFC}    - Unicode canonical decompositiion
   1704      *                        followed by canonical composition.
   1705      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
   1706      *                        follwed by canonical composition.
   1707      *  <li>{@link #NFD}    - Unicode canonical decomposition
   1708      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
   1709      *  <li>{@link #NONE}   - Do nothing but return characters
   1710      *                        from the underlying input text.
   1711      * </ul>
   1712      *
   1713      * @see #getMode
   1714      * @deprecated ICU 56
   1715      */
   1716     @Deprecated
   1717     public void setMode(Mode newMode) {
   1718         mode = newMode;
   1719         norm2 = mode.getNormalizer2(options);
   1720     }
   1721     /**
   1722      * Return the basic operation performed by this <tt>Normalizer</tt>
   1723      *
   1724      * @see #setMode
   1725      * @deprecated ICU 56
   1726      */
   1727     @Deprecated
   1728     public Mode getMode() {
   1729         return mode;
   1730     }
   1731     /**
   1732      * Set options that affect this <tt>Normalizer</tt>'s operation.
   1733      * Options do not change the basic composition or decomposition operation
   1734      * that is being performed , but they control whether
   1735      * certain optional portions of the operation are done.
   1736      * Currently the only available option is:
   1737      *
   1738      * <ul>
   1739      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
   1740      * </ul>
   1741      *
   1742      * @param   option  the option whose value is to be set.
   1743      * @param   value   the new setting for the option.  Use <tt>true</tt> to
   1744      *                  turn the option on and <tt>false</tt> to turn it off.
   1745      *
   1746      * @see #getOption
   1747      * @deprecated ICU 56
   1748      */
   1749     @Deprecated
   1750     public void setOption(int option,boolean value) {
   1751         if (value) {
   1752             options |= option;
   1753         } else {
   1754             options &= (~option);
   1755         }
   1756         norm2 = mode.getNormalizer2(options);
   1757     }
   1758 
   1759     /**
   1760      * Determine whether an option is turned on or off.
   1761      * <p>
   1762      * @see #setOption
   1763      * @deprecated ICU 56
   1764      */
   1765     @Deprecated
   1766     public int getOption(int option) {
   1767         if((options & option)!=0) {
   1768             return 1 ;
   1769         } else {
   1770             return 0;
   1771         }
   1772     }
   1773 
   1774     /**
   1775      * Gets the underlying text storage
   1776      * @param fillIn the char buffer to fill the UTF-16 units.
   1777      *         The length of the buffer should be equal to the length of the
   1778      *         underlying text storage
   1779      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
   1780      * @see   #getLength
   1781      * @deprecated ICU 56
   1782      */
   1783     @Deprecated
   1784     public int getText(char[] fillIn) {
   1785         return text.getText(fillIn);
   1786     }
   1787 
   1788     /**
   1789      * Gets the length of underlying text storage
   1790      * @return the length
   1791      * @deprecated ICU 56
   1792      */
   1793     @Deprecated
   1794     public int getLength() {
   1795         return text.getLength();
   1796     }
   1797 
   1798     /**
   1799      * Returns the text under iteration as a string
   1800      * @return a copy of the text under iteration.
   1801      * @deprecated ICU 56
   1802      */
   1803     @Deprecated
   1804     public String getText() {
   1805         return text.getText();
   1806     }
   1807 
   1808     /**
   1809      * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1810      * The iteration position is set to the beginning of the input text.
   1811      * @param newText   The new string to be normalized.
   1812      * @deprecated ICU 56
   1813      */
   1814     @Deprecated
   1815     public void setText(StringBuffer newText) {
   1816         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
   1817         if (newIter == null) {
   1818             throw new IllegalStateException("Could not create a new UCharacterIterator");
   1819         }
   1820         text = newIter;
   1821         reset();
   1822     }
   1823 
   1824     /**
   1825      * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1826      * The iteration position is set to the beginning of the input text.
   1827      * @param newText   The new string to be normalized.
   1828      * @deprecated ICU 56
   1829      */
   1830     @Deprecated
   1831     public void setText(char[] newText) {
   1832         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
   1833         if (newIter == null) {
   1834             throw new IllegalStateException("Could not create a new UCharacterIterator");
   1835         }
   1836         text = newIter;
   1837         reset();
   1838     }
   1839 
   1840     /**
   1841      * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1842      * The iteration position is set to the beginning of the input text.
   1843      * @param newText   The new string to be normalized.
   1844      * @deprecated ICU 56
   1845      */
   1846     @Deprecated
   1847     public void setText(String newText) {
   1848         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
   1849         if (newIter == null) {
   1850             throw new IllegalStateException("Could not create a new UCharacterIterator");
   1851         }
   1852         text = newIter;
   1853         reset();
   1854     }
   1855 
   1856     /**
   1857      * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1858      * The iteration position is set to the beginning of the input text.
   1859      * @param newText   The new string to be normalized.
   1860      * @deprecated ICU 56
   1861      */
   1862     @Deprecated
   1863     public void setText(CharacterIterator newText) {
   1864         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
   1865         if (newIter == null) {
   1866             throw new IllegalStateException("Could not create a new UCharacterIterator");
   1867         }
   1868         text = newIter;
   1869         reset();
   1870     }
   1871 
   1872     /**
   1873      * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1874      * The iteration position is set to the beginning of the string.
   1875      * @param newText   The new string to be normalized.
   1876      * @deprecated ICU 56
   1877      */
   1878     @Deprecated
   1879     public void setText(UCharacterIterator newText) {
   1880         try{
   1881             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
   1882             if (newIter == null) {
   1883                 throw new IllegalStateException("Could not create a new UCharacterIterator");
   1884             }
   1885             text = newIter;
   1886             reset();
   1887         }catch(CloneNotSupportedException e) {
   1888             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
   1889         }
   1890     }
   1891 
   1892     private void clearBuffer() {
   1893         buffer.setLength(0);
   1894         bufferPos=0;
   1895     }
   1896 
   1897     private boolean nextNormalize() {
   1898         clearBuffer();
   1899         currentIndex=nextIndex;
   1900         text.setIndex(nextIndex);
   1901         // Skip at least one character so we make progress.
   1902         int c=text.nextCodePoint();
   1903         if(c<0) {
   1904             return false;
   1905         }
   1906         StringBuilder segment=new StringBuilder().appendCodePoint(c);
   1907         while((c=text.nextCodePoint())>=0) {
   1908             if(norm2.hasBoundaryBefore(c)) {
   1909                 text.moveCodePointIndex(-1);
   1910                 break;
   1911             }
   1912             segment.appendCodePoint(c);
   1913         }
   1914         nextIndex=text.getIndex();
   1915         norm2.normalize(segment, buffer);
   1916         return buffer.length()!=0;
   1917     }
   1918 
   1919     private boolean previousNormalize() {
   1920         clearBuffer();
   1921         nextIndex=currentIndex;
   1922         text.setIndex(currentIndex);
   1923         StringBuilder segment=new StringBuilder();
   1924         int c;
   1925         while((c=text.previousCodePoint())>=0) {
   1926             if(c<=0xffff) {
   1927                 segment.insert(0, (char)c);
   1928             } else {
   1929                 segment.insert(0, Character.toChars(c));
   1930             }
   1931             if(norm2.hasBoundaryBefore(c)) {
   1932                 break;
   1933             }
   1934         }
   1935         currentIndex=text.getIndex();
   1936         norm2.normalize(segment, buffer);
   1937         bufferPos=buffer.length();
   1938         return buffer.length()!=0;
   1939     }
   1940 
   1941     /* compare canonically equivalent ------------------------------------------- */
   1942 
   1943     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
   1944     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
   1945         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
   1946         options|= COMPARE_EQUIV;
   1947 
   1948         /*
   1949          * UAX #21 Case Mappings, as fixed for Unicode version 4
   1950          * (see Jitterbug 2021), defines a canonical caseless match as
   1951          *
   1952          * A string X is a canonical caseless match
   1953          * for a string Y if and only if
   1954          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
   1955          *
   1956          * For better performance, we check for FCD (or let the caller tell us that
   1957          * both strings are in FCD) for the inner normalization.
   1958          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
   1959          * case-folding preserves the FCD-ness of a string.
   1960          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
   1961          * when there is a difference.
   1962          *
   1963          * Exception: When using the Turkic case-folding option, we do perform
   1964          * full NFD first. This is because in the Turkic case precomposed characters
   1965          * with 0049 capital I or 0069 small i fold differently whether they
   1966          * are first decomposed or not, so an FCD check - a check only for
   1967          * canonical order - is not sufficient.
   1968          */
   1969         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
   1970             Normalizer2 n2;
   1971             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
   1972                 n2=NFD.getNormalizer2(normOptions);
   1973             } else {
   1974                 n2=FCD.getNormalizer2(normOptions);
   1975             }
   1976 
   1977             // check if s1 and/or s2 fulfill the FCD conditions
   1978             int spanQCYes1=n2.spanQuickCheckYes(s1);
   1979             int spanQCYes2=n2.spanQuickCheckYes(s2);
   1980 
   1981             /*
   1982              * ICU 2.4 had a further optimization:
   1983              * If both strings were not in FCD, then they were both NFD'ed,
   1984              * and the COMPARE_EQUIV option was turned off.
   1985              * It is not entirely clear that this is valid with the current
   1986              * definition of the canonical caseless match.
   1987              * Therefore, ICU 2.6 removes that optimization.
   1988              */
   1989 
   1990             if(spanQCYes1<s1.length()) {
   1991                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
   1992                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
   1993             }
   1994             if(spanQCYes2<s2.length()) {
   1995                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
   1996                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
   1997             }
   1998         }
   1999 
   2000         return cmpEquivFold(s1, s2, options);
   2001     }
   2002 
   2003     /*
   2004      * Compare two strings for canonical equivalence.
   2005      * Further options include case-insensitive comparison and
   2006      * code point order (as opposed to code unit order).
   2007      *
   2008      * In this function, canonical equivalence is optional as well.
   2009      * If canonical equivalence is tested, then both strings must fulfill
   2010      * the FCD check.
   2011      *
   2012      * Semantically, this is equivalent to
   2013      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
   2014      * where code point order, NFD and foldCase are all optional.
   2015      *
   2016      * String comparisons almost always yield results before processing both strings
   2017      * completely.
   2018      * They are generally more efficient working incrementally instead of
   2019      * performing the sub-processing (strlen, normalization, case-folding)
   2020      * on the entire strings first.
   2021      *
   2022      * It is also unnecessary to normalize identical characters.
   2023      *
   2024      * This function works in principle as follows:
   2025      *
   2026      * loop {
   2027      *   get one code unit c1 from s1 (-1 if end of source)
   2028      *   get one code unit c2 from s2 (-1 if end of source)
   2029      *
   2030      *   if(either string finished) {
   2031      *     return result;
   2032      *   }
   2033      *   if(c1==c2) {
   2034      *     continue;
   2035      *   }
   2036      *
   2037      *   // c1!=c2
   2038      *   try to decompose/case-fold c1/c2, and continue if one does;
   2039      *
   2040      *   // still c1!=c2 and neither decomposes/case-folds, return result
   2041      *   return c1-c2;
   2042      * }
   2043      *
   2044      * When a character decomposes, then the pointer for that source changes to
   2045      * the decomposition, pushing the previous pointer onto a stack.
   2046      * When the end of the decomposition is reached, then the code unit reader
   2047      * pops the previous source from the stack.
   2048      * (Same for case-folding.)
   2049      *
   2050      * This is complicated further by operating on variable-width UTF-16.
   2051      * The top part of the loop works on code units, while lookups for decomposition
   2052      * and case-folding need code points.
   2053      * Code points are assembled after the equality/end-of-source part.
   2054      * The source pointer is only advanced beyond all code units when the code point
   2055      * actually decomposes/case-folds.
   2056      *
   2057      * If we were on a trail surrogate unit when assembling a code point,
   2058      * and the code point decomposes/case-folds, then the decomposition/folding
   2059      * result must be compared with the part of the other string that corresponds to
   2060      * this string's lead surrogate.
   2061      * Since we only assemble a code point when hitting a trail unit when the
   2062      * preceding lead units were identical, we back up the other string by one unit
   2063      * in such a case.
   2064      *
   2065      * The optional code point order comparison at the end works with
   2066      * the same fix-up as the other code point order comparison functions.
   2067      * See ustring.c and the comment near the end of this function.
   2068      *
   2069      * Assumption: A decomposition or case-folding result string never contains
   2070      * a single surrogate. This is a safe assumption in the Unicode Standard.
   2071      * Therefore, we do not need to check for surrogate pairs across
   2072      * decomposition/case-folding boundaries.
   2073      *
   2074      * Further assumptions (see the verifications in tstnorm.cpp):
   2075      * The API function checks for FCD first, while the core function
   2076      * first case-folds and then decomposes. This requires that case-folding does not
   2077      * un-FCD any strings.
   2078      *
   2079      * The API function may also NFD the input and turn off decomposition.
   2080      * This requires that case-folding does not un-NFD strings either.
   2081      *
   2082      * TODO If any of the above two assumptions is violated,
   2083      * then this entire code must be re-thought.
   2084      * If this happens, then a simple solution is to case-fold both strings up front
   2085      * and to turn off UNORM_INPUT_IS_FCD.
   2086      * We already do this when not both strings are in FCD because makeFCD
   2087      * would be a partial NFD before the case folding, which does not work.
   2088      * Note that all of this is only a problem when case-folding _and_
   2089      * canonical equivalence come together.
   2090      * (Comments in unorm_compare() are more up to date than this TODO.)
   2091      */
   2092 
   2093     /* stack element for previous-level source/decomposition pointers */
   2094     private static final class CmpEquivLevel {
   2095         CharSequence cs;
   2096         int s;
   2097     };
   2098     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
   2099         return new CmpEquivLevel[] {
   2100             new CmpEquivLevel(), new CmpEquivLevel()
   2101         };
   2102     }
   2103 
   2104     /**
   2105      * Internal option for unorm_cmpEquivFold() for decomposing.
   2106      * If not set, just do strcasecmp().
             * <p>
             * In this class the flag is read by {@code cmpEquivFold()}: when it is set,
             * that method fetches the NFC implementation data and looks up the
             * decomposition of code points that differ; when it is clear, no
             * decomposition lookup is made.
             * <p>
             * NOTE(review): {@code unorm_cmpEquivFold()} and {@code strcasecmp()} are not
             * defined in this file - presumably they are the names used by the C
             * implementation this code mirrors; confirm before relying on them.
   2107      */
   2108     private static final int COMPARE_EQUIV=0x80000;
   2109 
   2110     /* internal function; package visibility for use by UTF16.StringComparator */
            /*
             * Compares cs1 with cs2 one UTF-16 code unit at a time. When two code units
             * differ, the code point they belong to is replaced on the fly by its case
             * folding (only if COMPARE_IGNORE_CASE is set in options) and/or by its
             * decomposition taken from the NFC data (only if COMPARE_EQUIV is set), and
             * the comparison continues inside that replacement text. Only when neither
             * side can be replaced any further is a difference reported.
             *
             * cs1, cs2: the texts to compare; they are used without any null or range checks.
             * options:  bit set. This method itself tests COMPARE_EQUIV, COMPARE_IGNORE_CASE
             *           and COMPARE_CODE_POINT_ORDER, and hands the complete value on to
             *           UCaseProps.toFullFolding().
             *
             * Returns 0 if both texts run out at the same time, -1 if the first text runs out
             * while the second still has a code unit, 1 in the opposite case, and otherwise
             * c1-c2 for the first pair of code units that still differ (after the
             * surrogate fix-up near the end when COMPARE_CODE_POINT_ORDER is set).
             */
   2111     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
   2112         Normalizer2Impl nfcImpl;
   2113         UCaseProps csp;
   2114
   2115         /* current-level start/limit - s1/s2 as current */
   2116         int s1, s2, limit1, limit2;
   2117
   2118         /* decomposition and case folding variables */
   2119         int length;
   2120
   2121         /* stacks of previous-level start/current/limit */
   2122         CmpEquivLevel[] stack1=null, stack2=null;
   2123
   2124         /* buffers for algorithmic decompositions */
   2125         String decomp1, decomp2;
   2126
   2127         /* case folding buffers, only use current-level start/limit */
   2128         StringBuilder fold1, fold2;
   2129
   2130         /* track which is the current level per string */
                /*
                 * As maintained by the push code further down: 0 while reading the
                 * caller's text, 1 while reading a case folding result, and 2 while
                 * reading a decomposition (level 1 is filled with a null placeholder
                 * when a decomposition is pushed directly from level 0).
                 */
   2131         int level1, level2;
   2132
   2133         /* current code units, and code points for lookups */
   2134         int c1, c2, cp1, cp2;
   2135
   2136         /* no argument error checking because this itself is not an API */
   2137
   2138         /*
   2139          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
   2140          * otherwise this function must behave exactly as uprv_strCompare()
   2141          * not checking for that here makes testing this function easier
   2142          */
   2143
   2144         /* normalization/properties data loaded? */
   2145         if((options&COMPARE_EQUIV)!=0) {
   2146             nfcImpl=Norm2AllModes.getNFCInstance().impl;
   2147         } else {
   2148             nfcImpl=null;
   2149         }
   2150         if((options&COMPARE_IGNORE_CASE)!=0) {
   2151             csp=UCaseProps.INSTANCE;
   2152             fold1=new StringBuilder();
   2153             fold2=new StringBuilder();
   2154         } else {
   2155             csp=null;
   2156             fold1=fold2=null;
   2157         }
   2158
   2159         /* initialize */
   2160         s1=0;
   2161         limit1=cs1.length();
   2162         s2=0;
   2163         limit2=cs2.length();
   2164
   2165         level1=level2=0;
   2166         c1=c2=-1;
   2167
   2168         /* comparison loop */
   2169         for(;;) {
   2170             /*
   2171              * here a code unit value of -1 means "get another code unit"
   2172              * below it will mean "this source is finished"
   2173              */
   2174
   2175             if(c1<0) {
   2176                 /* get next code unit from string 1, post-increment */
   2177                 for(;;) {
   2178                     if(s1==limit1) {
   2179                         if(level1==0) {
   2180                             c1=-1;
   2181                             break;
   2182                         }
   2183                     } else {
   2184                         c1=cs1.charAt(s1++);
   2185                         break;
   2186                     }
   2187
   2188                     /* reached end of level buffer, pop one level */
                            /* a null cs is the placeholder stored for a skipped level; keep popping past it */
   2189                     do {
   2190                         --level1;
   2191                         cs1=stack1[level1].cs;
   2192                     } while(cs1==null);
   2193                     s1=stack1[level1].s;
   2194                     limit1=cs1.length();
   2195                 }
   2196             }
   2197
   2198             if(c2<0) {
   2199                 /* get next code unit from string 2, post-increment */
   2200                 for(;;) {
   2201                     if(s2==limit2) {
   2202                         if(level2==0) {
   2203                             c2=-1;
   2204                             break;
   2205                         }
   2206                     } else {
   2207                         c2=cs2.charAt(s2++);
   2208                         break;
   2209                     }
   2210
   2211                     /* reached end of level buffer, pop one level */
                            /* same placeholder skipping as for string 1 */
   2212                     do {
   2213                         --level2;
   2214                         cs2=stack2[level2].cs;
   2215                     } while(cs2==null);
   2216                     s2=stack2[level2].s;
   2217                     limit2=cs2.length();
   2218                 }
   2219             }
   2220
   2221             /*
   2222              * compare c1 and c2
   2223              * either variable c1, c2 is -1 only if the corresponding string is finished
   2224              */
   2225             if(c1==c2) {
   2226                 if(c1<0) {
   2227                     return 0;   /* c1==c2==-1 indicating end of strings */
   2228                 }
   2229                 c1=c2=-1;       /* make us fetch new code units */
   2230                 continue;
   2231             } else if(c1<0) {
   2232                 return -1;      /* string 1 ends before string 2 */
   2233             } else if(c2<0) {
   2234                 return 1;       /* string 2 ends before string 1 */
   2235             }
   2236             /* c1!=c2 && c1>=0 && c2>=0 */
   2237
   2238             /* get complete code points for c1, c2 for lookups if either is a surrogate */
   2239             cp1=c1;
   2240             if(UTF16.isSurrogate((char)c1)) {
   2241                 char c;
   2242
   2243                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
   2244                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
   2245                         /* advance ++s1; only below if cp1 decomposes/case-folds */
   2246                         cp1=Character.toCodePoint((char)c1, c);
   2247                     }
   2248                 } else /* isTrail(c1) */ {
                            /* s1 was already post-incremented past c1, so the unit before c1 is at s1-2 */
   2249                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
   2250                         cp1=Character.toCodePoint(c, (char)c1);
   2251                     }
   2252                 }
   2253             }
   2254
   2255             cp2=c2;
   2256             if(UTF16.isSurrogate((char)c2)) {
   2257                 char c;
   2258
   2259                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
   2260                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
   2261                         /* advance ++s2; only below if cp2 decomposes/case-folds */
   2262                         cp2=Character.toCodePoint((char)c2, c);
   2263                     }
   2264                 } else /* isTrail(c2) */ {
   2265                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
   2266                         cp2=Character.toCodePoint(c, (char)c2);
   2267                     }
   2268                 }
   2269             }
   2270
   2271             /*
   2272              * go down one level for each string
   2273              * continue with the main loop as soon as there is a real change
   2274              */
   2275
                    /* case folding is attempted only while string 1 is still at level 0 */
   2276             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
   2277                 (length=csp.toFullFolding(cp1, fold1, options))>=0
   2278             ) {
   2279                 /* cp1 case-folds to the code point "length" or to p[length] */
   2280                 if(UTF16.isSurrogate((char)c1)) {
   2281                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
   2282                         /* advance beyond source surrogate pair if it case-folds */
   2283                         ++s1;
   2284                     } else /* isTrail(c1) */ {
   2285                         /*
   2286                          * we got a supplementary code point when hitting its trail surrogate,
   2287                          * therefore the lead surrogate must have been the same as in the other string;
   2288                          * compare this decomposition with the lead surrogate in the other string
   2289                          * remember that this simulates bulk text replacement:
   2290                          * the decomposition would replace the entire code point
   2291                          */
   2292                         --s2;
   2293                         c2=cs2.charAt(s2-1);
   2294                     }
   2295                 }
   2296
   2297                 /* push current level pointers */
   2298                 if(stack1==null) {
   2299                     stack1=createCmpEquivLevelStack();
   2300                 }
   2301                 stack1[0].cs=cs1;
   2302                 stack1[0].s=s1;
   2303                 ++level1;
   2304
   2305                 /* copy the folding result to fold1[] */
   2306                 /* Java: the buffer was probably not empty, remove the old contents */
                        /*
                         * NOTE(review): the delete() below keeps only the last "length" code units of
                         * fold1 - presumably toFullFolding() appended a multi-unit folding to the
                         * buffer and returned its length; confirm against UCaseProps.
                         * In the other branch "length" is itself the folded code point.
                         */
   2307                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
   2308                     fold1.delete(0, fold1.length()-length);
   2309                 } else {
   2310                     fold1.setLength(0);
   2311                     fold1.appendCodePoint(length);
   2312                 }
   2313
   2314                 /* set next level pointers to case folding */
   2315                 cs1=fold1;
   2316                 s1=0;
   2317                 limit1=fold1.length();
   2318
   2319                 /* get ready to read from decomposition, continue with loop */
   2320                 c1=-1;
   2321                 continue;
   2322             }
   2323
                    /* mirror image of the case folding step above, applied to string 2 */
   2324             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
   2325                 (length=csp.toFullFolding(cp2, fold2, options))>=0
   2326             ) {
   2327                 /* cp2 case-folds to the code point "length" or to p[length] */
   2328                 if(UTF16.isSurrogate((char)c2)) {
   2329                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
   2330                         /* advance beyond source surrogate pair if it case-folds */
   2331                         ++s2;
   2332                     } else /* isTrail(c2) */ {
   2333                         /*
   2334                          * we got a supplementary code point when hitting its trail surrogate,
   2335                          * therefore the lead surrogate must have been the same as in the other string;
   2336                          * compare this decomposition with the lead surrogate in the other string
   2337                          * remember that this simulates bulk text replacement:
   2338                          * the decomposition would replace the entire code point
   2339                          */
   2340                         --s1;
   2341                         c1=cs1.charAt(s1-1);
   2342                     }
   2343                 }
   2344
   2345                 /* push current level pointers */
   2346                 if(stack2==null) {
   2347                     stack2=createCmpEquivLevelStack();
   2348                 }
   2349                 stack2[0].cs=cs2;
   2350                 stack2[0].s=s2;
   2351                 ++level2;
   2352
   2353                 /* copy the folding result to fold2[] */
   2354                 /* Java: the buffer was probably not empty, remove the old contents */
   2355                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
   2356                     fold2.delete(0, fold2.length()-length);
   2357                 } else {
   2358                     fold2.setLength(0);
   2359                     fold2.appendCodePoint(length);
   2360                 }
   2361
   2362                 /* set next level pointers to case folding */
   2363                 cs2=fold2;
   2364                 s2=0;
   2365                 limit2=fold2.length();
   2366
   2367                 /* get ready to read from decomposition, continue with loop */
   2368                 c2=-1;
   2369                 continue;
   2370             }
   2371
                    /*
                     * decomposition is attempted at level 0 or 1 (caller's text or a case folding
                     * result) but never while already reading a decomposition (level 2)
                     */
   2372             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
   2373                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
   2374             ) {
   2375                 /* cp1 decomposes into p[length] */
   2376                 if(UTF16.isSurrogate((char)c1)) {
   2377                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
   2378                         /* advance beyond source surrogate pair if it decomposes */
   2379                         ++s1;
   2380                     } else /* isTrail(c1) */ {
   2381                         /*
   2382                          * we got a supplementary code point when hitting its trail surrogate,
   2383                          * therefore the lead surrogate must have been the same as in the other string;
   2384                          * compare this decomposition with the lead surrogate in the other string
   2385                          * remember that this simulates bulk text replacement:
   2386                          * the decomposition would replace the entire code point
   2387                          */
   2388                         --s2;
   2389                         c2=cs2.charAt(s2-1);
   2390                     }
   2391                 }
   2392
   2393                 /* push current level pointers */
   2394                 if(stack1==null) {
   2395                     stack1=createCmpEquivLevelStack();
   2396                 }
   2397                 stack1[level1].cs=cs1;
   2398                 stack1[level1].s=s1;
   2399                 ++level1;
   2400
   2401                 /* set empty intermediate level if skipped */
                        /* after this the level is always 2, whether we came from level 0 or level 1 */
   2402                 if(level1<2) {
   2403                     stack1[level1++].cs=null;
   2404                 }
   2405
   2406                 /* set next level pointers to decomposition */
   2407                 cs1=decomp1;
   2408                 s1=0;
   2409                 limit1=decomp1.length();
   2410
   2411                 /* get ready to read from decomposition, continue with loop */
   2412                 c1=-1;
   2413                 continue;
   2414             }
   2415
                    /* mirror image of the decomposition step above, applied to string 2 */
   2416             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
   2417                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
   2418             ) {
   2419                 /* cp2 decomposes into p[length] */
   2420                 if(UTF16.isSurrogate((char)c2)) {
   2421                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
   2422                         /* advance beyond source surrogate pair if it decomposes */
   2423                         ++s2;
   2424                     } else /* isTrail(c2) */ {
   2425                         /*
   2426                          * we got a supplementary code point when hitting its trail surrogate,
   2427                          * therefore the lead surrogate must have been the same as in the other string;
   2428                          * compare this decomposition with the lead surrogate in the other string
   2429                          * remember that this simulates bulk text replacement:
   2430                          * the decomposition would replace the entire code point
   2431                          */
   2432                         --s1;
   2433                         c1=cs1.charAt(s1-1);
   2434                     }
   2435                 }
   2436
   2437                 /* push current level pointers */
   2438                 if(stack2==null) {
   2439                     stack2=createCmpEquivLevelStack();
   2440                 }
   2441                 stack2[level2].cs=cs2;
   2442                 stack2[level2].s=s2;
   2443                 ++level2;
   2444
   2445                 /* set empty intermediate level if skipped */
   2446                 if(level2<2) {
   2447                     stack2[level2++].cs=null;
   2448                 }
   2449
   2450                 /* set next level pointers to decomposition */
   2451                 cs2=decomp2;
   2452                 s2=0;
   2453                 limit2=decomp2.length();
   2454
   2455                 /* get ready to read from decomposition, continue with loop */
   2456                 c2=-1;
   2457                 continue;
   2458             }
   2459
   2460             /*
   2461              * no decomposition/case folding, max level for both sides:
   2462              * return difference result
   2463              *
   2464              * code point order comparison must not just return cp1-cp2
   2465              * because when single surrogates are present then the surrogate pairs
   2466              * that formed cp1 and cp2 may be from different string indexes
   2467              *
   2468              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
   2469              * c1=d800 cp1=10001 c2=dc00 cp2=10000
   2470              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
   2471              *
   2472              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
   2473              * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
   2474              * so we have slightly different pointer/start/limit comparisons here
   2475              */
   2476
   2477             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
   2478                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
   2479                 if(
   2480                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
   2481                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
   2482                 ) {
   2483                     /* part of a surrogate pair, leave >=d800 */
   2484                 } else {
   2485                     /* BMP code point - may be surrogate code point - make <d800 */
   2486                     c1-=0x2800;
   2487                 }
   2488
   2489                 if(
   2490                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
   2491                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
   2492                 ) {
   2493                     /* part of a surrogate pair, leave >=d800 */
   2494                 } else {
   2495                     /* BMP code point - may be surrogate code point - make <d800 */
   2496                     c2-=0x2800;
   2497                 }
   2498             }
   2499
                    /* the sign of the code unit difference is the result; only end-of-text yields exactly -1, 0 or 1 */
   2500             return c1-c2;
   2501         }
   2502     }
   2503 
   2504     /**
   2505      * An Appendable that writes into a char array with a capacity that may be
   2506      * less than array.length.
   2507      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
   2508      * <p>
   2509      * An overflow is only reported at the end, for the old Normalizer API functions that write
   2510      * to char arrays.
   2511      */
   2512     private static final class CharsAppendable implements Appendable {
   2513         public CharsAppendable(char[] dest, int destStart, int destLimit) {
   2514             chars=dest;
   2515             start=offset=destStart;
   2516             limit=destLimit;
   2517         }
   2518         public int length() {
   2519             int len=offset-start;
   2520             if(offset<=limit) {
   2521                 return len;
   2522             } else {
   2523                 throw new IndexOutOfBoundsException(Integer.toString(len));
   2524             }
   2525         }
   2526         @Override
   2527         public Appendable append(char c) {
   2528             if(offset<limit) {
   2529                 chars[offset]=c;
   2530             }
   2531             ++offset;
   2532             return this;
   2533         }
   2534         @Override
   2535         public Appendable append(CharSequence s) {
   2536             return append(s, 0, s.length());
   2537         }
   2538         @Override
   2539         public Appendable append(CharSequence s, int sStart, int sLimit) {
   2540             int len=sLimit-sStart;
   2541             if(len<=(limit-offset)) {
   2542                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
   2543                     chars[offset++]=s.charAt(sStart++);
   2544                 }
   2545             } else {
   2546                 offset+=len;
   2547             }
   2548             return this;
   2549         }
   2550 
   2551         private final char[] chars;
   2552         private final int start, limit;
   2553         private int offset;
   2554     }
   2555 }
   2556