Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /**
      5  *******************************************************************************
      6  * Copyright (C) 1996-2016, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 
     11 package android.icu.text;
     12 
     13 import android.icu.impl.Utility;
     14 
     15 /**
     16  * <p>
     17  * Standalone utility class providing UTF16 character conversions and indexing conversions.
     18  * </p>
     19  * <p>
      20  * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
     21  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
     22  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
     23  * values for start and end are on those boundaries, since they arose from operations like
     24  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
     25  * </p>
     26  * <strong>Examples:</strong>
     27  * <p>
     28  * The following examples illustrate use of some of these methods.
     29  *
     30  * <pre>
     31  * // iteration forwards: Original
     32  * for (int i = 0; i &lt; s.length(); ++i) {
     33  *     char ch = s.charAt(i);
     34  *     doSomethingWith(ch);
     35  * }
     36  *
     37  * // iteration forwards: Changes for UTF-32
     38  * int ch;
     39  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
     40  *     ch = UTF16.charAt(s, i);
     41  *     doSomethingWith(ch);
     42  * }
     43  *
     44  * // iteration backwards: Original
     45  * for (int i = s.length() - 1; i &gt;= 0; --i) {
     46  *     char ch = s.charAt(i);
     47  *     doSomethingWith(ch);
     48  * }
     49  *
     50  * // iteration backwards: Changes for UTF-32
     51  * int ch;
      52  * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
     53  *     ch = UTF16.charAt(s, i);
     54  *     doSomethingWith(ch);
     55  * }
     56  * </pre>
     57  *
     58  * <strong>Notes:</strong>
     59  * <ul>
     60  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
     61  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
     62  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
     63  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
     64  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
     65  * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
     66  * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
     67  * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
     68  * </li>
     69  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
      70  * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
     71  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
     72  * check for validity if desired. </li>
     73  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
     74  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
     75  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
     76  * 5.5). </li>
     77  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
      78  * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
     79  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
     80  * </ul>
     81  *
     82  * @author Mark Davis, with help from Markus Scherer
     83  * @hide Only a subset of ICU is exposed in Android
     84  */
     85 
     86 public final class UTF16 {
     87     // public variables ---------------------------------------------------
     88 
     89     /**
     90      * Value returned in {@link #bounds(String, int) bounds()}.
     91      * These values are chosen specifically so that it actually represents the position of the
     92      * character [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
     93      */
     94     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
     95             TRAIL_SURROGATE_BOUNDARY = 5;
     96 
     97     /**
     98      * The lowest Unicode code point value.
     99      */
    100     public static final int CODEPOINT_MIN_VALUE = 0;
    101 
    102     /**
    103      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
    104      */
    105     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    106 
    107     /**
    108      * The minimum value for Supplementary code points
    109      */
    110     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    111 
    112     /**
    113      * Lead surrogate minimum value
    114      */
    115     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    116 
    117     /**
    118      * Trail surrogate minimum value
    119      */
    120     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    121 
    122     /**
    123      * Lead surrogate maximum value
    124      */
    125     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    126 
    127     /**
    128      * Trail surrogate maximum value
    129      */
    130     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    131 
    132     /**
    133      * Surrogate minimum value
    134      */
    135     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
    136 
    137     /**
    138      * Maximum surrogate value
    139      */
    140     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
    141 
    142     /**
    143      * Lead surrogate bitmask
    144      */
    145     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
    146 
    147     /**
    148      * Trail surrogate bitmask
    149      */
    150     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
    151 
    152     /**
    153      * Surrogate bitmask
    154      */
    155     private static final int SURROGATE_BITMASK = 0xFFFFF800;
    156 
    157     /**
    158      * Lead surrogate bits
    159      */
    160     private static final int LEAD_SURROGATE_BITS = 0xD800;
    161 
    162     /**
    163      * Trail surrogate bits
    164      */
    165     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
    166 
    167     /**
    168      * Surrogate bits
    169      */
    170     private static final int SURROGATE_BITS = 0xD800;
    171 
    172     // constructor --------------------------------------------------------
    173 
    174     // /CLOVER:OFF
    175     /**
    176      * Prevent instance from being created.
    177      */
    private UTF16() {
        // Intentionally empty: the private constructor only blocks instantiation.
    }
    180 
    181     // /CLOVER:ON
    182     // public method ------------------------------------------------------
    183 
    184     /**
    185      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
     186      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
    187      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
    188      * UCharacter.isLegal()</a></code>
    189      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
    190      * character will be returned. If a complete supplementary character is not found the incomplete
    191      * character will be returned
    192      *
     193      * @param source String of UTF-16 chars
    194      * @param offset16 UTF-16 offset to the start of the character.
    195      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
    196      *         of that codepoint are the same as in <code>bounds32()</code>.
    197      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
    198      */
    199     public static int charAt(String source, int offset16) {
    200         char single = source.charAt(offset16);
    201         if (single < LEAD_SURROGATE_MIN_VALUE) {
    202             return single;
    203         }
    204         return _charAt(source, offset16, single);
    205     }
    206 
    207     private static int _charAt(String source, int offset16, char single) {
    208         if (single > TRAIL_SURROGATE_MAX_VALUE) {
    209             return single;
    210         }
    211 
    212         // Convert the UTF-16 surrogate pair if necessary.
    213         // For simplicity in usage, and because the frequency of pairs is
    214         // low, look both directions.
    215 
    216         if (single <= LEAD_SURROGATE_MAX_VALUE) {
    217             ++offset16;
    218             if (source.length() != offset16) {
    219                 char trail = source.charAt(offset16);
    220                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
    221                     return Character.toCodePoint(single, trail);
    222                 }
    223             }
    224         } else {
    225             --offset16;
    226             if (offset16 >= 0) {
    227                 // single is a trail surrogate so
    228                 char lead = source.charAt(offset16);
    229                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
    230                     return Character.toCodePoint(lead, single);
    231                 }
    232             }
    233         }
    234         return single; // return unmatched surrogate
    235     }
    236 
    237     /**
    238      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
     239      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
    240      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
    241      * UCharacter.isLegal()</a></code>
    242      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
    243      * character will be returned. If a complete supplementary character is not found the incomplete
    244      * character will be returned
    245      *
    246      * @param source Array of UTF-16 chars
    247      * @param offset16 UTF-16 offset to the start of the character.
    248      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
    249      *         of that codepoint are the same as in <code>bounds32()</code>.
    250      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
    251      */
    252     public static int charAt(CharSequence source, int offset16) {
    253         char single = source.charAt(offset16);
    254         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
    255             return single;
    256         }
    257         return _charAt(source, offset16, single);
    258     }
    259 
    260     private static int _charAt(CharSequence source, int offset16, char single) {
    261         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
    262             return single;
    263         }
    264 
    265         // Convert the UTF-16 surrogate pair if necessary.
    266         // For simplicity in usage, and because the frequency of pairs is
    267         // low, look both directions.
    268 
    269         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
    270             ++offset16;
    271             if (source.length() != offset16) {
    272                 char trail = source.charAt(offset16);
    273                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
    274                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
    275                     return Character.toCodePoint(single, trail);
    276                 }
    277             }
    278         } else {
    279             --offset16;
    280             if (offset16 >= 0) {
    281                 // single is a trail surrogate so
    282                 char lead = source.charAt(offset16);
    283                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
    284                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
    285                     return Character.toCodePoint(lead, single);
    286                 }
    287             }
    288         }
    289         return single; // return unmatched surrogate
    290     }
    291 
    292     /**
    293      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
     294      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
    295      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
    296      * </a></code>
    297      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
    298      * character will be returned. If a complete supplementary character is not found the incomplete
    299      * character will be returned
    300      *
    301      * @param source UTF-16 chars string buffer
    302      * @param offset16 UTF-16 offset to the start of the character.
    303      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
    304      *         of that codepoint are the same as in <code>bounds32()</code>.
    305      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
    306      */
    307     public static int charAt(StringBuffer source, int offset16) {
    308         if (offset16 < 0 || offset16 >= source.length()) {
    309             throw new StringIndexOutOfBoundsException(offset16);
    310         }
    311 
    312         char single = source.charAt(offset16);
    313         if (!isSurrogate(single)) {
    314             return single;
    315         }
    316 
    317         // Convert the UTF-16 surrogate pair if necessary.
    318         // For simplicity in usage, and because the frequency of pairs is
    319         // low, look both directions.
    320 
    321         if (single <= LEAD_SURROGATE_MAX_VALUE) {
    322             ++offset16;
    323             if (source.length() != offset16) {
    324                 char trail = source.charAt(offset16);
    325                 if (isTrailSurrogate(trail))
    326                     return Character.toCodePoint(single, trail);
    327             }
    328         } else {
    329             --offset16;
    330             if (offset16 >= 0) {
    331                 // single is a trail surrogate so
    332                 char lead = source.charAt(offset16);
    333                 if (isLeadSurrogate(lead)) {
    334                     return Character.toCodePoint(lead, single);
    335                 }
    336             }
    337         }
    338         return single; // return unmatched surrogate
    339     }
    340 
    341     /**
    342      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
     343      * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
    344      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
    345      * </a></code>
    346      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
    347      * character will be returned. If a complete supplementary character is not found the incomplete
    348      * character will be returned
    349      *
    350      * @param source Array of UTF-16 chars
    351      * @param start Offset to substring in the source array for analyzing
    352      * @param limit Offset to substring in the source array for analyzing
    353      * @param offset16 UTF-16 offset relative to start
    354      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
    355      *         of that codepoint are the same as in <code>bounds32()</code>.
    356      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
    357      */
    358     public static int charAt(char source[], int start, int limit, int offset16) {
    359         offset16 += start;
    360         if (offset16 < start || offset16 >= limit) {
    361             throw new ArrayIndexOutOfBoundsException(offset16);
    362         }
    363 
    364         char single = source[offset16];
    365         if (!isSurrogate(single)) {
    366             return single;
    367         }
    368 
    369         // Convert the UTF-16 surrogate pair if necessary.
    370         // For simplicity in usage, and because the frequency of pairs is
    371         // low, look both directions.
    372         if (single <= LEAD_SURROGATE_MAX_VALUE) {
    373             offset16++;
    374             if (offset16 >= limit) {
    375                 return single;
    376             }
    377             char trail = source[offset16];
    378             if (isTrailSurrogate(trail)) {
    379                 return Character.toCodePoint(single, trail);
    380             }
    381         } else { // isTrailSurrogate(single), so
    382             if (offset16 == start) {
    383                 return single;
    384             }
    385             offset16--;
    386             char lead = source[offset16];
    387             if (isLeadSurrogate(lead))
    388                 return Character.toCodePoint(lead, single);
    389         }
    390         return single; // return unmatched surrogate
    391     }
    392 
    393     /**
    394      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
    395      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
    396      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
    397      * </a></code>
    398      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
    399      * character will be returned. If a complete supplementary character is not found the incomplete
    400      * character will be returned
    401      *
    402      * @param source UTF-16 chars string buffer
    403      * @param offset16 UTF-16 offset to the start of the character.
    404      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
    405      *         of that codepoint are the same as in <code>bounds32()</code>.
    406      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
    407      */
    408     public static int charAt(Replaceable source, int offset16) {
    409         if (offset16 < 0 || offset16 >= source.length()) {
    410             throw new StringIndexOutOfBoundsException(offset16);
    411         }
    412 
    413         char single = source.charAt(offset16);
    414         if (!isSurrogate(single)) {
    415             return single;
    416         }
    417 
    418         // Convert the UTF-16 surrogate pair if necessary.
    419         // For simplicity in usage, and because the frequency of pairs is
    420         // low, look both directions.
    421 
    422         if (single <= LEAD_SURROGATE_MAX_VALUE) {
    423             ++offset16;
    424             if (source.length() != offset16) {
    425                 char trail = source.charAt(offset16);
    426                 if (isTrailSurrogate(trail))
    427                     return Character.toCodePoint(single, trail);
    428             }
    429         } else {
    430             --offset16;
    431             if (offset16 >= 0) {
    432                 // single is a trail surrogate so
    433                 char lead = source.charAt(offset16);
    434                 if (isLeadSurrogate(lead)) {
    435                     return Character.toCodePoint(lead, single);
    436                 }
    437             }
    438         }
    439         return single; // return unmatched surrogate
    440     }
    441 
    442     /**
    443      * Determines how many chars this char32 requires. If a validity check is required, use <code>
    444      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
    445      * on char32 before calling.
    446      *
    447      * @param char32 The input codepoint.
    448      * @return 2 if is in supplementary space, otherwise 1.
    449      */
    450     public static int getCharCount(int char32) {
    451         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
    452             return 1;
    453         }
    454         return 2;
    455     }
    456 
    457     /**
    458      * Returns the type of the boundaries around the char at offset16. Used for random access.
    459      *
    460      * @param source Text to analyse
    461      * @param offset16 UTF-16 offset
    462      * @return
    463      *            <ul>
    464      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
    465      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
    466      *            are [offset16, offset16 + 2]
    467      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
    468      *            bounds are [offset16 - 1, offset16 + 1]
    469      *            </ul>
    470      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
    471      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
    472      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
    473      */
    474     public static int bounds(String source, int offset16) {
    475         char ch = source.charAt(offset16);
    476         if (isSurrogate(ch)) {
    477             if (isLeadSurrogate(ch)) {
    478                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
    479                     return LEAD_SURROGATE_BOUNDARY;
    480                 }
    481             } else {
    482                 // isTrailSurrogate(ch), so
    483                 --offset16;
    484                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
    485                     return TRAIL_SURROGATE_BOUNDARY;
    486                 }
    487             }
    488         }
    489         return SINGLE_CHAR_BOUNDARY;
    490     }
    491 
    492     /**
    493      * Returns the type of the boundaries around the char at offset16. Used for random access.
    494      *
    495      * @param source String buffer to analyse
    496      * @param offset16 UTF16 offset
    497      * @return
    498      *            <ul>
    499      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
    500      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
    501      *            are [offset16, offset16 + 2]
    502      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
    503      *            bounds are [offset16 - 1, offset16 + 1]
    504      *            </ul>
    505      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
    506      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
    507      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
    508      */
    509     public static int bounds(StringBuffer source, int offset16) {
    510         char ch = source.charAt(offset16);
    511         if (isSurrogate(ch)) {
    512             if (isLeadSurrogate(ch)) {
    513                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
    514                     return LEAD_SURROGATE_BOUNDARY;
    515                 }
    516             } else {
    517                 // isTrailSurrogate(ch), so
    518                 --offset16;
    519                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
    520                     return TRAIL_SURROGATE_BOUNDARY;
    521                 }
    522             }
    523         }
    524         return SINGLE_CHAR_BOUNDARY;
    525     }
    526 
    527     /**
    528      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
    529      * that the boundaries are determined with respect to the subarray, hence the char array
    530      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
    531      *
    532      * @param source Char array to analyse
    533      * @param start Offset to substring in the source array for analyzing
    534      * @param limit Offset to substring in the source array for analyzing
    535      * @param offset16 UTF16 offset relative to start
    536      * @return
    537      *            <ul>
     538      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
    539      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
    540      *            are [offset16, offset16 + 2]
    541      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
    542      *            bounds are [offset16 - 1, offset16 + 1]
    543      *            </ul>
    544      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
    545      *            can be gotten by: [offset16 - (boundvalue &gt;&gt; 2), offset16 + (boundvalue &amp; 3)].
    546      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
    547      */
    548     public static int bounds(char source[], int start, int limit, int offset16) {
    549         offset16 += start;
    550         if (offset16 < start || offset16 >= limit) {
    551             throw new ArrayIndexOutOfBoundsException(offset16);
    552         }
    553         char ch = source[offset16];
    554         if (isSurrogate(ch)) {
    555             if (isLeadSurrogate(ch)) {
    556                 ++offset16;
    557                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
    558                     return LEAD_SURROGATE_BOUNDARY;
    559                 }
    560             } else { // isTrailSurrogate(ch), so
    561                 --offset16;
    562                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
    563                     return TRAIL_SURROGATE_BOUNDARY;
    564                 }
    565             }
    566         }
    567         return SINGLE_CHAR_BOUNDARY;
    568     }
    569 
    570     /**
    571      * Determines whether the code value is a surrogate.
    572      *
    573      * @param char16 The input character.
    574      * @return true If the input character is a surrogate.
    575      */
    576     public static boolean isSurrogate(char char16) {
    577         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
    578     }
    579 
    580     /**
    581      * Determines whether the character is a trail surrogate.
    582      *
    583      * @param char16 The input character.
    584      * @return true If the input character is a trail surrogate.
    585      */
    586     public static boolean isTrailSurrogate(char char16) {
    587         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
    588     }
    589 
    590     /**
    591      * Determines whether the character is a lead surrogate.
    592      *
    593      * @param char16 The input character.
    594      * @return true If the input character is a lead surrogate
    595      */
    596     public static boolean isLeadSurrogate(char char16) {
    597         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
    598     }
    599 
    600     /**
    601      * Returns the lead surrogate. If a validity check is required, use
    602      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
    603      * before calling.
    604      *
    605      * @param char32 The input character.
    606      * @return lead surrogate if the getCharCount(ch) is 2; <br>
    607      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
    608      */
    609     public static char getLeadSurrogate(int char32) {
    610         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
    611             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
    612         }
    613         return 0;
    614     }
    615 
    616     /**
    617      * Returns the trail surrogate. If a validity check is required, use
    618      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
    619      * before calling.
    620      *
    621      * @param char32 The input character.
    622      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
    623      *         otherwise the character itself
    624      */
    625     public static char getTrailSurrogate(int char32) {
    626         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
    627             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
    628         }
    629         return (char) char32;
    630     }
    631 
    632     /**
    633      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
    634      * containing the UTF-32 value in UTF16 format. If a validity check is required, use
    635      * {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before calling.
    636      *
    637      * @param char32 The input character.
    638      * @return string value of char32 in UTF16 format
     639      * @exception IllegalArgumentException Thrown if char32 is an invalid codepoint.
    640      */
    641     public static String valueOf(int char32) {
    642         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
    643             throw new IllegalArgumentException("Illegal codepoint");
    644         }
    645         return toString(char32);
    646     }
    647 
    648     /**
    649      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
    650      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
    651      * character, the whole supplementary codepoint will be returned. If a validity check is
    652      * required, use {@link android.icu.lang.UCharacter#isLegal(int)} on the
    653      * codepoint at offset16 before calling. The result returned will be a newly created String
    654      * obtained by calling source.substring(..) with the appropriate indexes.
    655      *
    656      * @param source The input string.
    657      * @param offset16 The UTF16 index to the codepoint in source
    658      * @return string value of char32 in UTF16 format
    659      */
    660     public static String valueOf(String source, int offset16) {
    661         switch (bounds(source, offset16)) {
    662         case LEAD_SURROGATE_BOUNDARY:
    663             return source.substring(offset16, offset16 + 2);
    664         case TRAIL_SURROGATE_BOUNDARY:
    665             return source.substring(offset16 - 1, offset16 + 1);
    666         default:
    667             return source.substring(offset16, offset16 + 1);
    668         }
    669     }
    670 
    671     /**
    672      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
    673      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
    674      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
    675      * is required, use {@link android.icu.lang.UCharacter#isLegal(int)} on
    676      * the codepoint at offset16 before calling. The result returned will be a newly created String
    677      * obtained by calling source.substring(..) with the appropriate indexes.
    678      *
    679      * @param source The input string buffer.
    680      * @param offset16 The UTF16 index to the codepoint in source
    681      * @return string value of char32 in UTF16 format
    682      */
    683     public static String valueOf(StringBuffer source, int offset16) {
    684         switch (bounds(source, offset16)) {
    685         case LEAD_SURROGATE_BOUNDARY:
    686             return source.substring(offset16, offset16 + 2);
    687         case TRAIL_SURROGATE_BOUNDARY:
    688             return source.substring(offset16 - 1, offset16 + 1);
    689         default:
    690             return source.substring(offset16, offset16 + 1);
    691         }
    692     }
    693 
    694     /**
    695      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
    696      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
    697      * returned, except when either the leading or trailing surrogate character lies out of the
    698      * specified subarray. In the latter case, only the surrogate character within bounds will be
    699      * returned. If a validity check is required, use
    700      * {@link android.icu.lang.UCharacter#isLegal(int)} on the codepoint at
    701      * offset16 before calling. The result returned will be a newly created String containing the
    702      * relevant characters.
    703      *
    704      * @param source The input char array.
    705      * @param start Start index of the subarray
    706      * @param limit End index of the subarray
    707      * @param offset16 The UTF16 index to the codepoint in source relative to start
    708      * @return string value of char32 in UTF16 format
    709      */
    710     public static String valueOf(char source[], int start, int limit, int offset16) {
    711         switch (bounds(source, start, limit, offset16)) {
    712         case LEAD_SURROGATE_BOUNDARY:
    713             return new String(source, start + offset16, 2);
    714         case TRAIL_SURROGATE_BOUNDARY:
    715             return new String(source, start + offset16 - 1, 2);
    716         }
    717         return new String(source, start + offset16, 1);
    718     }
    719 
    720     /**
    721      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
    722      * the {@link UTF16 class description} for notes on roundtripping.
    723      *
    724      * @param source The UTF-16 string
    725      * @param offset32 UTF-32 offset
    726      * @return UTF-16 offset
    727      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
    728      */
    729     public static int findOffsetFromCodePoint(String source, int offset32) {
    730         char ch;
    731         int size = source.length(), result = 0, count = offset32;
    732         if (offset32 < 0 || offset32 > size) {
    733             throw new StringIndexOutOfBoundsException(offset32);
    734         }
    735         while (result < size && count > 0) {
    736             ch = source.charAt(result);
    737             if (isLeadSurrogate(ch) && ((result + 1) < size)
    738                     && isTrailSurrogate(source.charAt(result + 1))) {
    739                 result++;
    740             }
    741 
    742             count--;
    743             result++;
    744         }
    745         if (count != 0) {
    746             throw new StringIndexOutOfBoundsException(offset32);
    747         }
    748         return result;
    749     }
    750 
    751     /**
    752      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
    753      * the {@link UTF16 class description} for notes on roundtripping.
    754      *
    755      * @param source The UTF-16 string buffer
    756      * @param offset32 UTF-32 offset
    757      * @return UTF-16 offset
    758      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
    759      */
    760     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
    761         char ch;
    762         int size = source.length(), result = 0, count = offset32;
    763         if (offset32 < 0 || offset32 > size) {
    764             throw new StringIndexOutOfBoundsException(offset32);
    765         }
    766         while (result < size && count > 0) {
    767             ch = source.charAt(result);
    768             if (isLeadSurrogate(ch) && ((result + 1) < size)
    769                     && isTrailSurrogate(source.charAt(result + 1))) {
    770                 result++;
    771             }
    772 
    773             count--;
    774             result++;
    775         }
    776         if (count != 0) {
    777             throw new StringIndexOutOfBoundsException(offset32);
    778         }
    779         return result;
    780     }
    781 
    782     /**
    783      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
    784      * the {@link UTF16 class description} for notes on roundtripping.
    785      *
    786      * @param source The UTF-16 char array whose substring is to be analysed
    787      * @param start Offset of the substring to be analysed
    788      * @param limit Offset of the substring to be analysed
    789      * @param offset32 UTF-32 offset relative to start
    790      * @return UTF-16 offset relative to start
    791      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
    792      */
    793     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
    794         char ch;
    795         int result = start, count = offset32;
    796         if (offset32 > limit - start) {
    797             throw new ArrayIndexOutOfBoundsException(offset32);
    798         }
    799         while (result < limit && count > 0) {
    800             ch = source[result];
    801             if (isLeadSurrogate(ch) && ((result + 1) < limit)
    802                     && isTrailSurrogate(source[result + 1])) {
    803                 result++;
    804             }
    805 
    806             count--;
    807             result++;
    808         }
    809         if (count != 0) {
    810             throw new ArrayIndexOutOfBoundsException(offset32);
    811         }
    812         return result - start;
    813     }
    814 
    815     /**
    816      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
    817      * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for
    818      * notes on roundtripping.<br>
    819      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
    820      * of the <strong>lead</strong> of the pair is returned. </i>
    821      * <p>
    822      * To find the UTF-32 length of a string, use:
    823      *
    824      * <pre>
     * len32 = countCodePoint(source);
    826      * </pre>
    827      *
    828      * @param source Text to analyse
    829      * @param offset16 UTF-16 offset &lt; source text length.
    830      * @return UTF-32 offset
    831      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
    832      */
    833     public static int findCodePointOffset(String source, int offset16) {
    834         if (offset16 < 0 || offset16 > source.length()) {
    835             throw new StringIndexOutOfBoundsException(offset16);
    836         }
    837 
    838         int result = 0;
    839         char ch;
    840         boolean hadLeadSurrogate = false;
    841 
    842         for (int i = 0; i < offset16; ++i) {
    843             ch = source.charAt(i);
    844             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
    845                 hadLeadSurrogate = false; // count valid trail as zero
    846             } else {
    847                 hadLeadSurrogate = isLeadSurrogate(ch);
    848                 ++result; // count others as 1
    849             }
    850         }
    851 
    852         if (offset16 == source.length()) {
    853             return result;
    854         }
    855 
    856         // end of source being the less significant surrogate character
    857         // shift result back to the start of the supplementary character
    858         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
    859             result--;
    860         }
    861 
    862         return result;
    863     }
    864 
    865     /**
    866      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
    867      * offset. Used for random access. See the {@link UTF16 class description} for notes on
    868      * roundtripping.<br>
    869      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
    870      * of the <strong>lead</strong> of the pair is returned. </i>
    871      * <p>
    872      * To find the UTF-32 length of a string, use:
    873      *
    874      * <pre>
    875      * len32 = countCodePoint(source);
    876      * </pre>
    877      *
    878      * @param source Text to analyse
    879      * @param offset16 UTF-16 offset &lt; source text length.
    880      * @return UTF-32 offset
    881      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
    882      */
    883     public static int findCodePointOffset(StringBuffer source, int offset16) {
    884         if (offset16 < 0 || offset16 > source.length()) {
    885             throw new StringIndexOutOfBoundsException(offset16);
    886         }
    887 
    888         int result = 0;
    889         char ch;
    890         boolean hadLeadSurrogate = false;
    891 
    892         for (int i = 0; i < offset16; ++i) {
    893             ch = source.charAt(i);
    894             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
    895                 hadLeadSurrogate = false; // count valid trail as zero
    896             } else {
    897                 hadLeadSurrogate = isLeadSurrogate(ch);
    898                 ++result; // count others as 1
    899             }
    900         }
    901 
    902         if (offset16 == source.length()) {
    903             return result;
    904         }
    905 
    906         // end of source being the less significant surrogate character
    907         // shift result back to the start of the supplementary character
    908         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
    909             result--;
    910         }
    911 
    912         return result;
    913     }
    914 
    915     /**
    916      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
    917      * offset. Used for random access. See the {@link UTF16 class description} for notes on
    918      * roundtripping.<br>
    919      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
    920      * of the <strong>lead</strong> of the pair is returned. </i>
    921      * <p>
    922      * To find the UTF-32 length of a substring, use:
    923      *
    924      * <pre>
    925      * len32 = countCodePoint(source, start, limit);
    926      * </pre>
    927      *
    928      * @param source Text to analyse
    929      * @param start Offset of the substring
    930      * @param limit Offset of the substring
    931      * @param offset16 UTF-16 relative to start
    932      * @return UTF-32 offset relative to start
    933      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
    934      */
    935     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
    936         offset16 += start;
    937         if (offset16 > limit) {
    938             throw new StringIndexOutOfBoundsException(offset16);
    939         }
    940 
    941         int result = 0;
    942         char ch;
    943         boolean hadLeadSurrogate = false;
    944 
    945         for (int i = start; i < offset16; ++i) {
    946             ch = source[i];
    947             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
    948                 hadLeadSurrogate = false; // count valid trail as zero
    949             } else {
    950                 hadLeadSurrogate = isLeadSurrogate(ch);
    951                 ++result; // count others as 1
    952             }
    953         }
    954 
    955         if (offset16 == limit) {
    956             return result;
    957         }
    958 
    959         // end of source being the less significant surrogate character
    960         // shift result back to the start of the supplementary character
    961         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
    962             result--;
    963         }
    964 
    965         return result;
    966     }
    967 
    968     /**
    969      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
    970      * use {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before
    971      * calling.
    972      *
    973      * @param target The buffer to append to
    974      * @param char32 Value to append.
    975      * @return the updated StringBuffer
    976      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
    977      */
    978     public static StringBuffer append(StringBuffer target, int char32) {
    979         // Check for irregular values
    980         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
    981             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
    982         }
    983 
    984         // Write the UTF-16 values
    985         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
    986             target.append(getLeadSurrogate(char32));
    987             target.append(getTrailSurrogate(char32));
    988         } else {
    989             target.append((char) char32);
    990         }
    991         return target;
    992     }
    993 
    994     /**
    995      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
    996      * convenience.
    997      *
    998      * @param target The buffer to append to
    999      * @param cp The code point to append
   1000      * @return the updated StringBuffer
   1001      * @throws IllegalArgumentException If cp is not a valid code point
   1002      */
   1003     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
   1004         return append(target, cp);
   1005     }
   1006 
   1007     /**
   1008      * Adds a codepoint to offset16 position of the argument char array.
   1009      *
     * @param target Char array to which the new code point is appended
   1011      * @param limit UTF16 offset which the codepoint will be appended.
   1012      * @param char32 Code point to be appended
   1013      * @return offset after char32 in the array.
   1014      * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
   1015      *                lie within the range of the Unicode codepoints.
   1016      */
   1017     public static int append(char[] target, int limit, int char32) {
   1018         // Check for irregular values
   1019         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
   1020             throw new IllegalArgumentException("Illegal codepoint");
   1021         }
   1022         // Write the UTF-16 values
   1023         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
   1024             target[limit++] = getLeadSurrogate(char32);
   1025             target[limit++] = getTrailSurrogate(char32);
   1026         } else {
   1027             target[limit++] = (char) char32;
   1028         }
   1029         return limit;
   1030     }
   1031 
   1032     /**
   1033      * Number of codepoints in a UTF16 String
   1034      *
   1035      * @param source UTF16 string
   1036      * @return number of codepoint in string
   1037      */
   1038     public static int countCodePoint(String source) {
   1039         if (source == null || source.length() == 0) {
   1040             return 0;
   1041         }
   1042         return findCodePointOffset(source, source.length());
   1043     }
   1044 
   1045     /**
   1046      * Number of codepoints in a UTF16 String buffer
   1047      *
   1048      * @param source UTF16 string buffer
   1049      * @return number of codepoint in string
   1050      */
   1051     public static int countCodePoint(StringBuffer source) {
   1052         if (source == null || source.length() == 0) {
   1053             return 0;
   1054         }
   1055         return findCodePointOffset(source, source.length());
   1056     }
   1057 
   1058     /**
   1059      * Number of codepoints in a UTF16 char array substring
   1060      *
   1061      * @param source UTF16 char array
   1062      * @param start Offset of the substring
   1063      * @param limit Offset of the substring
   1064      * @return number of codepoint in the substring
   1065      * @exception IndexOutOfBoundsException If start and limit are not valid.
   1066      */
   1067     public static int countCodePoint(char source[], int start, int limit) {
   1068         if (source == null || source.length == 0) {
   1069             return 0;
   1070         }
   1071         return findCodePointOffset(source, start, limit, limit - start);
   1072     }
   1073 
   1074     /**
     * Set a code point into a UTF16 position. Adjusts target accordingly if we are replacing a
     * non-supplementary codepoint with a supplementary and vice versa.
   1077      *
   1078      * @param target Stringbuffer
   1079      * @param offset16 UTF16 position to insert into
   1080      * @param char32 Code point
   1081      */
   1082     public static void setCharAt(StringBuffer target, int offset16, int char32) {
   1083         int count = 1;
   1084         char single = target.charAt(offset16);
   1085 
   1086         if (isSurrogate(single)) {
   1087             // pairs of the surrogate with offset16 at the lead char found
   1088             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
   1089                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
   1090                 count++;
   1091             } else {
   1092                 // pairs of the surrogate with offset16 at the trail char
   1093                 // found
   1094                 if (isTrailSurrogate(single) && (offset16 > 0)
   1095                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
   1096                     offset16--;
   1097                     count++;
   1098                 }
   1099             }
   1100         }
   1101         target.replace(offset16, offset16 + count, valueOf(char32));
   1102     }
   1103 
   1104     /**
     * Set a code point into a UTF16 position in a char array. Adjusts target accordingly if we are
     * replacing a non-supplementary codepoint with a supplementary and vice versa.
   1107      *
   1108      * @param target char array
   1109      * @param limit numbers of valid chars in target, different from target.length. limit counts the
   1110      *            number of chars in target that represents a string, not the size of array target.
   1111      * @param offset16 UTF16 position to insert into
   1112      * @param char32 code point
   1113      * @return new number of chars in target that represents a string
   1114      * @exception IndexOutOfBoundsException if offset16 is out of range
   1115      */
   1116     public static int setCharAt(char target[], int limit, int offset16, int char32) {
   1117         if (offset16 >= limit) {
   1118             throw new ArrayIndexOutOfBoundsException(offset16);
   1119         }
   1120         int count = 1;
   1121         char single = target[offset16];
   1122 
   1123         if (isSurrogate(single)) {
   1124             // pairs of the surrogate with offset16 at the lead char found
   1125             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
   1126                     && isTrailSurrogate(target[offset16 + 1])) {
   1127                 count++;
   1128             } else {
   1129                 // pairs of the surrogate with offset16 at the trail char
   1130                 // found
   1131                 if (isTrailSurrogate(single) && (offset16 > 0)
   1132                         && isLeadSurrogate(target[offset16 - 1])) {
   1133                     offset16--;
   1134                     count++;
   1135                 }
   1136             }
   1137         }
   1138 
   1139         String str = valueOf(char32);
   1140         int result = limit;
   1141         int strlength = str.length();
   1142         target[offset16] = str.charAt(0);
   1143         if (count == strlength) {
   1144             if (count == 2) {
   1145                 target[offset16 + 1] = str.charAt(1);
   1146             }
   1147         } else {
   1148             // this is not exact match in space, we'll have to do some
   1149             // shifting
   1150             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
   1151                     - (offset16 + count));
   1152             if (count < strlength) {
   1153                 // char32 is a supplementary character trying to squeeze into
   1154                 // a non-supplementary space
   1155                 target[offset16 + 1] = str.charAt(1);
   1156                 result++;
   1157                 if (result < target.length) {
   1158                     target[result] = 0;
   1159                 }
   1160             } else {
   1161                 // char32 is a non-supplementary character trying to fill
   1162                 // into a supplementary space
   1163                 result--;
   1164                 target[result] = 0;
   1165             }
   1166         }
   1167         return result;
   1168     }
   1169 
   1170     /**
   1171      * Shifts offset16 by the argument number of codepoints
   1172      *
   1173      * @param source string
   1174      * @param offset16 UTF16 position to shift
   1175      * @param shift32 number of codepoints to shift
   1176      * @return new shifted offset16
   1177      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
   1178      */
   1179     public static int moveCodePointOffset(String source, int offset16, int shift32) {
   1180         int result = offset16;
   1181         int size = source.length();
   1182         int count;
   1183         char ch;
   1184         if (offset16 < 0 || offset16 > size) {
   1185             throw new StringIndexOutOfBoundsException(offset16);
   1186         }
   1187         if (shift32 > 0) {
   1188             if (shift32 + offset16 > size) {
   1189                 throw new StringIndexOutOfBoundsException(offset16);
   1190             }
   1191             count = shift32;
   1192             while (result < size && count > 0) {
   1193                 ch = source.charAt(result);
   1194                 if (isLeadSurrogate(ch) && ((result + 1) < size)
   1195                         && isTrailSurrogate(source.charAt(result + 1))) {
   1196                     result++;
   1197                 }
   1198                 count--;
   1199                 result++;
   1200             }
   1201         } else {
   1202             if (offset16 + shift32 < 0) {
   1203                 throw new StringIndexOutOfBoundsException(offset16);
   1204             }
   1205             for (count = -shift32; count > 0; count--) {
   1206                 result--;
   1207                 if (result < 0) {
   1208                     break;
   1209                 }
   1210                 ch = source.charAt(result);
   1211                 if (isTrailSurrogate(ch) && result > 0
   1212                         && isLeadSurrogate(source.charAt(result - 1))) {
   1213                     result--;
   1214                 }
   1215             }
   1216         }
   1217         if (count != 0) {
   1218             throw new StringIndexOutOfBoundsException(shift32);
   1219         }
   1220         return result;
   1221     }
   1222 
   1223     /**
   1224      * Shifts offset16 by the argument number of codepoints
   1225      *
   1226      * @param source String buffer
   1227      * @param offset16 UTF16 position to shift
   1228      * @param shift32 Number of codepoints to shift
   1229      * @return new shifted offset16
   1230      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
   1231      */
   1232     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
   1233         int result = offset16;
   1234         int size = source.length();
   1235         int count;
   1236         char ch;
   1237         if (offset16 < 0 || offset16 > size) {
   1238             throw new StringIndexOutOfBoundsException(offset16);
   1239         }
   1240         if (shift32 > 0) {
   1241             if (shift32 + offset16 > size) {
   1242                 throw new StringIndexOutOfBoundsException(offset16);
   1243             }
   1244             count = shift32;
   1245             while (result < size && count > 0) {
   1246                 ch = source.charAt(result);
   1247                 if (isLeadSurrogate(ch) && ((result + 1) < size)
   1248                         && isTrailSurrogate(source.charAt(result + 1))) {
   1249                     result++;
   1250                 }
   1251                 count--;
   1252                 result++;
   1253             }
   1254         } else {
   1255             if (offset16 + shift32 < 0) {
   1256                 throw new StringIndexOutOfBoundsException(offset16);
   1257             }
   1258             for (count = -shift32; count > 0; count--) {
   1259                 result--;
   1260                 if (result < 0) {
   1261                     break;
   1262                 }
   1263                 ch = source.charAt(result);
   1264                 if (isTrailSurrogate(ch) && result > 0
   1265                         && isLeadSurrogate(source.charAt(result - 1))) {
   1266                     result--;
   1267                 }
   1268             }
   1269         }
   1270         if (count != 0) {
   1271             throw new StringIndexOutOfBoundsException(shift32);
   1272         }
   1273         return result;
   1274     }
   1275 
   1276     /**
   1277      * Shifts offset16 by the argument number of codepoints within a subarray.
   1278      *
   1279      * @param source Char array
   1280      * @param start Position of the subarray to be performed on
   1281      * @param limit Position of the subarray to be performed on
   1282      * @param offset16 UTF16 position to shift relative to start
   1283      * @param shift32 Number of codepoints to shift
   1284      * @return new shifted offset16 relative to start
   1285      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
   1286      *                subarray bounds are out of range.
   1287      */
   1288     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
   1289             int shift32) {
   1290         int size = source.length;
   1291         int count;
   1292         char ch;
   1293         int result = offset16 + start;
   1294         if (start < 0 || limit < start) {
   1295             throw new StringIndexOutOfBoundsException(start);
   1296         }
   1297         if (limit > size) {
   1298             throw new StringIndexOutOfBoundsException(limit);
   1299         }
   1300         if (offset16 < 0 || result > limit) {
   1301             throw new StringIndexOutOfBoundsException(offset16);
   1302         }
   1303         if (shift32 > 0) {
   1304             if (shift32 + result > size) {
   1305                 throw new StringIndexOutOfBoundsException(result);
   1306             }
   1307             count = shift32;
   1308             while (result < limit && count > 0) {
   1309                 ch = source[result];
   1310                 if (isLeadSurrogate(ch) && (result + 1 < limit)
   1311                         && isTrailSurrogate(source[result + 1])) {
   1312                     result++;
   1313                 }
   1314                 count--;
   1315                 result++;
   1316             }
   1317         } else {
   1318             if (result + shift32 < start) {
   1319                 throw new StringIndexOutOfBoundsException(result);
   1320             }
   1321             for (count = -shift32; count > 0; count--) {
   1322                 result--;
   1323                 if (result < start) {
   1324                     break;
   1325                 }
   1326                 ch = source[result];
   1327                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
   1328                     result--;
   1329                 }
   1330             }
   1331         }
   1332         if (count != 0) {
   1333             throw new StringIndexOutOfBoundsException(shift32);
   1334         }
   1335         result -= start;
   1336         return result;
   1337     }
   1338 
   1339     /**
   1340      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
   1341      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
   1342      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
   1343      * otherwise.
   1344      * <p>
   1345      * The overall effect is exactly as if the argument were converted to a string by the method
   1346      * valueOf(char) and the characters in that string were then inserted into target at the
   1347      * position indicated by offset16.
   1348      * </p>
   1349      * <p>
   1350      * The offset argument must be greater than or equal to 0, and less than or equal to the length
   1351      * of source.
   1352      *
   1353      * @param target String buffer to insert to
   1354      * @param offset16 Offset which char32 will be inserted in
   1355      * @param char32 Codepoint to be inserted
   1356      * @return a reference to target
   1357      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
   1358      */
   1359     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
   1360         String str = valueOf(char32);
   1361         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
   1362             offset16++;
   1363         }
   1364         target.insert(offset16, str);
   1365         return target;
   1366     }
   1367 
   1368     /**
   1369      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
   1370      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
   1371      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
   1372      * <p>
   1373      * The overall effect is exactly as if the argument were converted to a string by the method
   1374      * valueOf(char) and the characters in that string were then inserted into target at the
   1375      * position indicated by offset16.
   1376      * </p>
   1377      * <p>
   1378      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
   1379      *
   1380      * @param target Char array to insert to
   1381      * @param limit End index of the char array, limit &lt;= target.length
   1382      * @param offset16 Offset which char32 will be inserted in
   1383      * @param char32 Codepoint to be inserted
   1384      * @return new limit size
   1385      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
   1386      */
   1387     public static int insert(char target[], int limit, int offset16, int char32) {
   1388         String str = valueOf(char32);
   1389         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
   1390             offset16++;
   1391         }
   1392         int size = str.length();
   1393         if (limit + size > target.length) {
   1394             throw new ArrayIndexOutOfBoundsException(offset16 + size);
   1395         }
   1396         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
   1397         target[offset16] = str.charAt(0);
   1398         if (size == 2) {
   1399             target[offset16 + 1] = str.charAt(1);
   1400         }
   1401         return limit + size;
   1402     }
   1403 
   1404     /**
   1405      * Removes the codepoint at the specified position in this target (shortening target by 1
   1406      * character if the codepoint is a non-supplementary, 2 otherwise).
   1407      *
   1408      * @param target String buffer to remove codepoint from
   1409      * @param offset16 Offset which the codepoint will be removed
   1410      * @return a reference to target
   1411      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
   1412      */
   1413     public static StringBuffer delete(StringBuffer target, int offset16) {
   1414         int count = 1;
   1415         switch (bounds(target, offset16)) {
   1416         case LEAD_SURROGATE_BOUNDARY:
   1417             count++;
   1418             break;
   1419         case TRAIL_SURROGATE_BOUNDARY:
   1420             count++;
   1421             offset16--;
   1422             break;
   1423         }
   1424         target.delete(offset16, offset16 + count);
   1425         return target;
   1426     }
   1427 
   1428     /**
   1429      * Removes the codepoint at the specified position in this target (shortening target by 1
   1430      * character if the codepoint is a non-supplementary, 2 otherwise).
   1431      *
   1432      * @param target String buffer to remove codepoint from
   1433      * @param limit End index of the char array, limit &lt;= target.length
   1434      * @param offset16 Offset which the codepoint will be removed
   1435      * @return a new limit size
   1436      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
   1437      */
   1438     public static int delete(char target[], int limit, int offset16) {
   1439         int count = 1;
   1440         switch (bounds(target, 0, limit, offset16)) {
   1441         case LEAD_SURROGATE_BOUNDARY:
   1442             count++;
   1443             break;
   1444         case TRAIL_SURROGATE_BOUNDARY:
   1445             count++;
   1446             offset16--;
   1447             break;
   1448         }
   1449         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
   1450         target[limit - count] = 0;
   1451         return limit - count;
   1452     }
   1453 
   1454     /**
   1455      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
   1456      * the argument codepoint. I.e., the smallest index <code>i</code> such that
   1457      * <code>UTF16.charAt(source, i) ==
   1458      * char32</code> is true.
   1459      * <p>
   1460      * If no such character occurs in this string, then -1 is returned.
   1461      * </p>
   1462      * <p>
   1463      * Examples:<br>
   1464      * UTF16.indexOf("abc", 'a') returns 0<br>
   1465      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
   1466      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
   1467      * </p>
   1468      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1469      * characters to its fullest.
   1470      *
   1471      * @param source UTF16 format Unicode string that will be searched
   1472      * @param char32 Codepoint to search for
   1473      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
   1474      *         -1 if the codepoint does not occur.
   1475      */
   1476     public static int indexOf(String source, int char32) {
   1477         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
   1478             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
   1479         }
   1480         // non-surrogate bmp
   1481         if (char32 < LEAD_SURROGATE_MIN_VALUE
   1482                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
   1483             return source.indexOf((char) char32);
   1484         }
   1485         // surrogate
   1486         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
   1487             int result = source.indexOf((char) char32);
   1488             if (result >= 0) {
   1489                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
   1490                         && isTrailSurrogate(source.charAt(result + 1))) {
   1491                     return indexOf(source, char32, result + 1);
   1492                 }
   1493                 // trail surrogate
   1494                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
   1495                     return indexOf(source, char32, result + 1);
   1496                 }
   1497             }
   1498             return result;
   1499         }
   1500         // supplementary
   1501         String char32str = toString(char32);
   1502         return source.indexOf(char32str);
   1503     }
   1504 
   1505     /**
   1506      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
   1507      * the argument string str. This method is implemented based on codepoints, hence a "lead
   1508      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
   1509      * starts with trail surrogate character at index 0, a source with a leading a surrogate
   1510      * character before str found at in source will not have a valid match. Vice versa for lead
   1511      * surrogates that ends str. See example below.
   1512      * <p>
   1513      * If no such string str occurs in this source, then -1 is returned.
   1514      * </p>
   1515      * <p>
   1516      * Examples:<br>
   1517      * UTF16.indexOf("abc", "ab") returns 0<br>
   1518      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
   1519      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
   1520      * </p>
   1521      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1522      * characters to its fullest.
   1523      *
   1524      * @param source UTF16 format Unicode string that will be searched
   1525      * @param str UTF16 format Unicode string to search for
   1526      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
   1527      *         -1 if the codepoint does not occur.
   1528      */
   1529     public static int indexOf(String source, String str) {
   1530         int strLength = str.length();
   1531         // non-surrogate ends
   1532         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
   1533             return source.indexOf(str);
   1534         }
   1535 
   1536         int result = source.indexOf(str);
   1537         int resultEnd = result + strLength;
   1538         if (result >= 0) {
   1539             // check last character
   1540             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
   1541                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
   1542                 return indexOf(source, str, resultEnd + 1);
   1543             }
   1544             // check first character which is a trail surrogate
   1545             if (isTrailSurrogate(str.charAt(0)) && result > 0
   1546                     && isLeadSurrogate(source.charAt(result - 1))) {
   1547                 return indexOf(source, str, resultEnd + 1);
   1548             }
   1549         }
   1550         return result;
   1551     }
   1552 
   1553     /**
   1554      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
   1555      * the argument codepoint. I.e., the smallest index i such that: <br>
   1556      * (UTF16.charAt(source, i) == char32 &amp;&amp; i &gt;= fromIndex) is true.
   1557      * <p>
   1558      * If no such character occurs in this string, then -1 is returned.
   1559      * </p>
   1560      * <p>
   1561      * Examples:<br>
   1562      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
   1563      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
   1564      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
   1565      * </p>
   1566      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1567      * characters to its fullest.
   1568      *
   1569      * @param source UTF16 format Unicode string that will be searched
   1570      * @param char32 Codepoint to search for
   1571      * @param fromIndex The index to start the search from.
   1572      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
   1573      *         or after fromIndex, or -1 if the codepoint does not occur.
   1574      */
   1575     public static int indexOf(String source, int char32, int fromIndex) {
   1576         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
   1577             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
   1578         }
   1579         // non-surrogate bmp
   1580         if (char32 < LEAD_SURROGATE_MIN_VALUE
   1581                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
   1582             return source.indexOf((char) char32, fromIndex);
   1583         }
   1584         // surrogate
   1585         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
   1586             int result = source.indexOf((char) char32, fromIndex);
   1587             if (result >= 0) {
   1588                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
   1589                         && isTrailSurrogate(source.charAt(result + 1))) {
   1590                     return indexOf(source, char32, result + 1);
   1591                 }
   1592                 // trail surrogate
   1593                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
   1594                     return indexOf(source, char32, result + 1);
   1595                 }
   1596             }
   1597             return result;
   1598         }
   1599         // supplementary
   1600         String char32str = toString(char32);
   1601         return source.indexOf(char32str, fromIndex);
   1602     }
   1603 
   1604     /**
   1605      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
   1606      * the argument string str. This method is implemented based on codepoints, hence a "lead
   1607      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
   1608      * starts with trail surrogate character at index 0, a source with a leading a surrogate
   1609      * character before str found at in source will not have a valid match. Vice versa for lead
   1610      * surrogates that ends str. See example below.
   1611      * <p>
   1612      * If no such string str occurs in this source, then -1 is returned.
   1613      * </p>
   1614      * <p>
   1615      * Examples:<br>
   1616      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
   1617      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
   1618      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
   1619      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
   1620      * </p>
   1621      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1622      * characters to its fullest.
   1623      *
   1624      * @param source UTF16 format Unicode string that will be searched
   1625      * @param str UTF16 format Unicode string to search for
   1626      * @param fromIndex The index to start the search from.
   1627      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
   1628      *         -1 if the codepoint does not occur.
   1629      */
   1630     public static int indexOf(String source, String str, int fromIndex) {
   1631         int strLength = str.length();
   1632         // non-surrogate ends
   1633         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
   1634             return source.indexOf(str, fromIndex);
   1635         }
   1636 
   1637         int result = source.indexOf(str, fromIndex);
   1638         int resultEnd = result + strLength;
   1639         if (result >= 0) {
   1640             // check last character
   1641             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
   1642                     && isTrailSurrogate(source.charAt(resultEnd))) {
   1643                 return indexOf(source, str, resultEnd + 1);
   1644             }
   1645             // check first character which is a trail surrogate
   1646             if (isTrailSurrogate(str.charAt(0)) && result > 0
   1647                     && isLeadSurrogate(source.charAt(result - 1))) {
   1648                 return indexOf(source, str, resultEnd + 1);
   1649             }
   1650         }
   1651         return result;
   1652     }
   1653 
   1654     /**
   1655      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
   1656      * the argument codepoint. I.e., the index returned is the largest value i such that:
   1657      * UTF16.charAt(source, i) == char32 is true.
   1658      * <p>
   1659      * Examples:<br>
   1660      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
   1661      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
   1662      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
   1663      * </p>
   1664      * <p>
   1665      * source is searched backwards starting at the last character.
   1666      * </p>
   1667      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1668      * characters to its fullest.
   1669      *
   1670      * @param source UTF16 format Unicode string that will be searched
   1671      * @param char32 Codepoint to search for
   1672      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
   1673      *         does not occur.
   1674      */
   1675     public static int lastIndexOf(String source, int char32) {
   1676         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
   1677             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
   1678         }
   1679         // non-surrogate bmp
   1680         if (char32 < LEAD_SURROGATE_MIN_VALUE
   1681                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
   1682             return source.lastIndexOf((char) char32);
   1683         }
   1684         // surrogate
   1685         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
   1686             int result = source.lastIndexOf((char) char32);
   1687             if (result >= 0) {
   1688                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
   1689                         && isTrailSurrogate(source.charAt(result + 1))) {
   1690                     return lastIndexOf(source, char32, result - 1);
   1691                 }
   1692                 // trail surrogate
   1693                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
   1694                     return lastIndexOf(source, char32, result - 1);
   1695                 }
   1696             }
   1697             return result;
   1698         }
   1699         // supplementary
   1700         String char32str = toString(char32);
   1701         return source.lastIndexOf(char32str);
   1702     }
   1703 
   1704     /**
   1705      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
   1706      * the argument string str. This method is implemented based on codepoints, hence a "lead
   1707      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
   1708      * starts with trail surrogate character at index 0, a source with a leading a surrogate
   1709      * character before str found at in source will not have a valid match. Vice versa for lead
   1710      * surrogates that ends str. See example below.
   1711      * <p>
   1712      * Examples:<br>
   1713      * UTF16.lastIndexOf("abc", "a") returns 0<br>
   1714      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
   1715      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
   1716      * </p>
   1717      * <p>
   1718      * source is searched backwards starting at the last character.
   1719      * </p>
   1720      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1721      * characters to its fullest.
   1722      *
   1723      * @param source UTF16 format Unicode string that will be searched
   1724      * @param str UTF16 format Unicode string to search for
   1725      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
   1726      *         does not occur.
   1727      */
   1728     public static int lastIndexOf(String source, String str) {
   1729         int strLength = str.length();
   1730         // non-surrogate ends
   1731         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
   1732             return source.lastIndexOf(str);
   1733         }
   1734 
   1735         int result = source.lastIndexOf(str);
   1736         if (result >= 0) {
   1737             // check last character
   1738             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
   1739                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
   1740                 return lastIndexOf(source, str, result - 1);
   1741             }
   1742             // check first character which is a trail surrogate
   1743             if (isTrailSurrogate(str.charAt(0)) && result > 0
   1744                     && isLeadSurrogate(source.charAt(result - 1))) {
   1745                 return lastIndexOf(source, str, result - 1);
   1746             }
   1747         }
   1748         return result;
   1749     }
   1750 
   1751     /**
   1752      * <p>
   1753      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
   1754      * the argument codepoint, where the result is less than or equals to fromIndex.
   1755      * </p>
   1756      * <p>
   1757      * This method is implemented based on codepoints, hence a single surrogate character will not
   1758      * match a supplementary character.
   1759      * </p>
   1760      * <p>
   1761      * source is searched backwards starting at the last character starting at the specified index.
   1762      * </p>
   1763      * <p>
   1764      * Examples:<br>
   1765      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
   1766      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
   1767      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
   1768      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
   1769      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
   1770      * </p>
   1771      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1772      * characters to its fullest.
   1773      *
   1774      * @param source UTF16 format Unicode string that will be searched
   1775      * @param char32 Codepoint to search for
   1776      * @param fromIndex the index to start the search from. There is no restriction on the value of
   1777      *            fromIndex. If it is greater than or equal to the length of this string, it has the
   1778      *            same effect as if it were equal to one less than the length of this string: this
   1779      *            entire string may be searched. If it is negative, it has the same effect as if it
   1780      *            were -1: -1 is returned.
   1781      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
   1782      *         does not occur.
   1783      */
   1784     public static int lastIndexOf(String source, int char32, int fromIndex) {
   1785         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
   1786             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
   1787         }
   1788         // non-surrogate bmp
   1789         if (char32 < LEAD_SURROGATE_MIN_VALUE
   1790                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
   1791             return source.lastIndexOf((char) char32, fromIndex);
   1792         }
   1793         // surrogate
   1794         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
   1795             int result = source.lastIndexOf((char) char32, fromIndex);
   1796             if (result >= 0) {
   1797                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
   1798                         && isTrailSurrogate(source.charAt(result + 1))) {
   1799                     return lastIndexOf(source, char32, result - 1);
   1800                 }
   1801                 // trail surrogate
   1802                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
   1803                     return lastIndexOf(source, char32, result - 1);
   1804                 }
   1805             }
   1806             return result;
   1807         }
   1808         // supplementary
   1809         String char32str = toString(char32);
   1810         return source.lastIndexOf(char32str, fromIndex);
   1811     }
   1812 
   1813     /**
   1814      * <p>
   1815      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
   1816      * the argument string str, where the result is less than or equals to fromIndex.
   1817      * </p>
   1818      * <p>
   1819      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
   1820      * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
   1821      * character at index 0, a source with a leading a surrogate character before str found at in
   1822      * source will not have a valid match. Vice versa for lead surrogates that ends str.
   1823      * </p>
   1824      * See example below.
   1825      * <p>
   1826      * Examples:<br>
   1827      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
   1828      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
   1829      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
   1830      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
   1831      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
   1832      * </p>
   1833      * <p>
   1834      * source is searched backwards starting at the last character.
   1835      * </p>
   1836      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1837      * characters to its fullest.
   1838      *
   1839      * @param source UTF16 format Unicode string that will be searched
   1840      * @param str UTF16 format Unicode string to search for
   1841      * @param fromIndex the index to start the search from. There is no restriction on the value of
   1842      *            fromIndex. If it is greater than or equal to the length of this string, it has the
   1843      *            same effect as if it were equal to one less than the length of this string: this
   1844      *            entire string may be searched. If it is negative, it has the same effect as if it
   1845      *            were -1: -1 is returned.
   1846      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
   1847      *         does not occur.
   1848      */
   1849     public static int lastIndexOf(String source, String str, int fromIndex) {
   1850         int strLength = str.length();
   1851         // non-surrogate ends
   1852         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
   1853             return source.lastIndexOf(str, fromIndex);
   1854         }
   1855 
   1856         int result = source.lastIndexOf(str, fromIndex);
   1857         if (result >= 0) {
   1858             // check last character
   1859             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
   1860                     && isTrailSurrogate(source.charAt(result + strLength))) {
   1861                 return lastIndexOf(source, str, result - 1);
   1862             }
   1863             // check first character which is a trail surrogate
   1864             if (isTrailSurrogate(str.charAt(0)) && result > 0
   1865                     && isLeadSurrogate(source.charAt(result - 1))) {
   1866                 return lastIndexOf(source, str, result - 1);
   1867             }
   1868         }
   1869         return result;
   1870     }
   1871 
   1872     /**
   1873      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
   1874      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
   1875      * format Unicode string source, then source will be returned. Otherwise, a new String object is
   1876      * created that represents a codepoint sequence identical to the codepoint sequence represented
   1877      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
   1878      * newChar32.
   1879      * <p>
   1880      * Examples: <br>
   1881      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
   1882      * returns "mosquito in your collar"<br>
   1883      * UTF16.replace("JonL", 'q', 'x');<br>
   1884      * returns "JonL" (no change)<br>
   1885      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
   1886      * returns "Supplementary character !"<br>
   1887      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
   1888      * returns "Supplementary character \ud800\udc00"<br>
   1889      * </p>
   1890      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1891      * characters to its fullest.
   1892      *
   1893      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
   1894      * @param oldChar32 Non-zero old codepoint to be replaced.
   1895      * @param newChar32 The new codepoint to replace oldChar32
   1896      * @return new String derived from source by replacing every occurrence of oldChar32 with
   1897      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
   1898      */
   1899     public static String replace(String source, int oldChar32, int newChar32) {
   1900         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
   1901             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
   1902         }
   1903         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
   1904             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
   1905         }
   1906 
   1907         int index = indexOf(source, oldChar32);
   1908         if (index == -1) {
   1909             return source;
   1910         }
   1911         String newChar32Str = toString(newChar32);
   1912         int oldChar32Size = 1;
   1913         int newChar32Size = newChar32Str.length();
   1914         StringBuffer result = new StringBuffer(source);
   1915         int resultIndex = index;
   1916 
   1917         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
   1918             oldChar32Size = 2;
   1919         }
   1920 
   1921         while (index != -1) {
   1922             int endResultIndex = resultIndex + oldChar32Size;
   1923             result.replace(resultIndex, endResultIndex, newChar32Str);
   1924             int lastEndIndex = index + oldChar32Size;
   1925             index = indexOf(source, oldChar32, lastEndIndex);
   1926             resultIndex += newChar32Size + index - lastEndIndex;
   1927         }
   1928         return result.toString();
   1929     }
   1930 
   1931     /**
   1932      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
   1933      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
   1934      * source, then source will be returned. Otherwise, a new String object is created that
   1935      * represents a codepoint sequence identical to the codepoint sequence represented by source,
   1936      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
   1937      * <p>
   1938      * Examples: <br>
   1939      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
   1940      * returns "mosquito in your collar"<br>
   1941      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
   1942      * returns "cat in your cellar"<br>
   1943      * UTF16.replace("JonL", "q", "x");<br>
   1944      * returns "JonL" (no change)<br>
   1945      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
   1946      * returns "Supplementary character !"<br>
   1947      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
   1948      * returns "Supplementary character \ud800\udc00"<br>
   1949      * </p>
   1950      * Note this method is provided as support to jdk 1.3, which does not support supplementary
   1951      * characters to its fullest.
   1952      *
   1953      * @param source UTF16 format Unicode string which the replacements will be based on.
   1954      * @param oldStr Non-zero-length string to be replaced.
   1955      * @param newStr The new string to replace oldStr
   1956      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
   1957      *         When no oldStr is found in source, then source will be returned.
   1958      */
   1959     public static String replace(String source, String oldStr, String newStr) {
   1960         int index = indexOf(source, oldStr);
   1961         if (index == -1) {
   1962             return source;
   1963         }
   1964         int oldStrSize = oldStr.length();
   1965         int newStrSize = newStr.length();
   1966         StringBuffer result = new StringBuffer(source);
   1967         int resultIndex = index;
   1968 
   1969         while (index != -1) {
   1970             int endResultIndex = resultIndex + oldStrSize;
   1971             result.replace(resultIndex, endResultIndex, newStr);
   1972             int lastEndIndex = index + oldStrSize;
   1973             index = indexOf(source, oldStr, lastEndIndex);
   1974             resultIndex += newStrSize + index - lastEndIndex;
   1975         }
   1976         return result.toString();
   1977     }
   1978 
   1979     /**
   1980      * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
   1981      * will reverse surrogate characters correctly, instead of blindly reversing every character.
   1982      * <p>
   1983      * Examples:<br>
   1984      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
   1985      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
   1986      *
   1987      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
   1988      * @return a modified source with reversed UTF16 format Unicode string.
   1989      */
   1990     public static StringBuffer reverse(StringBuffer source) {
   1991         int length = source.length();
   1992         StringBuffer result = new StringBuffer(length);
   1993         for (int i = length; i-- > 0;) {
   1994             char ch = source.charAt(i);
   1995             if (isTrailSurrogate(ch) && i > 0) {
   1996                 char ch2 = source.charAt(i - 1);
   1997                 if (isLeadSurrogate(ch2)) {
   1998                     result.append(ch2);
   1999                     result.append(ch);
   2000                     --i;
   2001                     continue;
   2002                 }
   2003             }
   2004             result.append(ch);
   2005         }
   2006         return result;
   2007     }
   2008 
   2009     /**
   2010      * Check if the string contains more Unicode code points than a certain number. This is more
   2011      * efficient than counting all code points in the entire string and comparing that number with a
   2012      * threshold. This function may not need to scan the string at all if the length is within a
   2013      * certain range, and never needs to count more than 'number + 1' code points. Logically
   2014      * equivalent to (countCodePoint(s) &gt; number). A Unicode code point may occupy either one or two
   2015      * code units.
   2016      *
   2017      * @param source The input string.
   2018      * @param number The number of code points in the string is compared against the 'number'
   2019      *            parameter.
   2020      * @return boolean value for whether the string contains more Unicode code points than 'number'.
   2021      */
   2022     public static boolean hasMoreCodePointsThan(String source, int number) {
   2023         if (number < 0) {
   2024             return true;
   2025         }
   2026         if (source == null) {
   2027             return false;
   2028         }
   2029         int length = source.length();
   2030 
   2031         // length >= 0 known
   2032         // source contains at least (length + 1) / 2 code points: <= 2
   2033         // chars per cp
   2034         if (((length + 1) >> 1) > number) {
   2035             return true;
   2036         }
   2037 
   2038         // check if source does not even contain enough chars
   2039         int maxsupplementary = length - number;
   2040         if (maxsupplementary <= 0) {
   2041             return false;
   2042         }
   2043 
   2044         // there are maxsupplementary = length - number more chars than
   2045         // asked-for code points
   2046 
   2047         // count code points until they exceed and also check that there are
   2048         // no more than maxsupplementary supplementary code points (char pairs)
   2049         int start = 0;
   2050         while (true) {
   2051             if (length == 0) {
   2052                 return false;
   2053             }
   2054             if (number == 0) {
   2055                 return true;
   2056             }
   2057             if (isLeadSurrogate(source.charAt(start++)) && start != length
   2058                     && isTrailSurrogate(source.charAt(start))) {
   2059                 start++;
   2060                 if (--maxsupplementary <= 0) {
   2061                     // too many pairs - too few code points
   2062                     return false;
   2063                 }
   2064             }
   2065             --number;
   2066         }
   2067     }
   2068 
   2069     /**
   2070      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
   2071      * code points than a certain number. This is more efficient than counting all code points in
   2072      * the entire char array range and comparing that number with a threshold. This function may not
   2073      * need to scan the char array at all if start and limit is within a certain range, and never
   2074      * needs to count more than 'number + 1' code points. Logically equivalent to
   2075      * (countCodePoint(source, start, limit) &gt; number). A Unicode code point may occupy either one
   2076      * or two code units.
   2077      *
   2078      * @param source Array of UTF-16 chars
   2079      * @param start Offset to substring in the source array for analyzing
   2080      * @param limit Offset to substring in the source array for analyzing
   2081      * @param number The number of code points in the string is compared against the 'number'
   2082      *            parameter.
   2083      * @return boolean value for whether the string contains more Unicode code points than 'number'.
   2084      * @exception IndexOutOfBoundsException Thrown when limit &lt; start
   2085      */
   2086     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
   2087         int length = limit - start;
   2088         if (length < 0 || start < 0 || limit < 0) {
   2089             throw new IndexOutOfBoundsException(
   2090                     "Start and limit indexes should be non-negative and start <= limit");
   2091         }
   2092         if (number < 0) {
   2093             return true;
   2094         }
   2095         if (source == null) {
   2096             return false;
   2097         }
   2098 
   2099         // length >= 0 known
   2100         // source contains at least (length + 1) / 2 code points: <= 2
   2101         // chars per cp
   2102         if (((length + 1) >> 1) > number) {
   2103             return true;
   2104         }
   2105 
   2106         // check if source does not even contain enough chars
   2107         int maxsupplementary = length - number;
   2108         if (maxsupplementary <= 0) {
   2109             return false;
   2110         }
   2111 
   2112         // there are maxsupplementary = length - number more chars than
   2113         // asked-for code points
   2114 
   2115         // count code points until they exceed and also check that there are
   2116         // no more than maxsupplementary supplementary code points (char pairs)
   2117         while (true) {
   2118             if (length == 0) {
   2119                 return false;
   2120             }
   2121             if (number == 0) {
   2122                 return true;
   2123             }
   2124             if (isLeadSurrogate(source[start++]) && start != limit
   2125                     && isTrailSurrogate(source[start])) {
   2126                 start++;
   2127                 if (--maxsupplementary <= 0) {
   2128                     // too many pairs - too few code points
   2129                     return false;
   2130                 }
   2131             }
   2132             --number;
   2133         }
   2134     }
   2135 
   2136     /**
   2137      * Check if the string buffer contains more Unicode code points than a certain number. This is
   2138      * more efficient than counting all code points in the entire string buffer and comparing that
   2139      * number with a threshold. This function may not need to scan the string buffer at all if the
   2140      * length is within a certain range, and never needs to count more than 'number + 1' code
   2141      * points. Logically equivalent to (countCodePoint(s) &gt; number). A Unicode code point may
   2142      * occupy either one or two code units.
   2143      *
   2144      * @param source The input string buffer.
   2145      * @param number The number of code points in the string buffer is compared against the 'number'
   2146      *            parameter.
   2147      * @return boolean value for whether the string buffer contains more Unicode code points than
   2148      *         'number'.
   2149      */
   2150     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
   2151         if (number < 0) {
   2152             return true;
   2153         }
   2154         if (source == null) {
   2155             return false;
   2156         }
   2157         int length = source.length();
   2158 
   2159         // length >= 0 known
   2160         // source contains at least (length + 1) / 2 code points: <= 2
   2161         // chars per cp
   2162         if (((length + 1) >> 1) > number) {
   2163             return true;
   2164         }
   2165 
   2166         // check if source does not even contain enough chars
   2167         int maxsupplementary = length - number;
   2168         if (maxsupplementary <= 0) {
   2169             return false;
   2170         }
   2171 
   2172         // there are maxsupplementary = length - number more chars than
   2173         // asked-for code points
   2174 
   2175         // count code points until they exceed and also check that there are
   2176         // no more than maxsupplementary supplementary code points (char pairs)
   2177         int start = 0;
   2178         while (true) {
   2179             if (length == 0) {
   2180                 return false;
   2181             }
   2182             if (number == 0) {
   2183                 return true;
   2184             }
   2185             if (isLeadSurrogate(source.charAt(start++)) && start != length
   2186                     && isTrailSurrogate(source.charAt(start))) {
   2187                 start++;
   2188                 if (--maxsupplementary <= 0) {
   2189                     // too many pairs - too few code points
   2190                     return false;
   2191                 }
   2192             }
   2193             --number;
   2194         }
   2195     }
   2196 
   2197     /**
   2198      * Cover JDK 1.5 API. Create a String from an array of codePoints.
   2199      *
   2200      * @param codePoints The code array
   2201      * @param offset The start of the text in the code point array
   2202      * @param count The number of code points
   2203      * @return a String representing the code points between offset and count
   2204      * @throws IllegalArgumentException If an invalid code point is encountered
   2205      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
   2206      */
   2207     public static String newString(int[] codePoints, int offset, int count) {
   2208         if (count < 0) {
   2209             throw new IllegalArgumentException();
   2210         }
   2211         char[] chars = new char[count];
   2212         int w = 0;
   2213         for (int r = offset, e = offset + count; r < e; ++r) {
   2214             int cp = codePoints[r];
   2215             if (cp < 0 || cp > 0x10ffff) {
   2216                 throw new IllegalArgumentException();
   2217             }
   2218             while (true) {
   2219                 try {
   2220                     if (cp < 0x010000) {
   2221                         chars[w] = (char) cp;
   2222                         w++;
   2223                     } else {
   2224                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
   2225                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
   2226                         w += 2;
   2227                     }
   2228                     break;
   2229                 } catch (IndexOutOfBoundsException ex) {
   2230                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
   2231                             / (r - offset + 1)));
   2232                     char[] temp = new char[newlen];
   2233                     System.arraycopy(chars, 0, temp, 0, w);
   2234                     chars = temp;
   2235                 }
   2236             }
   2237         }
   2238         return new String(chars, 0, w);
   2239     }
   2240 
   2241     /**
   2242      * <p>
   2243      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
   2244      * modes
   2245      * </p>
   2246      * <ul>
   2247      * <li> Code point comparison or code unit comparison
   2248      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
   2249      * with special handling for character 'i'.
   2250      * </ul>
   2251      * <p>
   2252      * The code unit or code point comparison differ only when comparing supplementary code points
   2253      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
   2254      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
   2255      * supplementary code points because they are stored as pairs of surrogates which are at
   2256      * &#92;ud800..&#92;udfff.
   2257      * </p>
   2258      *
   2259      * @see #FOLD_CASE_DEFAULT
   2260      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
   2261      */
   2262     public static final class StringComparator implements java.util.Comparator<String> {
   2263         // public constructor ------------------------------------------------
   2264 
   2265         /**
   2266          * Default constructor that does code unit comparison and case sensitive comparison.
   2267          */
   2268         public StringComparator() {
   2269             this(false, false, FOLD_CASE_DEFAULT);
   2270         }
   2271 
   2272         /**
   2273          * Constructor that does comparison based on the argument options.
   2274          *
   2275          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
   2276          *            comparison.
   2277          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
   2278          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
   2279          *            when ignorecase is set to true. If ignorecase is false, this option is
   2280          *            ignored.
   2281          * @see #FOLD_CASE_DEFAULT
   2282          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
   2283          * @throws IllegalArgumentException If foldcaseoption is out of range
   2284          */
   2285         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
   2286             setCodePointCompare(codepointcompare);
   2287             m_ignoreCase_ = ignorecase;
   2288             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
   2289                 throw new IllegalArgumentException("Invalid fold case option");
   2290             }
   2291             m_foldCase_ = foldcaseoption;
   2292         }
   2293 
   2294         // public data member ------------------------------------------------
   2295 
   2296         /**
   2297          * Option value for case folding comparison:
   2298          *
   2299          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
   2300          * Unicode data file CaseFolding.txt, before comparison.
   2301          */
   2302         public static final int FOLD_CASE_DEFAULT = 0;
   2303 
   2304         /**
   2305          * Option value for case folding:
   2306          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
   2307          * and dotless i appropriately for Turkic languages (tr, az).
   2308          *
   2309          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
   2310          * Unicode data file CaseFolding.txt, before comparison.
   2311          *
   2312          * @see android.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
   2313          */
   2314         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
   2315 
   2316         // public methods ----------------------------------------------------
   2317 
   2318         // public setters ----------------------------------------------------
   2319 
   2320         /**
   2321          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
   2322          * is set to code unit compare
   2323          *
   2324          * @param flag True for code point compare, false for code unit compare
   2325          */
   2326         public void setCodePointCompare(boolean flag) {
   2327             if (flag) {
   2328                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
   2329             } else {
   2330                 m_codePointCompare_ = 0;
   2331             }
   2332         }
   2333 
   2334         /**
   2335          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
   2336          * case sensitive comparison mode if set to false.
   2337          *
   2338          * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
   2339          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
   2340          *            when ignorecase is set to true. If ignorecase is false, this option is
   2341          *            ignored.
   2342          * @see #FOLD_CASE_DEFAULT
   2343          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
   2344          */
   2345         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
   2346             m_ignoreCase_ = ignorecase;
   2347             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
   2348                 throw new IllegalArgumentException("Invalid fold case option");
   2349             }
   2350             m_foldCase_ = foldcaseoption;
   2351         }
   2352 
   2353         // public getters ----------------------------------------------------
   2354 
   2355         /**
   2356          * Checks if the comparison mode is code point compare.
   2357          *
   2358          * @return true for code point compare, false for code unit compare
   2359          */
   2360         public boolean getCodePointCompare() {
   2361             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
   2362         }
   2363 
   2364         /**
   2365          * Checks if Comparator is in the case insensitive mode.
   2366          *
   2367          * @return true if Comparator performs case insensitive comparison, false otherwise
   2368          */
   2369         public boolean getIgnoreCase() {
   2370             return m_ignoreCase_;
   2371         }
   2372 
   2373         /**
   2374          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
   2375          *
   2376          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
   2377          * @see #FOLD_CASE_DEFAULT
   2378          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
   2379          */
   2380         public int getIgnoreCaseOption() {
   2381             return m_foldCase_;
   2382         }
   2383 
   2384         // public other methods ----------------------------------------------
   2385 
   2386         /**
   2387          * Compare two strings depending on the options selected during construction.
   2388          *
   2389          * @param a first source string.
   2390          * @param b second source string.
   2391          * @return 0 returned if a == b. If a &lt; b, a negative value is returned. Otherwise if a &gt; b,
   2392          *         a positive value is returned.
   2393          * @exception ClassCastException thrown when either a or b is not a String object
   2394          */
   2395         @Override
   2396         public int compare(String a, String b) {
   2397             if (Utility.sameObjects(a, b)) {
   2398                 return 0;
   2399             }
   2400             if (a == null) {
   2401                 return -1;
   2402             }
   2403             if (b == null) {
   2404                 return 1;
   2405             }
   2406 
   2407             if (m_ignoreCase_) {
   2408                 return compareCaseInsensitive(a, b);
   2409             }
   2410             return compareCaseSensitive(a, b);
   2411         }
   2412 
   2413         // private data member ----------------------------------------------
   2414 
   2415         /**
   2416          * Code unit comparison flag. True if code unit comparison is required. False if code point
   2417          * comparison is required.
   2418          */
   2419         private int m_codePointCompare_;
   2420 
   2421         /**
   2422          * Fold case comparison option.
   2423          */
   2424         private int m_foldCase_;
   2425 
   2426         /**
   2427          * Flag indicator if ignore case is to be used during comparison
   2428          */
   2429         private boolean m_ignoreCase_;
   2430 
   2431         /**
   2432          * Code point order offset for surrogate characters
   2433          */
   2434         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
   2435 
   2436         // private method ---------------------------------------------------
   2437 
   2438         /**
   2439          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
   2440          * easier.
   2441          *
   2442          * @param s1
   2443          *            first string to compare
   2444          * @param s2
   2445          *            second string to compare
   2446          * @return -1 is s1 &lt; s2, 0 if equals,
   2447          */
   2448         private int compareCaseInsensitive(String s1, String s2) {
   2449             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
   2450                     | Normalizer.COMPARE_IGNORE_CASE);
   2451         }
   2452 
   2453         /**
   2454          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
   2455          * easier.
   2456          *
   2457          * @param s1
   2458          *            first string to compare
   2459          * @param s2
   2460          *            second string to compare
   2461          * @return -1 is s1 &lt; s2, 0 if equals,
   2462          */
   2463         private int compareCaseSensitive(String s1, String s2) {
   2464             // compare identical prefixes - they do not need to be fixed up
   2465             // limit1 = start1 + min(lenght1, length2)
   2466             int length1 = s1.length();
   2467             int length2 = s2.length();
   2468             int minlength = length1;
   2469             int result = 0;
   2470             if (length1 < length2) {
   2471                 result = -1;
   2472             } else if (length1 > length2) {
   2473                 result = 1;
   2474                 minlength = length2;
   2475             }
   2476 
   2477             char c1 = 0;
   2478             char c2 = 0;
   2479             int index = 0;
   2480             for (; index < minlength; index++) {
   2481                 c1 = s1.charAt(index);
   2482                 c2 = s2.charAt(index);
   2483                 // check pseudo-limit
   2484                 if (c1 != c2) {
   2485                     break;
   2486                 }
   2487             }
   2488 
   2489             if (index == minlength) {
   2490                 return result;
   2491             }
   2492 
   2493             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
   2494             // if both values are in or above the surrogate range, fix them up
   2495             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
   2496                     && codepointcompare) {
   2497                 // subtract 0x2800 from BMP code points to make them smaller
   2498                 // than supplementary ones
   2499                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
   2500                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
   2501                     // part of a surrogate pair, leave >=d800
   2502                 } else {
   2503                     // BMP code point - may be surrogate code point - make
   2504                     // < d800
   2505                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
   2506                 }
   2507 
   2508                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
   2509                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
   2510                     // part of a surrogate pair, leave >=d800
   2511                 } else {
   2512                     // BMP code point - may be surrogate code point - make <d800
   2513                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
   2514                 }
   2515             }
   2516 
   2517             // now c1 and c2 are in UTF-32-compatible order
   2518             return c1 - c2;
   2519         }
   2520     }
   2521 
   2522     /**
   2523      * Utility for getting a code point from a CharSequence that contains exactly one code point.
   2524      * @return the code point IF the string is non-null and consists of a single code point.
   2525      * otherwise returns -1.
   2526      * @param s to test
   2527      */
   2528     public static int getSingleCodePoint(CharSequence s) {
   2529         if (s == null || s.length() == 0) {
   2530             return -1;
   2531         } else if (s.length() == 1) {
   2532             return s.charAt(0);
   2533         } else if (s.length() > 2) {
   2534             return -1;
   2535         }
   2536 
   2537         // at this point, len = 2
   2538         int cp = Character.codePointAt(s, 0);
   2539         if (cp > 0xFFFF) { // is surrogate pair
   2540             return cp;
   2541         }
   2542         return -1;
   2543     }
   2544 
   2545     /**
   2546      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
   2547      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
   2548      * <pre>
   2549      * sc = new StringComparator(true,false,0);
   2550      * fast = UTF16.compareCodePoint(codePoint, charSequence)
   2551      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
   2552      * </pre>
   2553      * then
   2554      * <pre>
   2555      * Integer.signum(fast) == Integer.signum(slower)
   2556      * </pre>
   2557      * @param codePoint to test
   2558      * @param s to test
   2559      * @return equivalent of code point comparator comparing two strings.
   2560      */
   2561     public static int compareCodePoint(int codePoint, CharSequence s) {
   2562         if (s == null) {
   2563             return 1;
   2564         }
   2565         final int strLen = s.length();
   2566         if (strLen == 0) {
   2567             return 1;
   2568         }
   2569         int second = Character.codePointAt(s, 0);
   2570         int diff = codePoint - second;
   2571         if (diff != 0) {
   2572             return diff;
   2573         }
   2574         return strLen == Character.charCount(codePoint) ? 0 : -1;
   2575     }
   2576 
   2577     // private data members -------------------------------------------------
   2578 
    /**
     * Shift value for lead surrogate to form a supplementary character: the number of bits a
     * supplementary code point is shifted right before LEAD_SURROGATE_OFFSET_ is added to
     * obtain its lead surrogate.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate (the low ten bits).
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value that all lead surrogates start with: LEAD_SURROGATE_MIN_VALUE reduced by
     * SUPPLEMENTARY_MIN_VALUE shifted right by LEAD_SURROGATE_SHIFT_. Adding a supplementary
     * code point shifted right by LEAD_SURROGATE_SHIFT_ to this value gives its lead surrogate.
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
   2594 
   2595     // private methods ------------------------------------------------------
   2596 
   2597     /**
   2598      * <p>
   2599      * Converts argument code point and returns a String object representing the code point's value
   2600      * in UTF16 format.
   2601      * </p>
   2602      * <p>
   2603      * This method does not check for the validity of the codepoint, the results are not guaranteed
   2604      * if a invalid codepoint is passed as argument.
   2605      * </p>
   2606      * <p>
   2607      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
   2608      * </p>
   2609      *
   2610      * @param ch
   2611      *            code point
   2612      * @return string representation of the code point
   2613      */
   2614     private static String toString(int ch) {
   2615         if (ch < SUPPLEMENTARY_MIN_VALUE) {
   2616             return String.valueOf((char) ch);
   2617         }
   2618 
   2619         StringBuilder result = new StringBuilder();
   2620         result.append(getLeadSurrogate(ch));
   2621         result.append(getTrailSurrogate(ch));
   2622         return result.toString();
   2623     }
   2624 }
   2625 // eof
   2626