Home | History | Annotate | Download | only in test
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2002-2004, International Business Machines Corporation and    *
      6 * others. All Rights Reserved.                                                *
      7 *******************************************************************************
      8 */
      9 package com.ibm.icu.dev.test;
     10 
     11 /**
     12  * Utility class for supplementary code point
     13  * support. This one is written purely for updating
     14  * Normalization sample from the unicode.org site.
     15  * If you want the real thing, use UTF16 class
     16  * from ICU4J
     17  * @author Vladimir Weinstein, Markus Scherer
     18  */
     19 public class UTF16Util {
     20     static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000;
     21 
     22     /**
     23      * Method nextCodePoint. Returns the next code point
     24      * in a string.
     25      * @param s String in question
     26      * @param i index from which we want a code point
     27      * @return int codepoint at index i
     28      */
     29     public static final int nextCodePoint(String s, int i) {
     30         int ch = s.charAt(i);
     31         if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) {
     32             int ch2 = s.charAt(i);
     33             if (0xdc00 <= ch2 && ch2 <= 0xdfff) {
     34                 ch = (ch << 10) + ch2 - suppOffset;
     35             }
     36         }
     37         return ch;
     38     }
     39 
     40     /**
     41      * Method prevCodePoint. Gets the code point preceding
     42      * index i (predecrement).
     43      * @param s String in question
     44      * @param i index in string
     45      * @return int codepoint at index --i
     46      */
     47     public static final int prevCodePoint(String s, int i) {
     48         int ch = s.charAt(--i);
     49         if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) {
     50             int ch2 = s.charAt(i);
     51             if (0xd800 <= ch2 && ch2 <= 0xdbff) {
     52                 ch = (ch2 << 10) + ch - suppOffset;
     53             }
     54         }
     55         return ch;
     56     }
     57 
     58     /**
     59      * Method nextCodePoint. Returns the next code point
     60      * in a string.
     61      * @param s StringBuffer in question
     62      * @param i index from which we want a code point
     63      * @return int codepoint at index i
     64      */
     65     public static final int nextCodePoint(StringBuffer s, int i) {
     66         int ch = s.charAt(i);
     67         if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) {
     68             int ch2 = s.charAt(i);
     69             if (0xdc00 <= ch2 && ch2 <= 0xdfff) {
     70                 ch = (ch << 10) + ch2 - suppOffset;
     71             }
     72         }
     73         return ch;
     74     }
     75 
     76     /**
     77      * Method prevCodePoint. Gets the code point preceding
     78      * index i (predecrement).
     79      * @param s StringBuffer in question
     80      * @param i index in string
     81      * @return int codepoint at index --i
     82      */
     83     public static final int prevCodePoint(StringBuffer s, int i) {
     84         int ch = s.charAt(--i);
     85         if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) {
     86             int ch2 = s.charAt(i);
     87             if (0xd800 <= ch2 && ch2 <= 0xdbff) {
     88                 ch = (ch2 << 10) + ch - suppOffset;
     89             }
     90         }
     91         return ch;
     92     }
     93 
     94     /**
     95      * Method codePointLength. Returns the length
     96      * in UTF-16 code units of a given code point
     97      * @param c code point in question
     98      * @return int length in UTF-16 code units. Can be 1 or 2
     99      */
    100     public static final int codePointLength(int c) {
    101         return c <= 0xffff ? 1 : 2;
    102     }
    103 
    104     /**
    105      * Method appendCodePoint. Appends a code point
    106      * to a StringBuffer
    107      * @param buffer StringBuffer in question
    108      * @param ch code point to append
    109      */
    110     public static final void appendCodePoint(StringBuffer buffer, int ch) {
    111         if (ch <= 0xffff) {
    112             buffer.append((char)ch);
    113         } else {
    114             buffer.append((char)(0xd7c0 + (ch >> 10)));
    115             buffer.append((char)(0xdc00 + (ch & 0x3ff)));
    116         }
    117     }
    118 
    119     /**
    120      * Method insertCodePoint. Inserts a code point in
    121      * a StringBuffer
    122      * @param buffer StringBuffer in question
    123      * @param i index at which we want code point to be inserted
    124      * @param ch code point to be inserted
    125      */
    126     public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
    127         if (ch <= 0xffff) {
    128             buffer.insert(i, (char)ch);
    129         } else {
    130             buffer.insert(i, (char)(0xd7c0 + (ch >> 10))).insert(i + 1, (char)(0xdc00 + (ch & 0x3ff)));
    131         }
    132     }
    133 
    134     /**
    135      * Method setCodePointAt. Changes a code point at a
    136      * given index. Can change the length of the string.
    137      * @param buffer StringBuffer in question
    138      * @param i index at which we want to change the contents
    139      * @param ch replacement code point
    140      * @return int difference in resulting StringBuffer length
    141      */
    142     public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
    143         int cp = nextCodePoint(buffer, i);
    144 
    145         if (ch <= 0xffff && cp <= 0xffff) { // Both BMP
    146             buffer.setCharAt(i, (char)ch);
    147             return 0;
    148         } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary
    149             buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10)));
    150             buffer.setCharAt(i+1, (char)(0xdc00 + (ch & 0x3ff)));
    151             return 0;
    152         } else if (ch <= 0xffff && cp > 0xffff) { // putting BMP instead of supplementary, buffer shrinks
    153             buffer.setCharAt(i, (char)ch);
    154             buffer.deleteCharAt(i+1);
    155             return -1;
    156         } else { //if (ch > 0xffff && cp <= 0xffff) { // putting supplementary instead of BMP, buffer grows
    157             buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10)));
    158             buffer.insert(i+1, (char)(0xdc00 + (ch & 0x3ff)));
    159             return 1;
    160         }
    161     }
    162 
    163     /**
    164      * Method countCodePoint. Counts the UTF-32 code points
    165      * in a UTF-16 encoded string.
    166      * @param source String in question.
    167      * @return int number of code points in this string
    168      */
    169     public static final int countCodePoint(String source)
    170     {
    171         int result = 0;
    172         char ch;
    173         boolean hadLeadSurrogate = false;
    174 
    175         for (int i = 0; i < source.length(); ++ i)
    176         {
    177             ch = source.charAt(i);
    178             if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) {
    179                 hadLeadSurrogate = false;           // count valid trail as zero
    180             }
    181             else
    182             {
    183                 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff);
    184                 ++ result;                          // count others as 1
    185             }
    186         }
    187 
    188         return result;
    189     }
    190 
    191     /**
    192      * Method countCodePoint. Counts the UTF-32 code points
    193      * in a UTF-16 encoded string.
    194      * @param source StringBuffer in question.
    195      * @return int number of code points in this string
    196      */
    197     public static final int countCodePoint(StringBuffer source)
    198     {
    199         int result = 0;
    200         char ch;
    201         boolean hadLeadSurrogate = false;
    202 
    203         for (int i = 0; i < source.length(); ++ i)
    204         {
    205             ch = source.charAt(i);
    206             if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) {
    207                 hadLeadSurrogate = false;           // count valid trail as zero
    208             }
    209             else
    210             {
    211                 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff);
    212                 ++ result;                          // count others as 1
    213             }
    214         }
    215 
    216         return result;
    217     }
    218     /**
    219      * The minimum value for Supplementary code points
    220      */
    221     public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;
    222     /**
    223      * Determines how many chars this char32 requires.
    224      * If a validity check is required, use <code>
    225      * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on
    226      * char32 before calling.
    227      * @param char32 the input codepoint.
    228      * @return 2 if is in supplementary space, otherwise 1.
    229      */
    230     public static int getCharCount(int char32)
    231     {
    232         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
    233             return 1;
    234         }
    235         return 2;
    236     }
    237     /**
    238      * Lead surrogate maximum value
    239      * @stable ICU 2.1
    240      */
    241     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    242     /**
    243      * Lead surrogate minimum value
    244      * @stable ICU 2.1
    245      */
    246     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    247 
    248     /**
    249      * Trail surrogate minimum value
    250      * @stable ICU 2.1
    251      */
    252     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    253     /**
    254      * Trail surrogate maximum value
    255      * @stable ICU 2.1
    256      */
    257     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    258     /**
    259      * Determines whether the code value is a surrogate.
    260      * @param char16 the input character.
    261      * @return true iff the input character is a surrogate.
    262      * @stable ICU 2.1
    263      */
    264     public static boolean isSurrogate(char char16)
    265     {
    266         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
    267             char16 <= TRAIL_SURROGATE_MAX_VALUE;
    268     }
    269 
    270     /**
    271      * Determines whether the character is a trail surrogate.
    272      * @param char16 the input character.
    273      * @return true iff the input character is a trail surrogate.
    274      * @stable ICU 2.1
    275      */
    276     public static boolean isTrailSurrogate(char char16)
    277     {
    278         return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
    279                 char16 <= TRAIL_SURROGATE_MAX_VALUE);
    280     }
    281 
    282     /**
    283      * Determines whether the character is a lead surrogate.
    284      * @param char16 the input character.
    285      * @return true iff the input character is a lead surrogate
    286      * @stable ICU 2.1
    287      */
    288     public static boolean isLeadSurrogate(char char16)
    289     {
    290         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
    291             char16 <= LEAD_SURROGATE_MAX_VALUE;
    292     }
    293     /**
    294      * Extract a single UTF-32 value from a substring.
    295      * Used when iterating forwards or backwards (with
    296      * <code>UTF16.getCharCount()</code>, as well as random access. If a
    297      * validity check is required, use
    298      * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal()
    299      * </a></code> on the return value.
    300      * If the char retrieved is part of a surrogate pair, its supplementary
    301      * character will be returned. If a complete supplementary character is
    302      * not found the incomplete character will be returned
    303      * @param source array of UTF-16 chars
    304      * @param start offset to substring in the source array for analyzing
    305      * @param limit offset to substring in the source array for analyzing
    306      * @param offset16 UTF-16 offset relative to start
    307      * @return UTF-32 value for the UTF-32 value that contains the char at
    308      *         offset16. The boundaries of that codepoint are the same as in
    309      *         <code>bounds32()</code>.
    310      * @exception IndexOutOfBoundsException thrown if offset16 is not within
    311      *            the range of start and limit.
    312      * @stable ICU 2.1
    313      */
    314     public static int charAt(char source[], int start, int limit,
    315                              int offset16)
    316     {
    317         offset16 += start;
    318         if (offset16 < start || offset16 >= limit) {
    319             throw new ArrayIndexOutOfBoundsException(offset16);
    320         }
    321 
    322         char single = source[offset16];
    323         if (!isSurrogate(single)) {
    324             return single;
    325         }
    326 
    327         // Convert the UTF-16 surrogate pair if necessary.
    328         // For simplicity in usage, and because the frequency of pairs is
    329         // low, look both directions.
    330         if (single <= LEAD_SURROGATE_MAX_VALUE) {
    331             offset16 ++;
    332             if (offset16 >= limit) {
    333                 return single;
    334             }
    335             char trail = source[offset16];
    336             if (isTrailSurrogate(trail)) {
    337                 return getRawSupplementary(single, trail);
    338             }
    339         }
    340         else { // isTrailSurrogate(single), so
    341             if (offset16 == start) {
    342                 return single;
    343             }
    344             offset16 --;
    345             char lead = source[offset16];
    346             if (isLeadSurrogate(lead))
    347                 return getRawSupplementary(lead, single);
    348         }
    349         return single; // return unmatched surrogate
    350     }
    351     /**
    352      * Shift value for lead surrogate to form a supplementary character.
    353      */
    354     private static final int LEAD_SURROGATE_SHIFT_ = 10;
    355 
    356     /**
    357      * Offset to add to combined surrogate pair to avoid msking.
    358      */
    359     private static final int SURROGATE_OFFSET_ =
    360                            SUPPLEMENTARY_MIN_VALUE -
    361                            (LEAD_SURROGATE_MIN_VALUE <<
    362                            LEAD_SURROGATE_SHIFT_) -
    363                            TRAIL_SURROGATE_MIN_VALUE;
    364 
    365 
    366    /**
    367     * Forms a supplementary code point from the argument character<br>
    368     * Note this is for internal use hence no checks for the validity of the
    369     * surrogate characters are done
    370     * @param lead lead surrogate character
    371     * @param trail trailing surrogate character
    372     * @return code point of the supplementary character
    373     */
    374     public static int getRawSupplementary(char lead, char trail)
    375     {
    376         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    377     }
    378 
    379 }
    380