Home | History | Annotate | Download | only in translit
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /**
      5  *******************************************************************************
      6  * Copyright (C) 2000-2010, International Business Machines Corporation and    *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.dev.test.translit;
     11 
     12 import android.icu.text.UTF16;
     13 import android.icu.text.UnicodeSet;
     14 import android.icu.testsharding.MainTestShard;
     15 
     16 @MainTestShard
     17 public final class TestUtility {
     18 
     19     public static String hex(char ch) {
     20         String foo = Integer.toString(ch,16).toUpperCase();
     21         return "0000".substring(0,4-foo.length()) + foo;
     22     }
     23 
     24     public static String hex(int ch) {
     25         String foo = Integer.toString(ch,16).toUpperCase();
     26         return "00000000".substring(0,4-foo.length()) + foo;
     27     }
     28 
     29     public static String hex(String s) {
     30       return hex(s,",");
     31     }
     32 
     33     public static String hex(String s, String sep) {
     34       if (s.length() == 0) return "";
     35       String result = hex(s.charAt(0));
     36       for (int i = 1; i < s.length(); ++i) {
     37         result += sep;
     38         result += hex(s.charAt(i));
     39       }
     40       return result;
     41     }
     42 
     43     public static String replace(String source, String toBeReplaced, String replacement) {
     44         StringBuffer results = new StringBuffer();
     45         int len = toBeReplaced.length();
     46         for (int i = 0; i < source.length(); ++i) {
     47             if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
     48                 results.append(replacement);
     49                 i += len - 1; // minus one, since we will increment
     50             } else {
     51                 results.append(source.charAt(i));
     52             }
     53         }
     54         return results.toString();
     55     }
     56 
     57     public static String replaceAll(String source, UnicodeSet set, String replacement) {
     58         StringBuffer results = new StringBuffer();
     59         int cp;
     60         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
     61             cp = UTF16.charAt(source,i);
     62             if (set.contains(cp)) {
     63                 results.append(replacement);
     64             } else {
     65                 UTF16.append(results, cp);
     66             }
     67         }
     68         return results.toString();
     69     }
     70 
     71     // COMMENTED OUT ALL THE OLD SCRIPT STUFF
     72     /*
     73     public static byte getScript(char c) {
     74       return getScript(getBlock(c));
     75     }
     76 
     77     public static byte getScript(byte block) {
     78       return blockToScript[block];
     79     }
     80 
     81     public static byte getBlock(char c) {
     82       int index = c >> 7;
     83       byte block = charToBlock[index];
     84       while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
     85           int[] tuple = split[-block-1];
     86           if (c < tuple[0]) block = (byte)tuple[1];
     87           else block = (byte)tuple[2];
     88       }
     89       return block;
     90     }
     91 
     92     // returns next letter of script, or 0xFFFF if done
     93 
     94     public static char getNextLetter(char c, byte script) {
     95         while (c < 0xFFFF) {
     96             ++c;
     97             if (getScript(c) == script && Character.isLetter(c)) {
     98                 return c;
     99             }
    100         }
    101         return c;
    102     }
    103 
    104     // Supplements to Character methods; these methods go through
    105     // UCharacter if possible.  If not, they fall back to Character.
    106 
    107     public static boolean isUnassigned(char c) {
    108         try {
    109             return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
    110         } catch (NullPointerException e) {
    111             System.out.print("");
    112         }
    113         return Character.getType(c) == Character.UNASSIGNED;
    114     }
    115 
    116     public static boolean isLetter(char c) {
    117         try {
    118             return UCharacter.isLetter(c);
    119         } catch (NullPointerException e) {
    120             System.out.print("");
    121         }
    122         return Character.isLetter(c);
    123     }
    124 
    125   public static void main(String[] args) {
    126     System.out.println("Blocks: ");
    127     byte lastblock = -128;
    128     for (char cc = 0; cc < 0xFFFF; ++cc) {
    129       byte block = TestUtility.getBlock(cc);
    130       if (block != lastblock) {
    131         System.out.println(TestUtility.hex(cc) + "\t" + block);
    132         lastblock = block;
    133       }
    134     }
    135     System.out.println();
    136     System.out.println("Scripts: ");
    137     byte lastScript = -128;
    138     for (char cc = 0; cc < 0xFFFF; ++cc) {
    139       byte script = TestUtility.getScript(cc);
    140       if (script != lastScript) {
    141         System.out.println(TestUtility.hex(cc) + "\t" + script);
    142         lastScript = script;
    143       }
    144     }
    145   }
    146 
    147 
    148 
    149     public static final byte // SCRIPT CODE
    150         COMMON_SCRIPT = 0,
    151         LATIN_SCRIPT = 1,
    152         GREEK_SCRIPT = 2,
    153         CYRILLIC_SCRIPT = 3,
    154         ARMENIAN_SCRIPT = 4,
    155         HEBREW_SCRIPT = 5,
    156         ARABIC_SCRIPT = 6,
    157         SYRIAC_SCRIPT = 7,
    158         THAANA_SCRIPT = 8,
    159         DEVANAGARI_SCRIPT = 9,
    160         BENGALI_SCRIPT = 10,
    161         GURMUKHI_SCRIPT = 11,
    162         GUJARATI_SCRIPT = 12,
    163         ORIYA_SCRIPT = 13,
    164         TAMIL_SCRIPT = 14,
    165         TELUGU_SCRIPT = 15,
    166         KANNADA_SCRIPT = 16,
    167         MALAYALAM_SCRIPT = 17,
    168         SINHALA_SCRIPT = 18,
    169         THAI_SCRIPT = 19,
    170         LAO_SCRIPT = 20,
    171         TIBETAN_SCRIPT = 21,
    172         MYANMAR_SCRIPT = 22,
    173         GEORGIAN_SCRIPT = 23,
    174         JAMO_SCRIPT = 24,
    175         HANGUL_SCRIPT = 25,
    176         ETHIOPIC_SCRIPT = 26,
    177         CHEROKEE_SCRIPT = 27,
    178         ABORIGINAL_SCRIPT = 28,
    179         OGHAM_SCRIPT = 29,
    180         RUNIC_SCRIPT = 30,
    181         KHMER_SCRIPT = 31,
    182         MONGOLIAN_SCRIPT = 32,
    183         HIRAGANA_SCRIPT = 33,
    184         KATAKANA_SCRIPT = 34,
    185         BOPOMOFO_SCRIPT = 35,
    186         HAN_SCRIPT = 36,
    187         YI_SCRIPT = 37;
    188 
    189     public static final byte // block code
    190         RESERVED_BLOCK = 0,
    191         BASIC_LATIN = 1,
    192         LATIN_1_SUPPLEMENT = 2,
    193         LATIN_EXTENDED_A = 3,
    194         LATIN_EXTENDED_B = 4,
    195         IPA_EXTENSIONS = 5,
    196         SPACING_MODIFIER_LETTERS = 6,
    197         COMBINING_DIACRITICAL_MARKS = 7,
    198         GREEK = 8,
    199         CYRILLIC = 9,
    200         ARMENIAN = 10,
    201         HEBREW = 11,
    202         ARABIC = 12,
    203         SYRIAC = 13,
    204         THAANA = 14,
    205         DEVANAGARI = 15,
    206         BENGALI = 16,
    207         GURMUKHI = 17,
    208         GUJARATI = 18,
    209         ORIYA = 19,
    210         TAMIL = 20,
    211         TELUGU = 21,
    212         KANNADA = 22,
    213         MALAYALAM = 23,
    214         SINHALA = 24,
    215         THAI = 25,
    216         LAO = 26,
    217         TIBETAN = 27,
    218         MYANMAR = 28,
    219         GEORGIAN = 29,
    220         HANGUL_JAMO = 30,
    221         ETHIOPIC = 31,
    222         CHEROKEE = 32,
    223         UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
    224         OGHAM = 34,
    225         RUNIC = 35,
    226         KHMER = 36,
    227         MONGOLIAN = 37,
    228         LATIN_EXTENDED_ADDITIONAL = 38,
    229         GREEK_EXTENDED = 39,
    230         GENERAL_PUNCTUATION = 40,
    231         SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
    232         CURRENCY_SYMBOLS = 42,
    233         COMBINING_MARKS_FOR_SYMBOLS = 43,
    234         LETTERLIKE_SYMBOLS = 44,
    235         NUMBER_FORMS = 45,
    236         ARROWS = 46,
    237         MATHEMATICAL_OPERATORS = 47,
    238         MISCELLANEOUS_TECHNICAL = 48,
    239         CONTROL_PICTURES = 49,
    240         OPTICAL_CHARACTER_RECOGNITION = 50,
    241         ENCLOSED_ALPHANUMERICS = 51,
    242         BOX_DRAWING = 52,
    243         BLOCK_ELEMENTS = 53,
    244         GEOMETRIC_SHAPES = 54,
    245         MISCELLANEOUS_SYMBOLS = 55,
    246         DINGBATS = 56,
    247         BRAILLE_PATTERNS = 57,
    248         CJK_RADICALS_SUPPLEMENT = 58,
    249         KANGXI_RADICALS = 59,
    250         IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
    251         CJK_SYMBOLS_AND_PUNCTUATION = 61,
    252         HIRAGANA = 62,
    253         KATAKANA = 63,
    254         BOPOMOFO = 64,
    255         HANGUL_COMPATIBILITY_JAMO = 65,
    256         KANBUN = 66,
    257         BOPOMOFO_EXTENDED = 67,
    258         ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
    259         CJK_COMPATIBILITY = 69,
    260         CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
    261         CJK_UNIFIED_IDEOGRAPHS = 71,
    262         YI_SYLLABLES = 72,
    263         YI_RADICALS = 73,
    264         HANGUL_SYLLABLES = 74,
    265         HIGH_SURROGATES = 75,
    266         HIGH_PRIVATE_USE_SURROGATES = 76,
    267         LOW_SURROGATES = 77,
    268         PRIVATE_USE = 78,
    269         CJK_COMPATIBILITY_IDEOGRAPHS = 79,
    270         ALPHABETIC_PRESENTATION_FORMS = 80,
    271         ARABIC_PRESENTATION_FORMS_A = 81,
    272         COMBINING_HALF_MARKS = 82,
    273         CJK_COMPATIBILITY_FORMS = 83,
    274         SMALL_FORM_VARIANTS = 84,
    275         ARABIC_PRESENTATION_FORMS_B = 85,
    276         SPECIALS = 86,
    277         HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
    278 
    279     static final byte[] blockToScript = {
    280         COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
    281         LATIN_SCRIPT, // 1, BASIC_LATIN
    282         LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
    283         LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
    284         LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
    285         LATIN_SCRIPT, // 5, IPA_EXTENSIONS
    286         COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
    287         COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
    288         GREEK_SCRIPT, // 8, GREEK
    289         CYRILLIC_SCRIPT, // 9, CYRILLIC
    290         ARMENIAN_SCRIPT, // 10, ARMENIAN
    291         HEBREW_SCRIPT, // 11, HEBREW
    292         ARABIC_SCRIPT, // 12, ARABIC
    293         SYRIAC_SCRIPT, // 13, SYRIAC
    294         THAANA_SCRIPT, // 14, THAANA
    295         DEVANAGARI_SCRIPT, // 15, DEVANAGARI
    296         BENGALI_SCRIPT, // 16, BENGALI
    297         GURMUKHI_SCRIPT, // 17, GURMUKHI
    298         GUJARATI_SCRIPT, // 18, GUJARATI
    299         ORIYA_SCRIPT, // 19, ORIYA
    300         TAMIL_SCRIPT, // 20, TAMIL
    301         TELUGU_SCRIPT, // 21, TELUGU
    302         KANNADA_SCRIPT, // 22, KANNADA
    303         MALAYALAM_SCRIPT, // 23, MALAYALAM
    304         SINHALA_SCRIPT, // 24, SINHALA
    305         THAI_SCRIPT, // 25, THAI
    306         LAO_SCRIPT, // 26, LAO
    307         TIBETAN_SCRIPT, // 27, TIBETAN
    308         MYANMAR_SCRIPT, // 28, MYANMAR
    309         GEORGIAN_SCRIPT, // 29, GEORGIAN
    310         JAMO_SCRIPT, // 30, HANGUL_JAMO
    311         ETHIOPIC_SCRIPT, // 31, ETHIOPIC
    312         CHEROKEE_SCRIPT, // 32, CHEROKEE
    313         ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
    314         OGHAM_SCRIPT, // 34, OGHAM
    315         RUNIC_SCRIPT, // 35, RUNIC
    316         KHMER_SCRIPT, // 36, KHMER
    317         MONGOLIAN_SCRIPT, // 37, MONGOLIAN
    318         LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
    319         GREEK_SCRIPT, // 39, GREEK_EXTENDED
    320         COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
    321         COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
    322         COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
    323         COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
    324         COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
    325         COMMON_SCRIPT, // 45, NUMBER_FORMS
    326         COMMON_SCRIPT, // 46, ARROWS
    327         COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
    328         COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
    329         COMMON_SCRIPT, // 49, CONTROL_PICTURES
    330         COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
    331         COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
    332         COMMON_SCRIPT, // 52, BOX_DRAWING
    333         COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
    334         COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
    335         COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
    336         COMMON_SCRIPT, // 56, DINGBATS
    337         COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
    338         HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
    339         HAN_SCRIPT, // 59, KANGXI_RADICALS
    340         HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
    341         COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
    342         HIRAGANA_SCRIPT, // 62, HIRAGANA
    343         KATAKANA_SCRIPT, // 63, KATAKANA
    344         BOPOMOFO_SCRIPT, // 64, BOPOMOFO
    345         JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
    346         HAN_SCRIPT, // 66, KANBUN
    347         BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
    348         COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
    349         COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
    350         HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
    351         HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
    352         YI_SCRIPT, // 72, YI_SYLLABLES
    353         YI_SCRIPT, // 73, YI_RADICALS
    354         HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
    355         COMMON_SCRIPT, // 75, HIGH_SURROGATES
    356         COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
    357         COMMON_SCRIPT, // 77, LOW_SURROGATES
    358         COMMON_SCRIPT, // 78, PRIVATE_USE
    359         HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
    360         COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
    361         ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
    362         COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
    363         COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
    364         COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
    365         ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
    366         COMMON_SCRIPT, // 86, SPECIALS
    367         COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
    368         COMMON_SCRIPT, // 88, SPECIALS
    369     };
    370 
    371     // could be further reduced to a byte array, but I didn't bother.
    372     static final int[][] split = {
    373         {0x0250, 4, 5}, // -1
    374         {0x02B0, 5, 6}, // -2
    375         {0x0370, 7, 8}, // -3
    376         {0x0530, 0, 10}, // -4
    377         {0x0590, 10, 11}, // -5
    378         {0x0750, 13, 0}, // -6
    379         {0x07C0, 14, 0}, // -7
    380         {0x10A0, 28, 29}, // -8
    381         {0x13A0, 0, 32}, // -9
    382         {0x16A0, 34, 35}, // -10
    383         {0x18B0, 37, 0}, // -11
    384         {0x2070, 40, 41}, // -12
    385         {0x20A0, 41, -31}, // -13
    386         {0x2150, 44, 45}, // -14
    387         {0x2190, 45, 46}, // -15
    388         {0x2440, 49, -32}, // -16
    389         {0x25A0, 53, 54}, // -17
    390         {0x27C0, 56, 0}, // -18
    391         {0x2FE0, 59, -33}, // -19
    392         {0x3040, 61, 62}, // -20
    393         {0x30A0, 62, 63}, // -21
    394         {0x3130, 64, 65}, // -22
    395         {0x3190, 65, -34}, // -23
    396         {0x4DB6, 70, 0}, // -24
    397         {0xA490, 72, -35}, // -25
    398         {0xD7A4, 74, 0}, // -26
    399         {0xFB50, 80, 81}, // -27
    400         {0xFE20, 0, -36}, // -28
    401         {0xFEFF, 85, 86}, // -29
    402         {0xFFF0, 87, -37}, // -30
    403         {0x20D0, 42, 43}, // -31
    404         {0x2460, 50, 51}, // -32
    405         {0x2FF0, 0, 60}, // -33
    406         {0x31A0, 66, -38}, // -34
    407         {0xA4D0, 73, 0}, //-35
    408         {0xFE30, 82, -39}, //-36
    409         {0xFFFE, 88, 0}, //-37
    410         {0x31C0, 67, 0}, // -38
    411         {0xFE50, 83, -40}, //-39
    412         {0xFE70, 84, 85} // -40
    413     };
    414 
    415     static final byte[] charToBlock = {
    416       1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
    417       0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
    418       28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
    419       37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
    420       -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
    421       57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
    422       -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
    423       70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
    424       70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
    425       70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
    426       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    427       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    428       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    429       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    430       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    431       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    432       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    433       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    434       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    435       71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
    436       72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
    437       0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
    438       74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
    439       74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
    440       74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
    441       74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
    442       74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
    443       75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
    444       78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
    445       78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
    446       78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
    447       78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
    448     };
    449     */
    450 }
    451