1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /** 5 ******************************************************************************* 6 * Copyright (C) 2000-2010, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.dev.test.translit; 11 12 import android.icu.text.UTF16; 13 import android.icu.text.UnicodeSet; 14 import android.icu.testsharding.MainTestShard; 15 16 @MainTestShard 17 public final class TestUtility { 18 19 public static String hex(char ch) { 20 String foo = Integer.toString(ch,16).toUpperCase(); 21 return "0000".substring(0,4-foo.length()) + foo; 22 } 23 24 public static String hex(int ch) { 25 String foo = Integer.toString(ch,16).toUpperCase(); 26 return "00000000".substring(0,4-foo.length()) + foo; 27 } 28 29 public static String hex(String s) { 30 return hex(s,","); 31 } 32 33 public static String hex(String s, String sep) { 34 if (s.length() == 0) return ""; 35 String result = hex(s.charAt(0)); 36 for (int i = 1; i < s.length(); ++i) { 37 result += sep; 38 result += hex(s.charAt(i)); 39 } 40 return result; 41 } 42 43 public static String replace(String source, String toBeReplaced, String replacement) { 44 StringBuffer results = new StringBuffer(); 45 int len = toBeReplaced.length(); 46 for (int i = 0; i < source.length(); ++i) { 47 if (source.regionMatches(false, i, toBeReplaced, 0, len)) { 48 results.append(replacement); 49 i += len - 1; // minus one, since we will increment 50 } else { 51 results.append(source.charAt(i)); 52 } 53 } 54 return results.toString(); 55 } 56 57 public static String replaceAll(String source, UnicodeSet set, String replacement) { 58 StringBuffer results = new StringBuffer(); 59 int cp; 60 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 61 cp = UTF16.charAt(source,i); 62 if (set.contains(cp)) { 63 results.append(replacement); 64 } else { 65 UTF16.append(results, cp); 66 } 67 } 68 return results.toString(); 69 } 70 71 // COMMENTED OUT ALL THE OLD SCRIPT STUFF 72 /* 73 public static byte getScript(char c) { 74 return getScript(getBlock(c)); 75 } 76 77 public static byte getScript(byte block) { 78 return blockToScript[block]; 79 } 80 81 public static byte getBlock(char c) { 82 int index = c >> 7; 83 byte block = charToBlock[index]; 84 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries 85 int[] tuple = split[-block-1]; 86 if (c < tuple[0]) block = (byte)tuple[1]; 87 else block = (byte)tuple[2]; 88 } 89 return block; 90 } 91 92 // returns next letter of script, or 0xFFFF if done 93 94 public static char getNextLetter(char c, byte script) { 95 while (c < 0xFFFF) { 96 ++c; 97 if (getScript(c) == script && Character.isLetter(c)) { 98 return c; 99 } 100 } 101 return c; 102 } 103 104 // Supplements to Character methods; these methods go through 105 // UCharacter if possible. If not, they fall back to Character. 106 107 public static boolean isUnassigned(char c) { 108 try { 109 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED; 110 } catch (NullPointerException e) { 111 System.out.print(""); 112 } 113 return Character.getType(c) == Character.UNASSIGNED; 114 } 115 116 public static boolean isLetter(char c) { 117 try { 118 return UCharacter.isLetter(c); 119 } catch (NullPointerException e) { 120 System.out.print(""); 121 } 122 return Character.isLetter(c); 123 } 124 125 public static void main(String[] args) { 126 System.out.println("Blocks: "); 127 byte lastblock = -128; 128 for (char cc = 0; cc < 0xFFFF; ++cc) { 129 byte block = TestUtility.getBlock(cc); 130 if (block != lastblock) { 131 System.out.println(TestUtility.hex(cc) + "\t" + block); 132 lastblock = block; 133 } 134 } 135 System.out.println(); 136 System.out.println("Scripts: "); 137 byte lastScript = -128; 138 for (char cc = 0; cc < 0xFFFF; ++cc) { 139 byte script = TestUtility.getScript(cc); 140 if (script != lastScript) { 141 System.out.println(TestUtility.hex(cc) + "\t" + script); 142 lastScript = script; 143 } 144 } 145 } 146 147 148 149 public static final byte // SCRIPT CODE 150 COMMON_SCRIPT = 0, 151 LATIN_SCRIPT = 1, 152 GREEK_SCRIPT = 2, 153 CYRILLIC_SCRIPT = 3, 154 ARMENIAN_SCRIPT = 4, 155 HEBREW_SCRIPT = 5, 156 ARABIC_SCRIPT = 6, 157 SYRIAC_SCRIPT = 7, 158 THAANA_SCRIPT = 8, 159 DEVANAGARI_SCRIPT = 9, 160 BENGALI_SCRIPT = 10, 161 GURMUKHI_SCRIPT = 11, 162 GUJARATI_SCRIPT = 12, 163 ORIYA_SCRIPT = 13, 164 TAMIL_SCRIPT = 14, 165 TELUGU_SCRIPT = 15, 166 KANNADA_SCRIPT = 16, 167 MALAYALAM_SCRIPT = 17, 168 SINHALA_SCRIPT = 18, 169 THAI_SCRIPT = 19, 170 LAO_SCRIPT = 20, 171 TIBETAN_SCRIPT = 21, 172 MYANMAR_SCRIPT = 22, 173 GEORGIAN_SCRIPT = 23, 174 JAMO_SCRIPT = 24, 175 HANGUL_SCRIPT = 25, 176 ETHIOPIC_SCRIPT = 26, 177 CHEROKEE_SCRIPT = 27, 178 ABORIGINAL_SCRIPT = 28, 179 OGHAM_SCRIPT = 29, 180 RUNIC_SCRIPT = 30, 181 KHMER_SCRIPT = 31, 182 MONGOLIAN_SCRIPT = 32, 183 HIRAGANA_SCRIPT = 33, 184 KATAKANA_SCRIPT = 34, 185 BOPOMOFO_SCRIPT = 35, 186 HAN_SCRIPT = 36, 187 YI_SCRIPT = 37; 188 189 public static final byte // block code 190 RESERVED_BLOCK = 0, 191 BASIC_LATIN = 1, 192 LATIN_1_SUPPLEMENT = 2, 193 LATIN_EXTENDED_A = 3, 194 LATIN_EXTENDED_B = 4, 195 IPA_EXTENSIONS = 5, 196 SPACING_MODIFIER_LETTERS = 6, 197 COMBINING_DIACRITICAL_MARKS = 7, 198 GREEK = 8, 199 CYRILLIC = 9, 200 ARMENIAN = 10, 201 HEBREW = 11, 202 ARABIC = 12, 203 SYRIAC = 13, 204 THAANA = 14, 205 DEVANAGARI = 15, 206 BENGALI = 16, 207 GURMUKHI = 17, 208 GUJARATI = 18, 209 ORIYA = 19, 210 TAMIL = 20, 211 TELUGU = 21, 212 KANNADA = 22, 213 MALAYALAM = 23, 214 SINHALA = 24, 215 THAI = 25, 216 LAO = 26, 217 TIBETAN = 27, 218 MYANMAR = 28, 219 GEORGIAN = 29, 220 HANGUL_JAMO = 30, 221 ETHIOPIC = 31, 222 CHEROKEE = 32, 223 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, 224 OGHAM = 34, 225 RUNIC = 35, 226 KHMER = 36, 227 MONGOLIAN = 37, 228 LATIN_EXTENDED_ADDITIONAL = 38, 229 GREEK_EXTENDED = 39, 230 GENERAL_PUNCTUATION = 40, 231 SUPERSCRIPTS_AND_SUBSCRIPTS = 41, 232 CURRENCY_SYMBOLS = 42, 233 COMBINING_MARKS_FOR_SYMBOLS = 43, 234 LETTERLIKE_SYMBOLS = 44, 235 NUMBER_FORMS = 45, 236 ARROWS = 46, 237 MATHEMATICAL_OPERATORS = 47, 238 MISCELLANEOUS_TECHNICAL = 48, 239 CONTROL_PICTURES = 49, 240 OPTICAL_CHARACTER_RECOGNITION = 50, 241 ENCLOSED_ALPHANUMERICS = 51, 242 BOX_DRAWING = 52, 243 BLOCK_ELEMENTS = 53, 244 GEOMETRIC_SHAPES = 54, 245 MISCELLANEOUS_SYMBOLS = 55, 246 DINGBATS = 56, 247 BRAILLE_PATTERNS = 57, 248 CJK_RADICALS_SUPPLEMENT = 58, 249 KANGXI_RADICALS = 59, 250 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, 251 CJK_SYMBOLS_AND_PUNCTUATION = 61, 252 HIRAGANA = 62, 253 KATAKANA = 63, 254 BOPOMOFO = 64, 255 HANGUL_COMPATIBILITY_JAMO = 65, 256 KANBUN = 66, 257 BOPOMOFO_EXTENDED = 67, 258 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, 259 CJK_COMPATIBILITY = 69, 260 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, 261 CJK_UNIFIED_IDEOGRAPHS = 71, 262 YI_SYLLABLES = 72, 263 YI_RADICALS = 73, 264 HANGUL_SYLLABLES = 74, 265 HIGH_SURROGATES = 75, 266 HIGH_PRIVATE_USE_SURROGATES = 76, 267 LOW_SURROGATES = 77, 268 PRIVATE_USE = 78, 269 CJK_COMPATIBILITY_IDEOGRAPHS = 79, 270 ALPHABETIC_PRESENTATION_FORMS = 80, 271 ARABIC_PRESENTATION_FORMS_A = 81, 272 COMBINING_HALF_MARKS = 82, 273 CJK_COMPATIBILITY_FORMS = 83, 274 SMALL_FORM_VARIANTS = 84, 275 ARABIC_PRESENTATION_FORMS_B = 85, 276 SPECIALS = 86, 277 HALFWIDTH_AND_FULLWIDTH_FORMS = 87; 278 279 static final byte[] blockToScript = { 280 COMMON_SCRIPT, // 0, <RESERVED_BLOCK> 281 LATIN_SCRIPT, // 1, BASIC_LATIN 282 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT 283 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A 284 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B 285 LATIN_SCRIPT, // 5, IPA_EXTENSIONS 286 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS 287 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS 288 GREEK_SCRIPT, // 8, GREEK 289 CYRILLIC_SCRIPT, // 9, CYRILLIC 290 ARMENIAN_SCRIPT, // 10, ARMENIAN 291 HEBREW_SCRIPT, // 11, HEBREW 292 ARABIC_SCRIPT, // 12, ARABIC 293 SYRIAC_SCRIPT, // 13, SYRIAC 294 THAANA_SCRIPT, // 14, THAANA 295 DEVANAGARI_SCRIPT, // 15, DEVANAGARI 296 BENGALI_SCRIPT, // 16, BENGALI 297 GURMUKHI_SCRIPT, // 17, GURMUKHI 298 GUJARATI_SCRIPT, // 18, GUJARATI 299 ORIYA_SCRIPT, // 19, ORIYA 300 TAMIL_SCRIPT, // 20, TAMIL 301 TELUGU_SCRIPT, // 21, TELUGU 302 KANNADA_SCRIPT, // 22, KANNADA 303 MALAYALAM_SCRIPT, // 23, MALAYALAM 304 SINHALA_SCRIPT, // 24, SINHALA 305 THAI_SCRIPT, // 25, THAI 306 LAO_SCRIPT, // 26, LAO 307 TIBETAN_SCRIPT, // 27, TIBETAN 308 MYANMAR_SCRIPT, // 28, MYANMAR 309 GEORGIAN_SCRIPT, // 29, GEORGIAN 310 JAMO_SCRIPT, // 30, HANGUL_JAMO 311 ETHIOPIC_SCRIPT, // 31, ETHIOPIC 312 CHEROKEE_SCRIPT, // 32, CHEROKEE 313 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 314 OGHAM_SCRIPT, // 34, OGHAM 315 RUNIC_SCRIPT, // 35, RUNIC 316 KHMER_SCRIPT, // 36, KHMER 317 MONGOLIAN_SCRIPT, // 37, MONGOLIAN 318 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL 319 GREEK_SCRIPT, // 39, GREEK_EXTENDED 320 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION 321 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS 322 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS 323 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS 324 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS 325 COMMON_SCRIPT, // 45, NUMBER_FORMS 326 COMMON_SCRIPT, // 46, ARROWS 327 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS 328 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL 329 COMMON_SCRIPT, // 49, CONTROL_PICTURES 330 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION 331 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS 332 COMMON_SCRIPT, // 52, BOX_DRAWING 333 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS 334 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES 335 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS 336 COMMON_SCRIPT, // 56, DINGBATS 337 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS 338 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT 339 HAN_SCRIPT, // 59, KANGXI_RADICALS 340 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS 341 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION 342 HIRAGANA_SCRIPT, // 62, HIRAGANA 343 KATAKANA_SCRIPT, // 63, KATAKANA 344 BOPOMOFO_SCRIPT, // 64, BOPOMOFO 345 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO 346 HAN_SCRIPT, // 66, KANBUN 347 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED 348 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS 349 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY 350 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 351 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS 352 YI_SCRIPT, // 72, YI_SYLLABLES 353 YI_SCRIPT, // 73, YI_RADICALS 354 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES 355 COMMON_SCRIPT, // 75, HIGH_SURROGATES 356 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES 357 COMMON_SCRIPT, // 77, LOW_SURROGATES 358 COMMON_SCRIPT, // 78, PRIVATE_USE 359 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS 360 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS 361 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A 362 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS 363 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS 364 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS 365 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B 366 COMMON_SCRIPT, // 86, SPECIALS 367 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS 368 COMMON_SCRIPT, // 88, SPECIALS 369 }; 370 371 // could be further reduced to a byte array, but I didn't bother. 372 static final int[][] split = { 373 {0x0250, 4, 5}, // -1 374 {0x02B0, 5, 6}, // -2 375 {0x0370, 7, 8}, // -3 376 {0x0530, 0, 10}, // -4 377 {0x0590, 10, 11}, // -5 378 {0x0750, 13, 0}, // -6 379 {0x07C0, 14, 0}, // -7 380 {0x10A0, 28, 29}, // -8 381 {0x13A0, 0, 32}, // -9 382 {0x16A0, 34, 35}, // -10 383 {0x18B0, 37, 0}, // -11 384 {0x2070, 40, 41}, // -12 385 {0x20A0, 41, -31}, // -13 386 {0x2150, 44, 45}, // -14 387 {0x2190, 45, 46}, // -15 388 {0x2440, 49, -32}, // -16 389 {0x25A0, 53, 54}, // -17 390 {0x27C0, 56, 0}, // -18 391 {0x2FE0, 59, -33}, // -19 392 {0x3040, 61, 62}, // -20 393 {0x30A0, 62, 63}, // -21 394 {0x3130, 64, 65}, // -22 395 {0x3190, 65, -34}, // -23 396 {0x4DB6, 70, 0}, // -24 397 {0xA490, 72, -35}, // -25 398 {0xD7A4, 74, 0}, // -26 399 {0xFB50, 80, 81}, // -27 400 {0xFE20, 0, -36}, // -28 401 {0xFEFF, 85, 86}, // -29 402 {0xFFF0, 87, -37}, // -30 403 {0x20D0, 42, 43}, // -31 404 {0x2460, 50, 51}, // -32 405 {0x2FF0, 0, 60}, // -33 406 {0x31A0, 66, -38}, // -34 407 {0xA4D0, 73, 0}, //-35 408 {0xFE30, 82, -39}, //-36 409 {0xFFFE, 88, 0}, //-37 410 {0x31C0, 67, 0}, // -38 411 {0xFE50, 83, -40}, //-39 412 {0xFE70, 84, 85} // -40 413 }; 414 415 static final byte[] charToBlock = { 416 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7, 417 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27, 418 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36, 419 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39, 420 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18, 421 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19, 422 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 423 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 424 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 425 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71, 426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 430 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 431 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 432 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 433 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 434 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 435 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 436 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0, 437 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74, 438 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 439 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 440 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 441 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 442 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26, 443 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77, 444 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 445 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 446 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 447 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30 448 }; 449 */ 450 } 451