1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2008-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.collator; 10 import java.util.ArrayList; 11 import java.util.Arrays; 12 import java.util.Collection; 13 import java.util.Iterator; 14 import java.util.LinkedHashSet; 15 import java.util.List; 16 import java.util.Locale; 17 import java.util.Set; 18 import java.util.TreeSet; 19 20 import org.junit.Test; 21 22 import com.ibm.icu.dev.test.TestFmwk; 23 import com.ibm.icu.dev.util.CollectionUtilities; 24 import com.ibm.icu.impl.ICUDebug; 25 import com.ibm.icu.impl.Row; 26 import com.ibm.icu.impl.Row.R4; 27 import com.ibm.icu.lang.UCharacter; 28 import com.ibm.icu.lang.UProperty; 29 import com.ibm.icu.lang.UScript; 30 import com.ibm.icu.text.AlphabeticIndex; 31 import com.ibm.icu.text.AlphabeticIndex.Bucket; 32 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType; 33 import com.ibm.icu.text.AlphabeticIndex.ImmutableIndex; 34 import com.ibm.icu.text.AlphabeticIndex.Record; 35 import com.ibm.icu.text.Collator; 36 import com.ibm.icu.text.Normalizer2; 37 import com.ibm.icu.text.RawCollationKey; 38 import com.ibm.icu.text.RuleBasedCollator; 39 import com.ibm.icu.text.UTF16; 40 import com.ibm.icu.text.UnicodeSet; 41 import com.ibm.icu.util.ULocale; 42 43 /** 44 * @author Mark Davis 45 */ 46 public class AlphabeticIndexTest extends TestFmwk { 47 /** 48 * 49 */ 50 private static final String ARROW = "\u2192"; 51 private static final boolean DEBUG = ICUDebug.enabled("alphabeticindex"); 52 53 public static Set<String> KEY_LOCALES = new LinkedHashSet(Arrays.asList( 54 "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", 55 "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", 56 "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr", 57 "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk", 58 "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta", 59 "te", "mr", "ur", "ml", "kn", "gu", "or")); 60 private String[][] localeAndIndexCharactersLists = new String[][] { 61 /* Arabic*/ {"ar", "\u0627:\u0628:\u062A:\u062B:\u062C:\u062D:\u062E:\u062F:\u0630:\u0631:\u0632:\u0633:\u0634:\u0635:\u0636:\u0637:\u0638:\u0639:\u063A:\u0641:\u0642:\u0643:\u0644:\u0645:\u0646:\u0647:\u0648:\u064A"}, 62 /* Bulgarian*/ {"bg", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 63 /* Catalan*/ {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 64 /* Czech*/ {"cs", "A:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\u0158:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 65 /* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 66 /* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 67 /* Greek*/ {"el", "\u0391:\u0392:\u0393:\u0394:\u0395:\u0396:\u0397:\u0398:\u0399:\u039A:\u039B:\u039C:\u039D:\u039E:\u039F:\u03A0:\u03A1:\u03A3:\u03A4:\u03A5:\u03A6:\u03A7:\u03A8:\u03A9"}, 68 /* English*/ {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 69 /* Spanish*/ {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 70 /* Estonian*/ {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:Z:\u017D:T:U:V:\u00D5:\u00C4:\u00D6:\u00DC:X:Y"}, 71 /* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 72 /* Finnish*/ {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 73 /* Filipino*/ {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:Ng:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 74 /* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 75 /* Hebrew*/ {"he", "\u05D0:\u05D1:\u05D2:\u05D3:\u05D4:\u05D5:\u05D6:\u05D7:\u05D8:\u05D9:\u05DB:\u05DC:\u05DE:\u05E0:\u05E1:\u05E2:\u05E4:\u05E6:\u05E7:\u05E8:\u05E9:\u05EA"}, 76 /* Icelandic*/ {"is", "A:\u00C1:B:C:D:\u00D0:E:\u00C9:F:G:H:I:\u00CD:J:K:L:M:N:O:\u00D3:P:Q:R:S:T:U:\u00DA:V:W:X:Y:\u00DD:Z:\u00DE:\u00C6:\u00D6"}, 77 /* Italian*/ {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 78 /* Japanese*/ {"ja", "\u3042:\u304B:\u3055:\u305F:\u306A:\u306F:\u307E:\u3084:\u3089:\u308F"}, 79 /* Korean*/ {"ko", "\u3131:\u3134:\u3137:\u3139:\u3141:\u3142:\u3145:\u3147:\u3148:\u314A:\u314B:\u314C:\u314D:\u314E"}, 80 /* Lithuanian*/ {"lt", "A:B:C:\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\u0160:T:U:V:Z:\u017D"}, 81 /* Latvian*/ {"lv", "A:B:C:\u010C:D:E:F:G:\u0122:H:I:J:K:\u0136:L:\u013B:M:N:\u0145:O:P:Q:R:S:\u0160:T:U:V:W:X:Z:\u017D"}, 82 /* Norwegian Bokm\u00E5l*/ {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 83 /* Dutch*/ {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 84 /* Polish*/ {"pl", "A:\u0104:B:C:\u0106:D:E:\u0118:F:G:H:I:J:K:L:\u0141:M:N:\u0143:O:\u00D3:P:Q:R:S:\u015A:T:U:V:W:X:Y:Z:\u0179:\u017B"}, 85 /* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 86 /* Romanian*/ {"ro", "A:\u0102:\u00C2:B:C:D:E:F:G:H:I:\u00CE:J:K:L:M:N:O:P:Q:R:S:\u0218:T:\u021A:U:V:W:X:Y:Z"}, 87 /* Russian*/ {"ru", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042B:\u042D:\u042E:\u042F"}, 88 /* Slovak*/ {"sk", "A:\u00C4:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\u00D4:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 89 /* Slovenian*/ {"sl", "A:B:C:\u010C:\u0106:D:\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 90 /* Serbian*/ {"sr", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0402:\u0415:\u0416:\u0417:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040B:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 91 /* Swedish*/ {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 92 /* Turkish*/ {"tr", "A:B:C:\u00C7:D:E:F:G:H:I:\u0130:J:K:L:M:N:O:\u00D6:P:Q:R:S:\u015E:T:U:\u00DC:V:W:X:Y:Z"}, 93 /* Ukrainian*/ {"uk", "\u0410:\u0411:\u0412:\u0413:\u0490:\u0414:\u0415:\u0404:\u0416:\u0417:\u0418:\u0406:\u0407:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 94 /* Vietnamese*/ {"vi", "A:\u0102:\u00C2:B:C:D:\u0110:E:\u00CA:F:G:H:I:J:K:L:M:N:O:\u00D4:\u01A0:P:Q:R:S:T:U:\u01AF:V:W:X:Y:Z"}, 95 /* Chinese*/ {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 96 /* Chinese (Traditional Han)*/ {"zh_Hant", "1\u5283:2\u5283:3\u5283:4\u5283:5\u5283:6\u5283:7\u5283:8\u5283:9\u5283:10\u5283:11\u5283:12\u5283:13\u5283:14\u5283:15\u5283:16\u5283:17\u5283:18\u5283:19\u5283:20\u5283:21\u5283:22\u5283:23\u5283:24\u5283:25\u5283:26\u5283:27\u5283:28\u5283:29\u5283:30\u5283:31\u5283:32\u5283:33\u5283:35\u5283:36\u5283:39\u5283:48\u5283"}, 97 98 // Comment these out to make the test run faster. Later, make these run under extended 99 100 // /* Afrikaans*/ {"af", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 101 // /* Akan*/ {"ak", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:O:\u0186:P:Q:R:S:T:U:V:W:X:Y:Z"}, 102 // /* Asu*/ {"asa", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 103 // /* Azerbaijani*/ {"az", "A:B:C:\u00C7:D:E:\u018F:F:G:\u011E:H:X:I:\u0130:J:K:Q:L:M:N:O:\u00D6:P:R:S:\u015E:T:U:\u00DC:V:W:Y:Z"}, 104 // /* Belarusian*/ {"be", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0406:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u042B:\u042D:\u042E:\u042F"}, 105 // /* Bemba*/ {"bem", "A:B:C:E:F:G:I:J:K:L:M:N:O:P:S:T:U:W:Y"}, 106 // /* Bena*/ {"bez", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:Y:Z"}, 107 // /* Bambara*/ {"bm", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:\u0186:P:R:S:T:U:W:Y:Z"}, 108 // /* Tibetan*/ {"bo", "\u0F40:\u0F41:\u0F42:\u0F44:\u0F45:\u0F46:\u0F47:\u0F49:\u0F4F:\u0F50:\u0F51:\u0F53:\u0F54:\u0F55:\u0F56:\u0F58:\u0F59:\u0F5A:\u0F5B:\u0F5D:\u0F5E:\u0F5F:\u0F60:\u0F61:\u0F62:\u0F63:\u0F64:\u0F66:\u0F67:\u0F68"}, 109 // /* Chiga*/ {"cgg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 110 // /* Cherokee*/ {"chr", "\u13A0:\u13A6:\u13AD:\u13B3:\u13B9:\u13BE:\u13C6:\u13CC:\u13D3:\u13DC:\u13E3:\u13E9:\u13EF"}, 111 // /* Welsh*/ {"cy", "A:B:C:CH:D:E:F:FF:G:H:I:J:L:LL:M:N:O:P:PH:R:RH:S:T:TH:U:W:Y"}, 112 // /* Taita*/ {"dav", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 113 // /* Embu*/ {"ebu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 114 // /* Ewe*/ {"ee", "A:B:C:D:\u0189:E:\u0190:F:\u0191:G:\u0194:H:I:J:K:L:M:N:\u014A:O:\u0186:P:Q:R:S:T:U:V:\u01B2:W:X:Y:Z"}, 115 // /* Esperanto*/ {"eo", "A:B:C:\u0108:D:E:F:G:\u011C:H:\u0124:I:J:\u0134:K:L:M:N:O:P:R:S:\u015C:T:U:\u016C:V:Z"}, 116 // /* Fulah*/ {"ff", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:L:M:N:\u014A:O:P:R:S:T:U:W:Y:\u01B3"}, 117 // /* Faroese*/ {"fo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8"}, 118 // /* Gusii*/ {"guz", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 119 // /* Hausa*/ {"ha", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:\u0198:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 120 // /* Igbo*/ {"ig", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 121 // /* Machame*/ {"jmc", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 122 // /* Kabyle*/ {"kab", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:P:Q:R:S:T:U:W:X:Y:Z"}, 123 // /* Kamba*/ {"kam", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 124 // /* Makonde*/ {"kde", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 125 // /* Kabuverdianu*/ {"kea", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:X:Z"}, 126 // /* Koyra Chiini*/ {"khq", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 127 // /* Kikuyu*/ {"ki", "A:B:C:D:E:G:H:I:J:K:M:N:O:R:T:U:W:Y"}, 128 // /* Kalenjin*/ {"kln", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:W:Y"}, 129 // /* Langi*/ {"lag", "A:B:C:D:E:F:G:H:I:\u0197:J:K:L:M:N:O:P:Q:R:S:T:U:\u0244:V:W:X:Y:Z"}, 130 // /* Ganda*/ {"lg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 131 // /* Luo*/ {"luo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 132 // /* Luyia*/ {"luy", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 133 // /* Masai*/ {"mas", "A:B:C:D:E:\u0190:G:H:I:\u0197:J:K:L:M:N:\u014A:O:\u0186:P:R:S:T:U:\u0244:W:Y"}, 134 // /* Meru*/ {"mer", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 135 // /* Morisyen*/ {"mfe", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y:Z"}, 136 // /* Malagasy*/ {"mg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:V:Y:Z"}, 137 // This should be the correct data. Commented till it is fixed in CLDR collation data. 138 // {"mk", "\u0410:\u0411:\u0412:\u0413:\u0403:\u0414:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u040C:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 139 // /* Macedonian*/ {"mk", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0403:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040C:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 140 // This should be the correct data. Commented till it is fixed in CLDR collation data. 141 // {"mt", "A:B:C:\u010A:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 142 // /* Maltese*/ {"mt", "A:B:\u010A:C:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 143 // /* Nama*/ {"naq", "A:B:C:D:E:F:G:H:I:K:M:N:O:P:Q:R:S:T:U:W:X:Y:Z"}, 144 // /* North Ndebele*/ {"nd", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:S:T:U:V:W:X:Y:Z"}, 145 // /* Norwegian Nynorsk*/ {"nn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 146 // /* Nyankole*/ {"nyn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 147 // /* Oromo*/ {"om", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 148 // /* Romansh*/ {"rm", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 149 // /* Rombo*/ {"rof", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 150 // /* Kinyarwanda*/ {"rw", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 151 // /* Rwa*/ {"rwk", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 152 // /* Samburu*/ {"saq", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 153 // /* Sena*/ {"seh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 154 // /* Koyraboro Senni*/ {"ses", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 155 // /* Sango*/ {"sg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 156 // /* Tachelhit*/ {"shi", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 157 // /* Tachelhit (Tifinagh)*/ {"shi_Tfng", "\u2D30:\u2D31:\u2D33:\u2D37:\u2D39:\u2D3B:\u2D3C:\u2D3D:\u2D40:\u2D43:\u2D44:\u2D45:\u2D47:\u2D49:\u2D4A:\u2D4D:\u2D4E:\u2D4F:\u2D53:\u2D54:\u2D55:\u2D56:\u2D59:\u2D5A:\u2D5B:\u2D5C:\u2D5F:\u2D61:\u2D62:\u2D63:\u2D65"}, 158 // /* Shona*/ {"sn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 159 // /* Teso*/ {"teo", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y"}, 160 // /* Tonga*/ {"to", "A:B:C:D:E:F:G:H:\u02BB:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 161 // /* Central Morocco Tamazight*/ {"tzm", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 162 // /* Uzbek (Latin)*/ {"uz_Latn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u02BF"}, 163 // /* Vunjo*/ {"vun", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 164 // /* Soga*/ {"xog", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 165 // /* Yoruba*/ {"yo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 166 167 }; 168 169 // public void TestAAKeyword() { 170 // ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( 171 // ICUResourceBundle.ICU_COLLATION_BASE_NAME, "zh"); 172 // showBundle(rb, 0); 173 // String[] keywords = Collator.getKeywords(); 174 // System.out.println(Arrays.asList(keywords)); 175 // String locale = "zh"; 176 // ULocale ulocale = new ULocale(locale); 177 // for (String keyword : keywords) { 178 // List<String> values = Arrays.asList(Collator.getKeywordValuesForLocale(keyword, ulocale, false)); 179 // List<String> allValues = Arrays.asList(Collator.getKeywordValues(keyword)); 180 // for (String value : allValues) { 181 // System.out.println(keyword + "=" + value); 182 // checkKeyword(locale, value, values.contains(value)); 183 // } 184 // } 185 // } 186 // 187 // private void checkKeyword(String locale, String collationValue, boolean shouldExist) { 188 // final ULocale base = new ULocale(locale); 189 // final ULocale desired = new ULocale(locale + "@collation=" + collationValue); 190 // Collator foo = Collator.getInstance(desired); 191 // ULocale actual = foo.getLocale(ULocale.ACTUAL_LOCALE); 192 // if (shouldExist) { 193 // assertEquals("actual should match desired", desired, actual); 194 // } else { 195 // assertEquals("actual should match base", base, actual); 196 // } 197 // int comp = foo.compare("a", ""); 198 // assertEquals("should fall back to default for zh", -1, comp); 199 // } 200 // 201 // /** 202 // * @param rb 203 // * @param i 204 // */ 205 // private static void showBundle(UResourceBundle rb, int i) { 206 // for (String key : rb.keySet()) { 207 // System.out.print("\n" + Utility.repeat(" ", i) + key); 208 // UResourceBundle rb2 = rb.get(key); 209 // showBundle(rb2, i+1); 210 // } 211 // } 212 213 214 @Test 215 public void TestA() { 216 String[][] tests = {{"zh_Hant", "", "12"}, 217 {"zh", "", "D"} 218 /*, "zh@collation=unihan", "ja@collation=unihan", "ko@collation=unihan"*/ 219 }; 220 for (String[] test : tests) { 221 AlphabeticIndex<Integer> alphabeticIndex = new AlphabeticIndex<Integer>(new ULocale(test[0])); 222 final String probe = test[1]; 223 final String expectedLabel = test[2]; 224 alphabeticIndex.addRecord(probe, 1); 225 List labels = alphabeticIndex.getBucketLabels(); 226 logln(labels.toString()); 227 Bucket<Integer> bucket = find(alphabeticIndex, probe); 228 assertEquals("locale " + test[0] + " name=" + probe + " in bucket", 229 expectedLabel, bucket.getLabel()); 230 } 231 } 232 233 private Bucket<Integer> find(AlphabeticIndex<Integer> alphabeticIndex, final String probe) { 234 for (Bucket<Integer> bucket : alphabeticIndex) { 235 for (Record<Integer> record : bucket) { 236 if (record.getName().equals(probe)) { 237 return bucket; 238 } 239 } 240 } 241 return null; 242 } 243 244 @Test 245 public void TestFirstCharacters() { 246 247 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH); 248 RuleBasedCollator collator = alphabeticIndex.getCollator(); 249 collator.setStrength(Collator.IDENTICAL); 250 Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts(); 251 // Verify that each script is represented exactly once. 252 // Exclude pseudo-scripts like Common (no letters). 253 // Exclude scripts like Braille and Sutton SignWriting 254 // because they only have symbols, not letters. 255 UnicodeSet missingScripts = new UnicodeSet( 256 "[^[:inherited:][:unknown:][:common:][:Braille:][:SignWriting:]]"); 257 String last = ""; 258 for (String index : firsts) { 259 if (collator.compare(last,index) >= 0) { 260 errln("Characters not in order: " + last + " !< " + index); 261 } 262 int script = getFirstRealScript(index); 263 if (script == UScript.UNKNOWN) { continue; } 264 UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script); 265 if (missingScripts.containsNone(s)) { 266 errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false)); 267 } 268 missingScripts.removeAll(s); 269 } 270 if (missingScripts.size() != 0) { 271 String missingScriptNames = ""; 272 UnicodeSet missingChars = new UnicodeSet(missingScripts); 273 for(;;) { 274 int c = missingChars.charAt(0); 275 if (c < 0) { 276 break; 277 } 278 int script = UScript.getScript(c); 279 missingScriptNames += " " + 280 UCharacter.getPropertyValueName( 281 UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); 282 missingChars.removeAll(new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script)); 283 } 284 errln("Missing character from:" + missingScriptNames + " -- " + missingScripts); 285 } 286 } 287 288 private static final int getFirstRealScript(CharSequence s) { 289 for (int i = 0; i < s.length();) { 290 int c = Character.codePointAt(s, i); 291 int script = UScript.getScript(c); 292 if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) { 293 return script; 294 } 295 i += Character.charCount(c); 296 } 297 return UScript.UNKNOWN; 298 } 299 300 @Test 301 public void TestBuckets() { 302 ULocale additionalLocale = ULocale.ENGLISH; 303 304 for (String[] pair : localeAndIndexCharactersLists) { 305 checkBuckets(pair[0], SimpleTests, additionalLocale, "E", "edgar", "Effron", "Effron"); 306 } 307 } 308 309 @Test 310 public void TestEmpty() { 311 // just verify that it doesn't blow up. 312 Set<ULocale> locales = new LinkedHashSet<ULocale>(); 313 locales.add(ULocale.ROOT); 314 locales.addAll(Arrays.asList(ULocale.getAvailableLocales())); 315 for (ULocale locale : locales) { 316 try { 317 AlphabeticIndex<String> alphabeticIndex = new AlphabeticIndex(locale); 318 alphabeticIndex.addRecord("hi", "HI"); 319 for (Bucket<String> bucket : alphabeticIndex) { 320 @SuppressWarnings("unused") 321 LabelType labelType = bucket.getLabelType(); 322 } 323 } catch (Exception e) { 324 errln("Exception when creating AlphabeticIndex for:\t" + locale.toLanguageTag()); 325 errln(e.toString()); 326 } 327 } 328 } 329 330 @Test 331 public void TestSetGetSpecialLabels() { 332 AlphabeticIndex index = new AlphabeticIndex(Locale.GERMAN).addLabels(new Locale("ru")); 333 index.setUnderflowLabel("__"); 334 index.setInflowLabel("--"); 335 index.setOverflowLabel("^^"); 336 assertEquals("underflow label", "__", index.getUnderflowLabel()); 337 assertEquals("inflow label", "--", index.getInflowLabel()); 338 assertEquals("overflow label", "^^", index.getOverflowLabel()); 339 340 ImmutableIndex ii = index.buildImmutableIndex(); 341 assertEquals("0 -> underflow", "__", ii.getBucket(ii.getBucketIndex("0")).getLabel()); 342 assertEquals(" -> inflow", "--", ii.getBucket(ii.getBucketIndex("")).getLabel()); 343 assertEquals(" -> overflow", "^^", ii.getBucket(ii.getBucketIndex("")).getLabel()); 344 } 345 346 @Test 347 public void TestInflow() { 348 Object[][] tests = { 349 {0, ULocale.ENGLISH}, 350 {0, ULocale.ENGLISH, new ULocale("el")}, 351 {1, ULocale.ENGLISH, new ULocale("ru")}, 352 {0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru")}, 353 {0, ULocale.ENGLISH}, 354 {2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE}, 355 }; 356 for (Object[] test : tests) { 357 int expected = (Integer) test[0]; 358 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex((ULocale)test[1]); 359 for (int i = 2; i < test.length; ++i) { 360 if (test[i] instanceof ULocale) { 361 alphabeticIndex.addLabels((ULocale)test[i]); 362 } else { 363 alphabeticIndex.addLabels((UnicodeSet)test[i]); 364 } 365 } 366 Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter(); 367 for (Bucket<Double> bucket : alphabeticIndex) { 368 LabelType labelType = bucket.getLabelType(); 369 counter.add(labelType, 1); 370 } 371 String printList = Arrays.asList(test).toString(); 372 assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW)); 373 assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW)); 374 if (expected != counter.get(LabelType.INFLOW)) { 375 // for debugging 376 AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex((ULocale)test[1]); 377 for (int i = 2; i < test.length; ++i) { 378 if (test[i] instanceof ULocale) { 379 indexCharacters2.addLabels((ULocale)test[i]); 380 } else { 381 indexCharacters2.addLabels((UnicodeSet)test[i]); 382 } 383 } 384 List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>()); 385 logln(buckets.toString()); 386 } 387 assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW)); 388 } 389 } 390 391 private void checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items) { 392 StringBuilder UI = new StringBuilder(); 393 ULocale desiredLocale = new ULocale(localeString); 394 395 // Create a simple index where the values for the strings are Integers, and add the strings 396 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale); 397 int counter = 0; 398 Counter<String> itemCount = new Counter(); 399 for (String item : test) { 400 index.addRecord(item, counter++); 401 itemCount.add(item, 1); 402 } 403 assertEquals("getRecordCount()", (int)itemCount.getTotal(), index.getRecordCount()); // code coverage 404 405 List<String> labels = index.getBucketLabels(); 406 ImmutableIndex<Integer> immIndex = index.buildImmutableIndex(); 407 408 logln(desiredLocale + "\t" + desiredLocale.getDisplayName(ULocale.ENGLISH) + " - " + desiredLocale.getDisplayName(desiredLocale) + "\t" 409 + index.getCollator().getLocale(ULocale.ACTUAL_LOCALE)); 410 UI.setLength(0); 411 UI.append(desiredLocale + "\t"); 412 boolean showAll = true; 413 414 // Show index at top. We could skip or gray out empty buckets 415 for (AlphabeticIndex.Bucket<Integer> bucket : index) { 416 if (showAll || bucket.size() != 0) { 417 showLabelAtTop(UI, bucket.getLabel()); 418 } 419 } 420 logln(UI.toString()); 421 422 // Show the buckets with their contents, skipping empty buckets 423 int bucketIndex = 0; 424 for (Bucket<Integer> bucket : index) { 425 assertEquals("bucket label vs. iterator", 426 labels.get(bucketIndex), bucket.getLabel()); 427 assertEquals("bucket label vs. immutable", 428 labels.get(bucketIndex), immIndex.getBucket(bucketIndex).getLabel()); 429 assertEquals("bucket label type vs. immutable", 430 bucket.getLabelType(), immIndex.getBucket(bucketIndex).getLabelType()); 431 for (Record<Integer> r : bucket) { 432 CharSequence name = r.getName(); 433 assertEquals("getBucketIndex(" + name + ")", 434 bucketIndex, index.getBucketIndex(name)); 435 assertEquals("immutable getBucketIndex(" + name + ")", 436 bucketIndex, immIndex.getBucketIndex(name)); 437 } 438 if (bucket.getLabel().equals(testBucket)) { 439 Counter<String> keys = getKeys(bucket); 440 for (String item : items) { 441 long globalCount = itemCount.get(item); 442 long localeCount = keys.get(item); 443 if (globalCount != localeCount) { 444 errln("Error: in " + "'" + testBucket + "', '" + item + "' should have count " 445 + globalCount + " but has count " + localeCount); 446 } 447 448 } 449 } 450 451 if (bucket.size() != 0) { 452 showLabelInList(UI, bucket.getLabel()); 453 for (AlphabeticIndex.Record<Integer> item : bucket) { 454 showIndexedItem(UI, item.getName(), item.getData()); 455 } 456 logln(UI.toString()); 457 } 458 ++bucketIndex; 459 } 460 assertEquals("getBucketCount()", bucketIndex, index.getBucketCount()); 461 assertEquals("immutable getBucketCount()", bucketIndex, immIndex.getBucketCount()); 462 463 assertNull("immutable getBucket(-1)", immIndex.getBucket(-1)); 464 assertNull("immutable getBucket(count)", immIndex.getBucket(bucketIndex)); 465 466 for (Bucket<Integer> bucket : immIndex) { 467 assertEquals("immutable bucket size", 0, bucket.size()); 468 assertFalse("immutable bucket iterator.hasNext()", bucket.iterator().hasNext()); 469 } 470 } 471 472 public <T> void showIndex(AlphabeticIndex<T> index, boolean showEmpty) { 473 logln("Actual"); 474 StringBuilder UI = new StringBuilder(); 475 for (Bucket<T> bucket : index) { 476 if (showEmpty || bucket.size() != 0) { 477 showLabelInList(UI, bucket.getLabel()); 478 for (Record<T> item : bucket) { 479 showIndexedItem(UI, item.getName(), item.getData()); 480 } 481 logln(UI.toString()); 482 } 483 } 484 } 485 486 /** 487 * @param myBucketLabels 488 * @param myBucketContents 489 * @param b 490 */ 491 private void showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty) { 492 logln("Alternative"); 493 StringBuilder UI = new StringBuilder(); 494 495 for (int i = 0; i < myBucketLabels.size(); ++i) { 496 Set<R4<RawCollationKey, String, Integer, Double>> bucket = myBucketContents.get(i); 497 if (!showEmpty && bucket.size() == 0) { 498 continue; 499 } 500 UI.setLength(0); 501 UI.append("*").append(myBucketLabels.get(i)); 502 for (R4<RawCollationKey, String, Integer, Double> item : bucket) { 503 UI.append("\t ").append(item.get1().toString()).append(ARROW).append(item.get3().toString()); 504 } 505 logln(UI.toString()); 506 } 507 } 508 509 private void showLabelAtTop(StringBuilder buffer, String label) { 510 buffer.append(label + " "); 511 } 512 513 private <T> void showIndexedItem(StringBuilder buffer, CharSequence key, T value) { 514 buffer.append("\t " + key + ARROW + value); 515 } 516 517 private void showLabelInList(StringBuilder buffer, String label) { 518 buffer.setLength(0); 519 buffer.append(label); 520 } 521 522 private Counter<String> getKeys(AlphabeticIndex.Bucket<Integer> entry) { 523 Counter<String> keys = new Counter<String>(); 524 for (AlphabeticIndex.Record x : entry) { 525 String key = x.getName().toString(); 526 keys.add(key, 1); 527 } 528 return keys; 529 } 530 531 @Test 532 public void TestIndexCharactersList() { 533 for (String[] localeAndIndexCharacters : localeAndIndexCharactersLists) { 534 ULocale locale = new ULocale(localeAndIndexCharacters[0]); 535 String expectedIndexCharacters = "\u2026:" + localeAndIndexCharacters[1] + ":\u2026"; 536 Collection<String> alphabeticIndex = new AlphabeticIndex(locale).getBucketLabels(); 537 538 // Join the elements of the list to a string with delimiter ":" 539 StringBuilder sb = new StringBuilder(); 540 Iterator<String> iter = alphabeticIndex.iterator(); 541 while (iter.hasNext()) { 542 sb.append(iter.next()); 543 if (!iter.hasNext()) { 544 break; 545 } 546 sb.append(":"); 547 } 548 String actualIndexCharacters = sb.toString(); 549 if (!expectedIndexCharacters.equals(actualIndexCharacters)) { 550 errln("Test failed for locale " + localeAndIndexCharacters[0] + 551 "\n Expected = |" + expectedIndexCharacters + "|\n actual = |" + actualIndexCharacters + "|"); 552 } 553 } 554 } 555 556 @Test 557 public void TestBasics() { 558 ULocale[] list = ULocale.getAvailableLocales(); 559 // get keywords combinations 560 // don't bother with multiple combinations at this point 561 List keywords = new ArrayList(); 562 keywords.add(""); 563 564 String[] collationValues = Collator.getKeywordValues("collation"); 565 for (int j = 0; j < collationValues.length; ++j) { 566 keywords.add("@collation=" + collationValues[j]); 567 } 568 569 for (int i = 0; i < list.length; ++i) { 570 for (Iterator it = keywords.iterator(); it.hasNext();) { 571 String collationValue = (String) it.next(); 572 String localeString = list[i].toString(); 573 if (!KEY_LOCALES.contains(localeString)) continue; // TODO change in exhaustive 574 ULocale locale = new ULocale(localeString + collationValue); 575 if (collationValue.length() > 0 && !Collator.getFunctionalEquivalent("collation", locale).equals(locale)) { 576 //logln("Skipping " + locale); 577 continue; 578 } 579 580 if (locale.getCountry().length() != 0) { 581 continue; 582 } 583 boolean isUnihan = collationValue.contains("unihan"); 584 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale); 585 if (isUnihan) { 586 // Unihan tailorings have a label per radical, and there are at least 214, 587 // if not more when simplified radicals are distinguished. 588 alphabeticIndex.setMaxLabelCount(500); 589 } 590 final Collection mainChars = alphabeticIndex.getBucketLabels(); 591 String mainCharString = mainChars.toString(); 592 if (mainCharString.length() > 500) { 593 mainCharString = mainCharString.substring(0,500) + "..."; 594 } 595 logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH)); 596 logln("Index:\t" + mainCharString); 597 if (!isUnihan && mainChars.size() > 100) { 598 errln("Index character set too large: " + 599 locale + " [" + mainChars.size() + "]:\n " + mainChars); 600 } 601 } 602 } 603 } 604 605 @Test 606 public void TestClientSupport() { 607 for (String localeString : new String[] {"zh"}) { // KEY_LOCALES, new String[] {"zh"} 608 ULocale ulocale = new ULocale(localeString); 609 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>(ulocale).addLabels(Locale.ENGLISH); 610 RuleBasedCollator collator = alphabeticIndex.getCollator(); 611 String [][] tests; 612 613 if (!localeString.equals("zh") ) { 614 tests = new String[][] {SimpleTests}; 615 } else { 616 tests = new String[][] {SimpleTests, hackPinyin, simplifiedNames}; 617 } 618 619 for (String [] shortTest : tests) { 620 double testValue = 100; 621 alphabeticIndex.clearRecords(); 622 for (String name : shortTest) { 623 alphabeticIndex.addRecord(name, testValue++); 624 } 625 626 if (DEBUG) showIndex(alphabeticIndex, false); 627 628 // make my own copy 629 testValue = 100; 630 List<String> myBucketLabels = alphabeticIndex.getBucketLabels(); 631 ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents = new ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>>(myBucketLabels.size()); 632 for (int i = 0; i < myBucketLabels.size(); ++i) { 633 myBucketContents.add(new TreeSet<R4<RawCollationKey, String, Integer, Double>>()); 634 } 635 for (String name : shortTest) { 636 int bucketIndex = alphabeticIndex.getBucketIndex(name); 637 if (bucketIndex > myBucketContents.size()) { 638 alphabeticIndex.getBucketIndex(name); // call again for debugging 639 } 640 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(bucketIndex); 641 RawCollationKey rawCollationKey = collator.getRawCollationKey(name, null); 642 R4<RawCollationKey, String, Integer, Double> row = Row.of(rawCollationKey, name, name.length(), testValue++); 643 myBucket.add(row); 644 } 645 if (DEBUG) showIndex(myBucketLabels, myBucketContents, false); 646 647 // now compare 648 int index = 0; 649 boolean gotError = false; 650 for (AlphabeticIndex.Bucket<Double> bucket : alphabeticIndex) { 651 String bucketLabel = bucket.getLabel(); 652 String myLabel = myBucketLabels.get(index); 653 if (!bucketLabel.equals(myLabel)) { 654 gotError |= !assertEquals(ulocale + "\tBucket Labels (" + index + ")", bucketLabel, myLabel); 655 } 656 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(index); 657 Iterator<R4<RawCollationKey, String, Integer, Double>> myBucketIterator = myBucket.iterator(); 658 int recordIndex = 0; 659 for (Record<Double> record : bucket) { 660 String myName = null; 661 if (myBucketIterator.hasNext()) { 662 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 663 myName = myRecord.get1(); 664 } 665 if (!record.getName().equals(myName)) { 666 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", record.getName(), myName); 667 } 668 } 669 while (myBucketIterator.hasNext()) { 670 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 671 String myName = myRecord.get1(); 672 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", null, myName); 673 } 674 index++; 675 } 676 if (gotError) { 677 showIndex(myBucketLabels, myBucketContents, false); 678 showIndex(alphabeticIndex, false); 679 } 680 } 681 } 682 } 683 684 @Test 685 public void TestFirstScriptCharacters() { 686 Collection<String> firstCharacters = 687 new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts(); 688 Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT)); 689 Collection<String> diff = new TreeSet<String>(firstCharacters); 690 diff.removeAll(expectedFirstCharacters); 691 assertTrue("First Characters contains unexpected ones: " + diff, diff.isEmpty()); 692 diff.clear(); 693 diff.addAll(expectedFirstCharacters); 694 diff.removeAll(firstCharacters); 695 assertTrue("First Characters missing expected ones: " + diff, diff.isEmpty()); 696 } 697 698 private static final UnicodeSet TO_TRY = new UnicodeSet("[[:^nfcqc=no:]-[:sc=Common:]-[:sc=Inherited:]-[:sc=Unknown:]]").freeze(); 699 700 /** 701 * Returns a collection of all the "First" characters of scripts, according to the collation. 702 */ 703 private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) { 704 String[] results = new String[UScript.CODE_LIMIT]; 705 for (String current : TO_TRY) { 706 if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols. 707 continue; 708 } 709 int script = UScript.getScript(current.codePointAt(0)); 710 if (results[script] == null) { 711 results[script] = current; 712 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 713 results[script] = current; 714 } 715 } 716 717 try { 718 UnicodeSet extras = new UnicodeSet(); 719 UnicodeSet expansions = new UnicodeSet(); 720 ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true); 721 extras.addAll(expansions).removeAll(TO_TRY); 722 if (extras.size() != 0) { 723 Normalizer2 normalizer = Normalizer2.getNFKCInstance(); 724 for (String current : extras) { 725 if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) { 726 continue; 727 } 728 int script = getFirstRealScript(current); 729 if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; } 730 if (results[script] == null) { 731 results[script] = current; 732 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 733 results[script] = current; 734 } 735 } 736 } 737 } catch (Exception e) { 738 } // why have a checked exception??? 739 740 // TODO: We should not test that we get the same strings, but that we 741 // get strings that sort primary-equal to those from the implementation. 742 743 Collection<String> result = new ArrayList<String>(); 744 for (int i = 0; i < results.length; ++i) { 745 if (results[i] != null) { 746 result.add(results[i]); 747 } 748 } 749 return result; 750 } 751 752 private static final boolean isUnassignedBoundary(CharSequence s) { 753 // The root collator provides a script-first-primary boundary contraction 754 // for the unassigned-implicit range. 755 return s.charAt(0) == 0xfdd1 && 756 UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN; 757 } 758 759 @Test 760 public void TestZZZ() { 761 // int x = 3; 762 // AlphabeticIndex index = new AlphabeticIndex(ULocale.ENGLISH); 763 // UnicodeSet additions = new UnicodeSet(); 764 // additions.add(0x410).add(0x415); // Cyrillic 765 // // additions.add(0x391).add(0x393); // Greek 766 // index.addLabels(additions); 767 // int lc = index.getLabels().size(); 768 // List labels = index.getLabels(); 769 // System.out.println("Label Count = " + lc + "\t" + labels); 770 // System.out.println("Bucket Count =" + index.getBucketCount()); 771 } 772 773 @Test 774 public void TestSimplified() { 775 checkBuckets("zh", simplifiedNames, ULocale.ENGLISH, "W", "\u897f"); 776 } 777 778 @Test 779 public void TestTraditional() { 780 checkBuckets("zh_Hant", traditionalNames, ULocale.ENGLISH, "\u4e9f", "\u5357\u9580"); 781 } 782 783 static final String[] SimpleTests = { 784 "", 785 "\u1f2d\u03c1\u03b1", 786 "$", "\u00a3", "12", "2", 787 "Davis", "Davis", "Abbot", "\u1D05avis", "Zach", "\u1D05avis", "\u01b5", "\u0130stanbul", "Istanbul", "istanbul", "\u0131stanbul", 788 "\u00deor", "\u00c5berg", "\u00d6stlund", 789 "\u1f2d\u03c1\u03b1", "\u1f08\u03b8\u03b7\u03bd\u1fb6", 790 "\u0396\u03b5\u03cd\u03c2", "\u03a0\u03bf\u03c3\u03b5\u03b9\u03b4\u1f63\u03bd", "\u1f0d\u03b9\u03b4\u03b7\u03c2", "\u0394\u03b7\u03bc\u03ae\u03c4\u03b7\u03c1", "\u1f19\u03c3\u03c4\u03b9\u03ac", 791 //"\u1f08\u03c0\u03cc\u03bb\u03bb\u03c9\u03bd", "\u1f0c\u03c1\u03c4\u03b5\u03bc\u03b9\u03c2", "\u1f19\u03c1\u03bc\u1f23\u03c2", "\u1f0c\u03c1\u03b7\u03c2", "\u1f08\u03c6\u03c1\u03bf\u03b4\u03af\u03c4\u03b7", "\u1f2d\u03c6\u03b1\u03b9\u03c3\u03c4\u03bf\u03c2", "\u0394\u03b9\u03cc\u03bd\u03c5\u03c3\u03bf\u03c2", 792 "\u6589\u85e4", "\u4f50\u85e4", "\u9234\u6728", "\u9ad8\u6a4b", "\u7530\u4e2d", "\u6e21\u8fba", "\u4f0a\u85e4", "\u5c71\u672c", "\u4e2d\u6751", "\u5c0f\u6797", "\u658e\u85e4", "\u52a0\u85e4", 793 //"\u5409\u7530", "\u5c71\u7530", "\u4f50\u3005\u6728", "\u5c71\u53e3", "\u677e\u672c", "\u4e95\u4e0a", "\u6728\u6751", "\u6797", "\u6e05\u6c34" 794 }; 795 796 static final String[] hackPinyin = { 797 "a", "\u5416", "\u58ba", // 798 "b", "\u516b", "\u62d4", "\u8500", // 799 "c", "\u5693", "\u7938", "\u9e7e", // 800 "d", "\u5491", "\u8fcf", "\u964a", // 801 "e","\u59b8", "\u92e8", "\u834b", // 802 "f", "\u53d1", "\u9197", "\u99a5", // 803 "g", "\u7324", "\u91d3", "\u8142", // 804 "h", "\u598e", "\u927f", "\u593b", // 805 "j", "\u4e0c", "\u6785", "\u9d58", // 806 "k", "\u5494", "\u958b", "\u7a52", // 807 "l", "\u5783", "\u62c9", "\u9ba5", // 808 "m", "\u5638", "\u9ebb", "\u65c0", // 809 "n", "\u62ff", "\u80ad", "\u685b", // 810 "o", "\u5662", "\u6bee", "\u8bb4", // 811 "p", "\u5991", "\u8019", "\u8c31", // 812 "q", "\u4e03", "\u6053", "\u7f56", // 813 "r", "\u5465", "\u72aa", "\u6e03", // 814 "s", "\u4ee8", "\u9491", "\u93c1", // 815 "t", "\u4ed6", "\u9248", "\u67dd", // 816 "w", "\u5c72", "\u5558", "\u5a7a", // 817 "x", "\u5915", "\u5438", "\u6bbe", // 818 "y", "\u4e2b", "\u82bd", "\u8574", // 819 "z", "\u5e00", "\u707d", "\u5c0a" 820 }; 821 822 static final String[] simplifiedNames = { 823 "Abbot", "Morton", "Zachary", "Williams", "\u8d75", "\u94b1", "\u5b59", "\u674e", "\u5468", "\u5434", "\u90d1", "\u738b", "\u51af", "\u9648", "\u696e", "\u536b", "\u848b", "\u6c88", 824 "\u97e9", "\u6768", "\u6731", "\u79e6", "\u5c24", "\u8bb8", "\u4f55", "\u5415", "\u65bd", "\u5f20", "\u5b54", "\u66f9", "\u4e25", "\u534e", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", "\u8c22", "\u90b9", 825 "\u55bb", "\u67cf", "\u6c34", "\u7aa6", "\u7ae0", "\u4e91", "\u82cf", "\u6f58", "\u845b", "\u595a", "\u8303", "\u5f6d", "\u90ce", "\u9c81", "\u97e6", "\u660c", "\u9a6c", "\u82d7", "\u51e4", "\u82b1", "\u65b9", 826 "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9c8d", "\u53f2", "\u5510", "\u8d39", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8d3a", "\u502a", "\u6c64", "\u6ed5", "\u6bb7", "\u7f57", "\u6bd5", "\u90dd", 827 "\u90ac", "\u5b89", "\u5e38", "\u4e50", "\u4e8e", "\u65f6", "\u5085", "\u76ae", "\u535e", "\u9f50", "\u5eb7", "\u4f0d", "\u4f59", "\u5143", "\u535c", "\u987e", "\u5b5f", "\u5e73", "\u9ec4", "\u548c", "\u7a46", 828 "\u8427", "\u5c39", "\u59da", "\u90b5", "\u6e5b", "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8d1d", "\u660e", "\u81e7", "\u8ba1", "\u4f0f", "\u6210", "\u6234", "\u8c08", "\u5b8b", "\u8305", 829 "\u5e9e", "\u718a", "\u7eaa", "\u8212", "\u5c48", "\u9879", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u84dd", "\u95fd", "\u5e2d", "\u5b63", "\u9ebb", "\u5f3a", "\u8d3e", "\u8def", "\u5a04", "\u5371", 830 "\u6c5f", "\u7ae5", "\u989c", "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u953a", "\u5f90", "\u4e18", "\u9a86", "\u9ad8", "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", 831 "\u4e07", "\u652f", "\u67ef", "\u661d", "\u7ba1", "\u5362", "\u83ab", "\u7ecf", "\u623f", "\u88d8", "\u7f2a", "\u5e72", "\u89e3", "\u5e94", "\u5b97", "\u4e01", "\u5ba3", "\u8d32", "\u9093", "\u90c1", "\u5355", 832 "\u676d", "\u6d2a", "\u5305", "\u8bf8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u94ae", "\u9f9a", "\u7a0b", "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9646", "\u8363", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 833 "\u60e0", "\u7504", "\u9eb9", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u50a8", "\u9773", "\u6c72", "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u4e4c", "\u7126", "\u5df4", "\u5f13", 834 "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8f66", "\u4faf", "\u5b93", "\u84ec", "\u5168", "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bab", "\u5b81", "\u4ec7", "\u683e", "\u66b4", "\u7518", 835 "\u659c", "\u5389", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5218", "\u666f", "\u8a79", "\u675f", "\u9f99", "\u53f6", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u84df", "\u8584", "\u5370", "\u5bbf", 836 "\u767d", "\u6000", "\u84b2", "\u90b0", "\u4ece", "\u9102", "\u7d22", "\u54b8", "\u7c4d", "\u8d56", "\u5353", "\u853a", "\u5c60", "\u8499", "\u6c60", "\u4e54", "\u9634", "\u90c1", "\u80e5", "\u80fd", "\u82cd", 837 "\u53cc", "\u95fb", "\u8398", "\u515a", "\u7fdf", "\u8c2d", "\u8d21", "\u52b3", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u90e6", "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", 838 "\u6fee", "\u725b", "\u5bff", "\u901a", "\u8fb9", "\u6248", "\u71d5", "\u5180", "\u90cf", "\u6d66", "\u5c1a", "\u519c", "\u6e29", "\u522b", "\u5e84", "\u664f", "\u67f4", "\u77bf", "\u960e", "\u5145", "\u6155", 839 "\u8fde", "\u8339", "\u4e60", "\u5ba6", "\u827e", "\u9c7c", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", "\u7ec8", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", 840 "\u6ee1", "\u5f18", "\u5321", "\u56fd", "\u6587", "\u5bc7", "\u5e7f", "\u7984", "\u9619", "\u4e1c", "\u6b27", "\u6bb3", "\u6c83", "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e08", "\u5de9", "\u538d", 841 "\u8042", "\u6641", "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u961a", "\u90a3", "\u7b80", "\u9976", "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u517b", "\u97a0", "\u987b", "\u4e30", 842 "\u5de2", "\u5173", "\u84af", "\u76f8", "\u67e5", "\u540e", "\u8346", "\u7ea2", "\u6e38", "\u7afa", "\u6743", "\u9011", "\u76d6", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u9a6c", "\u4e0a\u5b98", "\u6b27\u9633", 843 "\u590f\u4faf", "\u8bf8\u845b", "\u95fb\u4eba", "\u4e1c\u65b9", "\u8d6b\u8fde", "\u7687\u752b", "\u5c09\u8fdf", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", "\u6fee\u9633", "\u6df3\u4e8e", "\u5355\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b59", "\u4ef2\u5b59", 844 "\u8f69\u8f95", "\u4ee4\u72d0", "\u953a\u79bb", "\u5b87\u6587", "\u957f\u5b59", "\u6155\u5bb9", "\u9c9c\u4e8e", "\u95fe\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8f66", "\u989b\u5b59", "\u7aef\u6728", "\u5deb\u9a6c", 845 "\u516c\u897f", "\u6f06\u96d5", "\u4e50\u6b63", "\u58e4\u9a77", "\u516c\u826f", "\u62d3\u62d4", "\u5939\u8c37", "\u5bb0\u7236", "\u8c37\u6881", "\u664b", "\u695a", "\u960e", "\u6cd5", "\u6c5d", "\u9122", "\u6d82", "\u94a6", "\u6bb5\u5e72", "\u767e\u91cc", 846 "\u4e1c\u90ed", "\u5357\u95e8", "\u547c\u5ef6", "\u5f52", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e05", "\u7f11", "\u4ea2", "\u51b5", "\u540e", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u4e1c\u95e8", "\u897f\u95e8", 847 "\u5546", "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8d4f", "\u5357\u5bab", "\u58a8", "\u54c8", "\u8c2f", "\u7b2a", "\u5e74", "\u7231", "\u9633", "\u4f5f" 848 }; 849 850 static final String[] traditionalNames = { "", "Abbot", "Morton", "Zachary", "Williams", "\u8d99", "\u9322", "\u5b6b", 851 "\u674e", "\u5468", "\u5433", "\u912d", "\u738b", "\u99ae", "\u9673", "\u696e", "\u885b", "\u8523", 852 "\u6c88", "\u97d3", "\u694a", "\u6731", "\u79e6", "\u5c24", "\u8a31", "\u4f55", "\u5442", "\u65bd", 853 "\u5f35", "\u5b54", "\u66f9", "\u56b4", "\u83ef", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", 854 "\u8b1d", "\u9112", "\u55bb", "\u67cf", "\u6c34", "\u7ac7", "\u7ae0", "\u96f2", "\u8607", "\u6f58", 855 "\u845b", "\u595a", "\u7bc4", "\u5f6d", "\u90ce", "\u9b6f", "\u97cb", "\u660c", "\u99ac", "\u82d7", 856 "\u9cf3", "\u82b1", "\u65b9", "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9b91", "\u53f2", 857 "\u5510", "\u8cbb", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8cc0", "\u502a", "\u6e6f", "\u6ed5", 858 "\u6bb7", "\u7f85", "\u7562", "\u90dd", "\u9114", "\u5b89", "\u5e38", "\u6a02", "\u65bc", "\u6642", 859 "\u5085", "\u76ae", "\u535e", "\u9f4a", "\u5eb7", "\u4f0d", "\u9918", "\u5143", "\u535c", "\u9867", 860 "\u5b5f", "\u5e73", "\u9ec3", "\u548c", "\u7a46", "\u856d", "\u5c39", "\u59da", "\u90b5", "\u6e5b", 861 "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8c9d", "\u660e", "\u81e7", "\u8a08", 862 "\u4f0f", "\u6210", "\u6234", "\u8ac7", "\u5b8b", "\u8305", "\u9f90", "\u718a", "\u7d00", "\u8212", 863 "\u5c48", "\u9805", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u85cd", "\u95a9", "\u5e2d", 864 "\u5b63", "\u9ebb", "\u5f37", "\u8cc8", "\u8def", "\u5a41", "\u5371", "\u6c5f", "\u7ae5", "\u984f", 865 "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u937e", "\u5f90", "\u4e18", "\u99f1", "\u9ad8", 866 "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", "\u842c", "\u652f", 867 "\u67ef", "\u661d", "\u7ba1", "\u76e7", "\u83ab", "\u7d93", "\u623f", "\u88d8", "\u7e46", "\u5e79", 868 "\u89e3", "\u61c9", "\u5b97", "\u4e01", "\u5ba3", "\u8cc1", "\u9127", "\u9b31", "\u55ae", "\u676d", 869 "\u6d2a", "\u5305", "\u8af8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u9215", "\u9f94", "\u7a0b", 870 "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9678", "\u69ae", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 871 "\u60e0", "\u7504", "\u9eb4", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u5132", "\u9773", "\u6c72", 872 "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u70cf", "\u7126", "\u5df4", 873 "\u5f13", "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8eca", "\u4faf", "\u5b93", "\u84ec", "\u5168", 874 "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bae", "\u5be7", "\u4ec7", "\u6b12", 875 "\u66b4", "\u7518", "\u659c", "\u53b2", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5289", "\u666f", 876 "\u8a79", "\u675f", "\u9f8d", "\u8449", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u858a", 877 "\u8584", "\u5370", "\u5bbf", "\u767d", "\u61f7", "\u84b2", "\u90b0", "\u5f9e", "\u9102", "\u7d22", 878 "\u54b8", "\u7c4d", "\u8cf4", "\u5353", "\u85fa", "\u5c60", "\u8499", "\u6c60", "\u55ac", "\u9670", 879 "\u9b31", "\u80e5", "\u80fd", "\u84bc", "\u96d9", "\u805e", "\u8398", "\u9ee8", "\u7fdf", "\u8b5a", 880 "\u8ca2", "\u52de", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u9148", 881 "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", "\u6fee", "\u725b", "\u58fd", "\u901a", "\u908a", 882 "\u6248", "\u71d5", "\u5180", "\u90df", "\u6d66", "\u5c1a", "\u8fb2", "\u6eab", "\u5225", "\u838a", 883 "\u664f", "\u67f4", "\u77bf", "\u95bb", "\u5145", "\u6155", "\u9023", "\u8339", "\u7fd2", "\u5ba6", 884 "\u827e", "\u9b5a", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", 885 "\u7d42", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", "\u6eff", "\u5f18", "\u5321", 886 "\u570b", "\u6587", "\u5bc7", "\u5ee3", "\u797f", "\u95d5", "\u6771", "\u6b50", "\u6bb3", "\u6c83", 887 "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e2b", "\u978f", "\u5399", "\u8076", "\u6641", 888 "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u95de", "\u90a3", "\u7c21", "\u9952", 889 "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u990a", "\u97a0", "\u9808", "\u8c50", "\u5de2", 890 "\u95dc", "\u84af", "\u76f8", "\u67e5", "\u5f8c", "\u834a", "\u7d05", "\u904a", "\u7afa", "\u6b0a", 891 "\u9011", "\u84cb", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u99ac", "\u4e0a\u5b98", 892 "\u6b50\u967d", "\u590f\u4faf", "\u8af8\u845b", "\u805e\u4eba", "\u6771\u65b9", "\u8d6b\u9023", 893 "\u7687\u752b", "\u5c09\u9072", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", 894 "\u6fee\u967d", "\u6df3\u4e8e", "\u55ae\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b6b", 895 "\u4ef2\u5b6b", "\u8ed2\u8f45", "\u4ee4\u72d0", "\u937e\u96e2", "\u5b87\u6587", "\u9577\u5b6b", 896 "\u6155\u5bb9", "\u9bae\u4e8e", "\u95ad\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", 897 "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8eca", "\u9853\u5b6b", "\u7aef\u6728", "\u5deb\u99ac", 898 "\u516c\u897f", "\u6f06\u96d5", "\u6a02\u6b63", "\u58e4\u99df", "\u516c\u826f", "\u62d3\u62d4", 899 "\u593e\u8c37", "\u5bb0\u7236", "\u7a40\u6881", "\u6649", "\u695a", "\u95bb", "\u6cd5", "\u6c5d", "\u9122", 900 "\u5857", "\u6b3d", "\u6bb5\u5e72", "\u767e\u91cc", "\u6771\u90ed", "\u5357\u9580", "\u547c\u5ef6", 901 "\u6b78", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e25", "\u7df1", "\u4ea2", "\u6cc1", 902 "\u5f8c", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u6771\u9580", "\u897f\u9580", "\u5546", 903 "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8cde", "\u5357\u5bae", "\u58a8", "\u54c8", "\u8b59", "\u7b2a", 904 "\u5e74", "\u611b", "\u967d", "\u4f5f", "\u3401", "\u3422", "\u3426", "\u3493", "\u34A5", "\u34A7", 905 "\u34AA", "\u3536", "\u4A3B", "\u4E00", "\u4E01", "\u4E07", "\u4E0D", "\u4E17", "\u4E23", "\u4E26", 906 "\u4E34", "\u4E82", "\u4EB8", "\u4EB9", "\u511F", "\u512D", "\u513D", "\u513E", "\u53B5", "\u56D4", 907 "\u56D6", "\u7065", "\u7069", "\u706A", "\u7E9E", "\u9750", "\u9F49", "\u9F7E", "\u9F98", "\uD840\uDC35", 908 "\uD840\uDC3D", "\uD840\uDC3E", "\uD840\uDC41", "\uD840\uDC46", "\uD840\uDC4C", "\uD840\uDC4E", 909 "\uD840\uDC53", "\uD840\uDC55", "\uD840\uDC56", "\uD840\uDC5F", "\uD840\uDC60", "\uD840\uDC7A", 910 "\uD840\uDC7B", "\uD840\uDCC8", "\uD840\uDD9E", "\uD840\uDD9F", "\uD840\uDDA0", "\uD840\uDDA1", 911 "\uD841\uDD3B", "\uD842\uDCCA", "\uD842\uDCCB", "\uD842\uDD6C", "\uD842\uDE0B", "\uD842\uDE0C", 912 "\uD842\uDED1", "\uD844\uDD9F", "\uD845\uDD19", "\uD845\uDD1A", "\uD846\uDD3B", "\uD84C\uDF5C", 913 "\uD85A\uDDC4", "\uD85A\uDDC5", "\uD85C\uDD98", "\uD85E\uDCB1", "\uD861\uDC04", "\uD864\uDDD3", 914 "\uD865\uDE63", "\uD869\uDCCA", "\uD86B\uDE9A", }; 915 916 /** 917 * Test AlphabeticIndex vs. root with script reordering. 918 */ 919 @Test 920 public void TestHaniFirst() { 921 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 922 coll.setReorderCodes(UScript.HAN); 923 AlphabeticIndex index = new AlphabeticIndex(coll); 924 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... (underflow only) 925 index.addLabels(Locale.ENGLISH); 926 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 927 int bucketIndex = index.getBucketIndex("\u897f"); 928 assertEquals("getBucketIndex(U+897F)", 0, bucketIndex); // underflow bucket 929 bucketIndex = index.getBucketIndex("i"); 930 assertEquals("getBucketIndex(i)", 9, bucketIndex); 931 bucketIndex = index.getBucketIndex("\u03B1"); 932 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 933 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 934 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 935 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 936 bucketIndex = index.getBucketIndex("\uFFFF"); 937 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 938 } 939 940 /** 941 * Test AlphabeticIndex vs. Pinyin with script reordering. 942 */ 943 @Test 944 public void TestPinyinFirst() { 945 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE); 946 coll.setReorderCodes(UScript.HAN); 947 AlphabeticIndex index = new AlphabeticIndex(coll); 948 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 949 index.addLabels(Locale.CHINESE); 950 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 951 int bucketIndex = index.getBucketIndex("\u897f"); 952 assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex); 953 bucketIndex = index.getBucketIndex("i"); 954 assertEquals("getBucketIndex(i)", 9, bucketIndex); 955 bucketIndex = index.getBucketIndex("\u03B1"); 956 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 957 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 958 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 959 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 960 bucketIndex = index.getBucketIndex("\uFFFF"); 961 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 962 } 963 964 /** 965 * Test labels with multiple primary weights. 966 */ 967 @Test 968 public void TestSchSt() { 969 AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN); 970 index.addLabels(new UnicodeSet("[{Sch*}{St*}]")); 971 // ... A B-R S Sch St T-Z ... 972 ImmutableIndex immIndex = index.buildImmutableIndex(); 973 assertEquals("getBucketCount()", 31, index.getBucketCount()); 974 assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount()); 975 String[][] testCases = new String[][] { 976 // name, bucket index, bucket label 977 { "Adelbert", "1", "A" }, 978 { "Afrika", "1", "A" }, 979 { "sculap", "2", "" }, 980 { "Aesthet", "2", "" }, 981 { "Berlin", "3", "B" }, 982 { "Rilke", "19", "R" }, 983 { "Sacher", "20", "S" }, 984 { "Seiler", "20", "S" }, 985 { "Sultan", "20", "S" }, 986 { "Schiller", "21", "Sch" }, 987 { "Steiff", "22", "St" }, 988 { "Thomas", "23", "T" } 989 }; 990 List<String> labels = index.getBucketLabels(); 991 for (String[] testCase : testCases) { 992 String name = testCase[0]; 993 int bucketIndex = Integer.valueOf(testCase[1]); 994 String label = testCase[2]; 995 String msg = "getBucketIndex(" + name + ")"; 996 assertEquals(msg, bucketIndex, index.getBucketIndex(name)); 997 msg = "immutable " + msg; 998 assertEquals(msg, bucketIndex, immIndex.getBucketIndex(name)); 999 msg = "bucket label (" + name + ")"; 1000 assertEquals(msg, label, labels.get(index.getBucketIndex(name))); 1001 msg = "immutable " + msg; 1002 assertEquals(msg, label, immIndex.getBucket(bucketIndex).getLabel()); 1003 } 1004 } 1005 1006 /** 1007 * With no real labels, there should be only the underflow label. 1008 */ 1009 @Test 1010 public void TestNoLabels() { 1011 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 1012 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(coll); 1013 index.addRecord("\u897f", 0); 1014 index.addRecord("i", 0); 1015 index.addRecord("\u03B1", 0); 1016 assertEquals("getRecordCount()", 3, index.getRecordCount()); // code coverage 1017 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... 1018 Bucket<Integer> bucket = index.iterator().next(); 1019 assertEquals("underflow label type", LabelType.UNDERFLOW, bucket.getLabelType()); 1020 assertEquals("all records in the underflow bucket", 3, bucket.size()); 1021 } 1022 1023 /** 1024 * Test with the Bopomofo-phonetic tailoring. 1025 */ 1026 @Test 1027 public void TestChineseZhuyin() { 1028 AlphabeticIndex index = new AlphabeticIndex(ULocale.forLanguageTag("zh-u-co-zhuyin")); 1029 ImmutableIndex immIndex = index.buildImmutableIndex(); 1030 assertEquals("getBucketCount()", 38, immIndex.getBucketCount()); // ... -- ... 1031 assertEquals("label 1", "", immIndex.getBucket(1).getLabel()); 1032 assertEquals("label 2", "", immIndex.getBucket(2).getLabel()); 1033 assertEquals("label 3", "", immIndex.getBucket(3).getLabel()); 1034 assertEquals("label 4", "", immIndex.getBucket(4).getLabel()); 1035 assertEquals("label 5", "", immIndex.getBucket(5).getLabel()); 1036 } 1037 1038 @Test 1039 public void TestJapaneseKanji() { 1040 AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE); 1041 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1042 // There are no index characters for Kanji in the Japanese standard collator. 1043 // They should all go into the overflow bucket. 1044 final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 }; 1045 int overflowIndex = immIndex.getBucketCount() - 1; 1046 for(int i = 0; i < kanji.length; ++i) { 1047 String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]); 1048 assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i]))); 1049 } 1050 } 1051 1052 @Test 1053 public void TestFrozenCollator() { 1054 // Ticket #9472 1055 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da")); 1056 coll.setStrength(Collator.IDENTICAL); 1057 coll.freeze(); 1058 // The AlphabeticIndex constructor used to throw an exception 1059 // because it cloned the collator (which preserves frozenness) 1060 // and set the clone's strength to PRIMARY. 1061 AlphabeticIndex index = new AlphabeticIndex(coll); 1062 assertEquals("same strength as input Collator", 1063 Collator.IDENTICAL, index.getCollator().getStrength()); 1064 } 1065 1066 @Test 1067 public void TestChineseUnihan() { 1068 AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan")); 1069 index.setMaxLabelCount(500); // ICU 54 default is 99. 1070 assertEquals("getMaxLabelCount()", 500, index.getMaxLabelCount()); // code coverage 1071 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1072 int bucketCount = immIndex.getBucketCount(); 1073 if(bucketCount < 216) { 1074 // There should be at least an underflow and overflow label, 1075 // and one for each of 214 radicals, 1076 // and maybe additional labels for simplified radicals. 1077 // (ICU4C: dataerrln(), prints only a warning if the data is missing) 1078 errln("too few buckets/labels for Chinese/unihan: " + bucketCount + 1079 " (is zh/unihan data available?)"); 1080 return; 1081 } else { 1082 logln("Chinese/unihan has " + bucketCount + " buckets/labels"); 1083 } 1084 // bucketIndex = radical number, adjusted for simplified radicals in lower buckets. 1085 int bucketIndex = index.getBucketIndex("\u4e5d"); 1086 assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex); 1087 // radical 100, and there is a 90' since Unicode 8 1088 bucketIndex = index.getBucketIndex("\u7527"); 1089 assertEquals("getBucketIndex(U+7527)", 101, bucketIndex); 1090 } 1091 1092 @Test 1093 public void testAddLabels_Locale() { 1094 AlphabeticIndex<?> ulocaleIndex = new AlphabeticIndex<String>(ULocale.CANADA); 1095 AlphabeticIndex<?> localeIndex = new AlphabeticIndex<String>(Locale.CANADA); 1096 ulocaleIndex.addLabels(ULocale.SIMPLIFIED_CHINESE); 1097 localeIndex.addLabels(Locale.SIMPLIFIED_CHINESE); 1098 assertEquals("getBucketLables() results of ulocaleIndex and localeIndex differ", 1099 ulocaleIndex.getBucketLabels(), localeIndex.getBucketLabels()); 1100 } 1101 1102 @Test 1103 public void testGetRecordCount_empty() { 1104 assertEquals("Record count of empty index not 0", 0, 1105 new AlphabeticIndex<String>(ULocale.CANADA).getRecordCount()); 1106 } 1107 1108 @Test 1109 public void testGetRecordCount_withRecords() { 1110 assertEquals("Record count of index with one record not 1", 1, 1111 new AlphabeticIndex<String>(ULocale.CANADA).addRecord("foo", null).getRecordCount()); 1112 } 1113 1114 /** 1115 * Check that setUnderflowLabel/setOverflowLabel/setInflowLabel correctly influence the name of 1116 * generated labels. 1117 */ 1118 @Test 1119 public void testFlowLabels() { 1120 AlphabeticIndex<?> index = new AlphabeticIndex<String>(ULocale.ENGLISH) 1121 .addLabels(ULocale.forLanguageTag("ru")); 1122 index.setUnderflowLabel("underflow"); 1123 index.setOverflowLabel("overflow"); 1124 index.setInflowLabel("inflow"); 1125 index.addRecord("!", null); 1126 index.addRecord("\u03B1", null); // GREEK SMALL LETTER ALPHA 1127 index.addRecord("\uab70", null); // CHEROKEE SMALL LETTER A 1128 AlphabeticIndex.Bucket<?> underflowBucket = null; 1129 AlphabeticIndex.Bucket<?> overflowBucket = null; 1130 AlphabeticIndex.Bucket<?> inflowBucket = null; 1131 for (AlphabeticIndex.Bucket<?> bucket : index) { 1132 switch (bucket.getLabelType()) { 1133 case UNDERFLOW: 1134 assertNull("LabelType not null", underflowBucket); 1135 underflowBucket = bucket; 1136 break; 1137 case OVERFLOW: 1138 assertNull("LabelType not null", overflowBucket); 1139 overflowBucket = bucket; 1140 break; 1141 case INFLOW: 1142 assertNull("LabelType not null", inflowBucket); 1143 inflowBucket = bucket; 1144 break; 1145 } 1146 } 1147 assertNotNull("No bucket 'underflow'", underflowBucket); 1148 assertEquals("Wrong bucket label", "underflow", underflowBucket.getLabel()); 1149 assertEquals("Wrong bucket label", "underflow", index.getUnderflowLabel()); 1150 assertEquals("Bucket size not 1", 1, underflowBucket.size()); 1151 assertNotNull("No bucket 'overflow'", overflowBucket); 1152 assertEquals("Wrong bucket label", "overflow", overflowBucket.getLabel()); 1153 assertEquals("Wrong bucket label", "overflow", index.getOverflowLabel()); 1154 assertEquals("Bucket size not 1", 1, overflowBucket.size()); 1155 assertNotNull("No bucket 'inflow'", inflowBucket); 1156 assertEquals("Wrong bucket label", "inflow", inflowBucket.getLabel()); 1157 assertEquals("Wrong bucket label", "inflow", index.getInflowLabel()); 1158 assertEquals("Bucket size not 1", 1, inflowBucket.size()); 1159 } 1160 } 1161