Home | History | Annotate | Download | only in collator
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2001-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  *******************************************************************************
      6  * CollationTest.java, ported from collationtest.cpp
      7  * C++ version created on: 2012apr27
      8  * created by: Markus W. Scherer
      9  */
     10 package com.ibm.icu.dev.test.collator;
     11 
     12 import java.io.BufferedReader;
     13 import java.io.IOException;
     14 import java.text.ParseException;
     15 import java.util.HashSet;
     16 import java.util.Set;
     17 
     18 import com.ibm.icu.dev.test.TestFmwk;
     19 import com.ibm.icu.dev.test.TestUtil;
     20 import com.ibm.icu.impl.Norm2AllModes;
     21 import com.ibm.icu.impl.Utility;
     22 import com.ibm.icu.impl.coll.Collation;
     23 import com.ibm.icu.impl.coll.CollationData;
     24 import com.ibm.icu.impl.coll.CollationFCD;
     25 import com.ibm.icu.impl.coll.CollationIterator;
     26 import com.ibm.icu.impl.coll.CollationRoot;
     27 import com.ibm.icu.impl.coll.CollationRootElements;
     28 import com.ibm.icu.impl.coll.CollationRuleParser;
     29 import com.ibm.icu.impl.coll.CollationWeights;
     30 import com.ibm.icu.impl.coll.FCDIterCollationIterator;
     31 import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
     32 import com.ibm.icu.impl.coll.UTF16CollationIterator;
     33 import com.ibm.icu.impl.coll.UVector32;
     34 import com.ibm.icu.text.CollationElementIterator;
     35 import com.ibm.icu.text.CollationKey;
     36 import com.ibm.icu.text.Collator;
     37 import com.ibm.icu.text.Collator.ReorderCodes;
     38 import com.ibm.icu.text.Normalizer2;
     39 import com.ibm.icu.text.RawCollationKey;
     40 import com.ibm.icu.text.RuleBasedCollator;
     41 import com.ibm.icu.text.UCharacterIterator;
     42 import com.ibm.icu.text.UTF16;
     43 import com.ibm.icu.text.UnicodeSet;
     44 import com.ibm.icu.text.UnicodeSetIterator;
     45 import com.ibm.icu.util.IllformedLocaleException;
     46 import com.ibm.icu.util.Output;
     47 import com.ibm.icu.util.ULocale;
     48 
     49 public class CollationTest extends TestFmwk {
     50     public static void main(String[] args) throws Exception{
     51         new CollationTest().run(args);
     52     }
     53 
     54     public CollationTest() {
     55     }
     56 
     57     // Fields
     58     Normalizer2 fcd, nfd;
     59     Collator coll;
     60     String fileLine;
     61     int fileLineNumber;
     62     String fileTestName;
     63 
     64     // package private methods ----------------------------------------------
     65 
     66     static void doTest(TestFmwk test, RuleBasedCollator col, String source,
     67                        String target, int result)
     68     {
     69         doTestVariant(test, col, source, target, result);
     70         if (result == -1) {
     71             doTestVariant(test, col, target, source, 1);
     72         }
     73         else if (result == 1) {
     74             doTestVariant(test, col, target, source, -1);
     75         }
     76         else {
     77             doTestVariant(test, col, target, source, 0);
     78         }
     79 
     80         CollationElementIterator iter = col.getCollationElementIterator(source);
     81         backAndForth(test, iter);
     82         iter.setText(target);
     83         backAndForth(test, iter);
     84     }
     85 
     86     /**
     87      * Return an integer array containing all of the collation orders
     88      * returned by calls to next on the specified iterator
     89      */
     90     static int[] getOrders(CollationElementIterator iter)
     91     {
     92         int maxSize = 100;
     93         int size = 0;
     94         int[] orders = new int[maxSize];
     95 
     96         int order;
     97         while ((order = iter.next()) != CollationElementIterator.NULLORDER) {
     98             if (size == maxSize) {
     99                 maxSize *= 2;
    100                 int[] temp = new int[maxSize];
    101                 System.arraycopy(orders, 0, temp,  0, size);
    102                 orders = temp;
    103             }
    104             orders[size++] = order;
    105         }
    106 
    107         if (maxSize > size) {
    108             int[] temp = new int[size];
    109             System.arraycopy(orders, 0, temp,  0, size);
    110             orders = temp;
    111         }
    112         return orders;
    113     }
    114 
    115     static void backAndForth(TestFmwk test, CollationElementIterator iter)
    116     {
    117         // Run through the iterator forwards and stick it into an array
    118         iter.reset();
    119         int[] orders = getOrders(iter);
    120 
    121         // Now go through it backwards and make sure we get the same values
    122         int index = orders.length;
    123         int o;
    124 
    125         // reset the iterator
    126         iter.reset();
    127 
    128         while ((o = iter.previous()) != CollationElementIterator.NULLORDER) {
    129             if (o != orders[--index]) {
    130                 if (o == 0) {
    131                     index ++;
    132                 } else {
    133                     while (index > 0 && orders[index] == 0) {
    134                         index --;
    135                     }
    136                     if (o != orders[index]) {
    137                         test.errln("Mismatch at index " + index + ": 0x"
    138                             + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o));
    139                         break;
    140                     }
    141                 }
    142             }
    143         }
    144 
    145         while (index != 0 && orders[index - 1] == 0) {
    146           index --;
    147         }
    148 
    149         if (index != 0) {
    150             String msg = "Didn't get back to beginning - index is ";
    151             test.errln(msg + index);
    152 
    153             iter.reset();
    154             test.err("next: ");
    155             while ((o = iter.next()) != CollationElementIterator.NULLORDER) {
    156                 String hexString = "0x" + Utility.hex(o) + " ";
    157                 test.err(hexString);
    158             }
    159             test.errln("");
    160             test.err("prev: ");
    161             while ((o = iter.previous()) != CollationElementIterator.NULLORDER) {
    162                 String hexString = "0x" + Utility.hex(o) + " ";
    163                  test.err(hexString);
    164             }
    165             test.errln("");
    166         }
    167     }
    168 
    169     static final String appendCompareResult(int result, String target){
    170         if (result == -1) {
    171             target += "LESS";
    172         } else if (result == 0) {
    173             target += "EQUAL";
    174         } else if (result == 1) {
    175             target += "GREATER";
    176         } else {
    177             String huh = "?";
    178             target += huh + result;
    179         }
    180         return target;
    181     }
    182 
    183     static final String prettify(CollationKey key) {
    184         byte[] bytes = key.toByteArray();
    185         return prettify(bytes, bytes.length);
    186     }
    187 
    188     static final String prettify(RawCollationKey key) {
    189         return prettify(key.bytes, key.size);
    190     }
    191 
    192     static final String prettify(byte[] skBytes, int length) {
    193         StringBuilder target = new StringBuilder(length * 3 + 2).append('[');
    194 
    195         for (int i = 0; i < length; i++) {
    196             String numStr = Integer.toHexString(skBytes[i] & 0xff);
    197             if (numStr.length() < 2) {
    198                 target.append('0');
    199             }
    200             target.append(numStr).append(' ');
    201         }
    202         target.append(']');
    203         return target.toString();
    204     }
    205 
    206     private static void doTestVariant(TestFmwk test,
    207                                       RuleBasedCollator myCollation,
    208                                       String source, String target, int result)
    209     {
    210         boolean printInfo = false;
    211         int compareResult  = myCollation.compare(source, target);
    212         if (compareResult != result) {
    213 
    214             // !!! if not mod build, error, else nothing.
    215             // warnln if not build, error, else always print warning.
    216             // do we need a 'quiet warning?' (err or log).  Hmmm,
    217             // would it work to have the 'verbose' flag let you
    218             // suppress warnings?  Are there ever some warnings you
    219             // want to suppress, and others you don't?
    220             if(!test.isModularBuild()){
    221                 test.errln("Comparing \"" + Utility.hex(source) + "\" with \""
    222                            + Utility.hex(target) + "\" expected " + result
    223                            + " but got " + compareResult);
    224             }else{
    225                 printInfo = true;
    226             }
    227         }
    228         CollationKey ssk = myCollation.getCollationKey(source);
    229         CollationKey tsk = myCollation.getCollationKey(target);
    230         compareResult = ssk.compareTo(tsk);
    231         if (compareResult != result) {
    232 
    233             if(!test.isModularBuild()){
    234                 test.errln("Comparing CollationKeys of \"" + Utility.hex(source)
    235                            + "\" with \"" + Utility.hex(target)
    236                            + "\" expected " + result + " but got "
    237                            + compareResult);
    238            }else{
    239                printInfo = true;
    240            }
    241         }
    242         RawCollationKey srsk = new RawCollationKey();
    243         myCollation.getRawCollationKey(source, srsk);
    244         RawCollationKey trsk = new RawCollationKey();
    245         myCollation.getRawCollationKey(target, trsk);
    246         compareResult = ssk.compareTo(tsk);
    247         if (compareResult != result) {
    248 
    249             if(!test.isModularBuild()){
    250                 test.errln("Comparing RawCollationKeys of \""
    251                            + Utility.hex(source)
    252                            + "\" with \"" + Utility.hex(target)
    253                            + "\" expected " + result + " but got "
    254                            + compareResult);
    255            }else{
    256                printInfo = true;
    257            }
    258         }
    259         // hmmm, but here we issue a warning
    260         // only difference is, one warning or two, and detailed info or not?
    261         // hmmm, does seem preferable to omit detail if we know it is due to missing resource data.
    262         // well, if we label the errors as warnings, we can let people know the details, but
    263         // also know they may be due to missing resource data.  basically this code is asserting
    264         // that the errors are due to missing resource data, which may or may not be true.
    265         if (printInfo) {
    266             test.warnln("Could not load locale data skipping.");
    267         }
    268     }
    269 
    270     public void TestMinMax() {
    271         setRootCollator();
    272         RuleBasedCollator rbc = (RuleBasedCollator)coll;
    273 
    274         final String s = "\uFFFE\uFFFF";
    275         long[] ces;
    276 
    277         ces = rbc.internalGetCEs(s);
    278         if (ces.length != 2) {
    279             errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length);
    280             return;
    281         }
    282 
    283         long ce = ces[0];
    284         long expected = Collation.makeCE(Collation.MERGE_SEPARATOR_PRIMARY);
    285         if (ce != expected) {
    286             errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02..");
    287         }
    288 
    289         ce = ces[1];
    290         expected = Collation.makeCE(Collation.MAX_PRIMARY);
    291         if (ce != expected) {
    292             errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max..");
    293         }
    294     }
    295 
    296     public void TestImplicits() {
    297         CollationData cd = CollationRoot.getData();
    298 
    299         // Implicit primary weights should be assigned for the following sets,
    300         // and sort in ascending order by set and then code point.
    301         // See http://www.unicode.org/reports/tr10/#Implicit_Weights
    302         // core Han Unified Ideographs
    303         UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&"
    304                                  + "[\\p{Block=CJK_Unified_Ideographs}"
    305                                  + "\\p{Block=CJK_Compatibility_Ideographs}]]");
    306         // all other Unified Han ideographs
    307         UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-"
    308                                  + "[\\p{Block=CJK_Unified_Ideographs}"
    309                                  + "\\p{Block=CJK_Compatibility_Ideographs}]]");
    310 
    311         UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
    312         unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
    313 
    314         // Starting with CLDR 26/ICU 54, the root Han order may instead be
    315         // the Unihan radical-stroke order.
    316         // The tests should pass either way, so we only test the order of a small set of Han characters
    317         // whose radical-stroke order is the same as their code point order.
    318         UnicodeSet someHanInCPOrder = new UnicodeSet(
    319                 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" +
    320                 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]");
    321         UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder);
    322         inOrder.addAll(unassigned).freeze();
    323 
    324         UnicodeSet[] sets = { coreHan, otherHan, unassigned };
    325         int prev = 0;
    326         long prevPrimary = 0;
    327         UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0);
    328         for (int i = 0; i < sets.length; ++i) {
    329             UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]);
    330             while (iter.next()) {
    331                 String s = iter.getString();
    332                 int c = s.codePointAt(0);
    333                 ci.setText(false, s, 0);
    334                 long ce = ci.nextCE();
    335                 long ce2 = ci.nextCE();
    336                 if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) {
    337                     errln("CollationIterator.nextCE(0x" + Utility.hex(c)
    338                             + ") did not yield exactly one CE");
    339                     continue;
    340 
    341                 }
    342                 if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) {
    343                     errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4)
    344                             + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8));
    345                     continue;
    346                 }
    347                 long primary = ce >>> 32;
    348                 if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
    349                     errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
    350                             + ".. not greater than CE(U+" + Utility.hex(prev)
    351                             + ")=0x" + Utility.hex(prevPrimary) + "..");
    352 
    353                 }
    354                 prev = c;
    355                 prevPrimary = primary;
    356             }
    357         }
    358     }
    359 
    360     // ICU4C: TestNulTerminated / renamed for ICU4J
    361     public void TestSubSequence() {
    362         CollationData data = CollationRoot.getData();
    363         final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 }
    364 
    365         UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0);
    366         UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2);
    367 
    368         for (int i = 0; i < 2; ++i) {
    369             long ce1 = ci1.nextCE();
    370             long ce2 = ci2.nextCE();
    371 
    372             if (ce1 != ce2) {
    373                 errln("CollationIterator.nextCE(with start position at 0) != "
    374                       + "nextCE(with start position at 2) at CE " + i);
    375             }
    376         }
    377     }
    378 
    379 
    380     // ICU4C: TestIllegalUTF8 / not applicable to ICU4J
    381 
    382 
    383     private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) {
    384         for(int c = 0x10000; c < 0x110000;) {
    385             int next = c + 0x400;
    386             if(src.containsSome(c, next - 1)) {
    387                 dest.add(UTF16.getLeadSurrogate(c));
    388             }
    389             c = next;
    390         }
    391     }
    392 
    393     public void TestShortFCDData() {
    394         UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]");
    395         expectedLccc.add(0xdc00, 0xdfff);   // add all trail surrogates
    396         addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
    397 
    398         UnicodeSet lccc = new UnicodeSet(); // actual
    399         for (int c = 0; c <= 0xffff; ++c) {
    400             if (CollationFCD.hasLccc(c)) {
    401                 lccc.add(c);
    402             }
    403         }
    404 
    405         UnicodeSet diff = new UnicodeSet(expectedLccc);
    406         diff.removeAll(lccc);
    407         diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
    408 
    409         String empty = "[]";
    410         String diffString;
    411 
    412         diffString = diff.toPattern(true);
    413         assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
    414 
    415         diff = lccc;
    416         diff.removeAll(expectedLccc);
    417         diffString = diff.toPattern(true);
    418         assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString);
    419 
    420         UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]");
    421         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
    422         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
    423 
    424         UnicodeSet tccc = new UnicodeSet(); // actual
    425         for(int c = 0; c <= 0xffff; ++c) {
    426             if (CollationFCD.hasTccc(c)) {
    427                 tccc.add(c);
    428             }
    429         }
    430 
    431         diff = new UnicodeSet(expectedTccc);
    432         diff.removeAll(tccc);
    433         diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
    434         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
    435 
    436         diff = tccc;
    437         diff.removeAll(expectedTccc);
    438         diffString = diff.toPattern(true);
    439         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
    440     }
    441 
    442     private static class CodePointIterator {
    443         int[] cp;
    444         int length;
    445         int pos;
    446 
    447         CodePointIterator(int[] cp) {
    448             this.cp = cp;
    449             this.length = cp.length;
    450             this.pos = 0;
    451         }
    452 
    453         void resetToStart() {
    454             pos = 0;
    455         }
    456 
    457         int next() {
    458             return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP;
    459         }
    460 
    461         int previous() {
    462             return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP;
    463         }
    464 
    465         int getLength() {
    466             return length;
    467         }
    468 
    469         int getIndex() {
    470             return pos;
    471         }
    472     }
    473 
    474     private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) {
    475         // Iterate forward to the limit.
    476         for (;;) {
    477             int c1 = ci.nextCodePoint();
    478             int c2 = cpi.next();
    479             if (c1 != c2) {
    480                 errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1)
    481                         + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex());
    482                 return;
    483             }
    484             if (c1 < 0) {
    485                 break;
    486             }
    487         }
    488 
    489         // Iterate backward most of the way.
    490         for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) {
    491             int c1 = ci.previousCodePoint();
    492             int c2 = cpi.previous();
    493             if (c1 != c2) {
    494                 errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) +
    495                         " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
    496                 return;
    497             }
    498         }
    499 
    500         // Forward again.
    501         for (;;) {
    502             int c1 = ci.nextCodePoint();
    503             int c2 = cpi.next();
    504             if (c1 != c2) {
    505                 errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1)
    506                         + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
    507                 return;
    508             }
    509             if (c1 < 0) {
    510                 break;
    511             }
    512         }
    513 
    514         // Iterate backward to the start.
    515         for (;;) {
    516             int c1 = ci.previousCodePoint();
    517             int c2 = cpi.previous();
    518             if (c1 != c2) {
    519                 errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1)
    520                         + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
    521                 return;
    522             }
    523             if (c1 < 0) {
    524                 break;
    525             }
    526         }
    527     }
    528 
    529     public void TestFCD() {
    530         CollationData data = CollationRoot.getData();
    531 
    532         // Input string, not FCD.
    533         StringBuilder buf = new StringBuilder();
    534         buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062")
    535             .appendCodePoint(0x1D15F)   // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
    536             .append("\u0327\u0308")     // ccc=202, 230
    537             .appendCodePoint(0x1D16D)   // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
    538             .appendCodePoint(0x1D15F)
    539             .appendCodePoint(0x1D16D)
    540             .append("\uac01")
    541             .append("\u00e7")           // Character with tccc!=0 decomposed together with mis-ordered sequence.
    542             .appendCodePoint(0x1D16D).appendCodePoint(0x1D165)
    543             .append("\u00e1")           // Character with tccc!=0 decomposed together with decomposed sequence.
    544             .append("\u0f73\u0f75")     // Tibetan composite vowels must be decomposed.
    545             .append("\u4e00\u0f81");
    546         String s = buf.toString();
    547 
    548         // Expected code points.
    549         int[] cp = {
    550             0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
    551             0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
    552             0x1D15F, 0x1D16D,
    553             0xac01,
    554             0x63, 0x327, 0x1D165, 0x1D16D,
    555             0x61,
    556             0xf71, 0xf71, 0xf72, 0xf74, 0x301,
    557             0x4e00, 0xf71, 0xf80
    558         };
    559 
    560         FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0);
    561         CodePointIterator cpi = new CodePointIterator(cp);
    562         checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
    563 
    564         cpi.resetToStart();
    565         UCharacterIterator iter = UCharacterIterator.getInstance(s);
    566         FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0);
    567         checkFCD("FCDIterCollationIterator", uici, cpi);
    568     }
    569 
    570     private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit,
    571             int n, int someLength, int minCount) {
    572 
    573         if (!cw.allocWeights(lowerLimit, upperLimit, n)) {
    574             errln("CollationWeights::allocWeights(0x"
    575                     + Utility.hex(lowerLimit) + ",0x"
    576                     + Utility.hex(upperLimit) + ","
    577                     + n + ") = false");
    578             return;
    579         }
    580         long previous = lowerLimit;
    581         int count = 0; // number of weights that have someLength
    582         for (int i = 0; i < n; ++i) {
    583             long w = cw.nextWeight();
    584             if (w == 0xffffffffL) {
    585                 errln("CollationWeights::allocWeights(0x"
    586                         + Utility.hex(lowerLimit) + ",0x"
    587                         + Utility.hex(upperLimit) + ",0x"
    588                         + n + ").nextWeight() returns only "
    589                         + i + " weights");
    590                 return;
    591             }
    592             if (!(previous < w && w < upperLimit)) {
    593                 errln("CollationWeights::allocWeights(0x"
    594                         + Utility.hex(lowerLimit) + ",0x"
    595                         + Utility.hex(upperLimit) + ","
    596                         + n + ").nextWeight() number "
    597                         + (i + 1) + " -> 0x" + Utility.hex(w)
    598                         + " not between "
    599                         + Utility.hex(previous) + " and "
    600                         + Utility.hex(upperLimit));
    601                 return;
    602             }
    603             if (CollationWeights.lengthOfWeight(w) == someLength) {
    604                 ++count;
    605             }
    606         }
    607         if (count < minCount) {
    608             errln("CollationWeights::allocWeights(0x"
    609                     + Utility.hex(lowerLimit) + ",0x"
    610                     + Utility.hex(upperLimit) + ","
    611                     + n + ").nextWeight() returns only "
    612                     + count + " < " + minCount + " weights of length "
    613                     + someLength);
    614 
    615         }
    616     }
    617 
    618     public void TestCollationWeights() {
    619         CollationWeights cw = new CollationWeights();
    620 
    621         // Non-compressible primaries use 254 second bytes 02..FF.
    622         logln("CollationWeights.initForPrimary(non-compressible)");
    623         cw.initForPrimary(false);
    624         // Expect 1 weight 11 and 254 weights 12xx.
    625         checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1);
    626         checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254);
    627         // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
    628         checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255);
    629         // Expect 254 two-byte weights from the ranges 10ff and 11xx.
    630         checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254);
    631         // Expect 254^2=64516 three-byte weights.
    632         // During computation, there should be 3 three-byte ranges
    633         // 10ffff, 11xxxx, 120202.
    634         // The middle one should be split 64515:1,
    635         // and the newly-split-off range and the last ranged lengthened.
    636         checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516);
    637         // Expect weights 1102 & 1103.
    638         checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2);
    639         // Expect weights 102102 & 102103.
    640         checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
    641 
    642         // Compressible primaries use 251 second bytes 04..FE.
    643         logln("CollationWeights.initForPrimary(compressible)");
    644         cw.initForPrimary(true);
    645         // Expect 1 weight 11 and 251 weights 12xx.
    646         checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1);
    647         checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251);
    648         // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
    649         checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252);
    650         // Expect weights 1104 & 1105.
    651         checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2);
    652         // Expect weights 102102 & 102103.
    653         checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
    654 
    655         // Secondary and tertiary weights use only bytes 3 & 4.
    656         logln("CollationWeights.initForSecondary()");
    657         cw.initForSecondary();
    658         // Expect weights fbxx and all four fc..ff.
    659         checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4);
    660 
    661         logln("CollationWeights.initForTertiary()");
    662         cw.initForTertiary();
    663         // Expect weights 3dxx and both 3e & 3f.
    664         checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2);
    665     }
    666 
    667     private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) {
    668         long p1 = p >>> 24;
    669         long p2 = (p >>> 16) & 0xff;
    670         long p3 = (p >>> 8) & 0xff;
    671         long p4 = p & 0xff;
    672         long s1 = s >>> 8;
    673         long s2 = s & 0xff;
    674         // ctq = Case, Tertiary, Quaternary
    675         long c = (ctq & Collation.CASE_MASK) >>> 14;
    676         long t = ctq & Collation.ONLY_TERTIARY_MASK;
    677         long t1 = t >>> 8;
    678         long t2 = t & 0xff;
    679         long q = ctq & Collation.QUATERNARY_MASK;
    680         // No leading zero bytes.
    681         if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
    682             return false;
    683         }
    684         // No intermediate zero bytes.
    685         if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
    686             return false;
    687         }
    688         if (p2 != 0 && p3 == 0 && p4 != 0) {
    689             return false;
    690         }
    691         // Minimum & maximum lead bytes.
    692         if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE)
    693                 || s1 == Collation.LEVEL_SEPARATOR_BYTE
    694                 || t1 == Collation.LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
    695             return false;
    696         }
    697         if (c > 2) {
    698             return false;
    699         }
    700         // The valid byte range for the second primary byte depends on compressibility.
    701         if (p2 != 0) {
    702             if (data.isCompressibleLeadByte((int)p1)) {
    703                 if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE
    704                         || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
    705                     return false;
    706                 }
    707             } else {
    708                 if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) {
    709                     return false;
    710                 }
    711             }
    712         }
    713         // Other bytes just need to avoid the level separator.
    714         // Trailing zeros are ok.
    715         // assert (Collation.LEVEL_SEPARATOR_BYTE == 1);
    716         if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE
    717                 || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) {
    718             return false;
    719         }
    720         // Well-formed CEs.
    721         if (p == 0) {
    722             if (s == 0) {
    723                 if (t == 0) {
    724                     // Completely ignorable CE.
    725                     // Quaternary CEs are not supported.
    726                     if (c != 0 || q != 0) {
    727                         return false;
    728                     }
    729                 } else {
    730                     // Tertiary CE.
    731                     if (t < re.getTertiaryBoundary() || c != 2) {
    732                         return false;
    733                     }
    734                 }
    735             } else {
    736                 // Secondary CE.
    737                 if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
    738                     return false;
    739                 }
    740             }
    741         } else {
    742             // Primary CE.
    743             if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary())
    744                     || s >= re.getSecondaryBoundary()) {
    745                 return false;
    746             }
    747             if (t == 0 || t >= re.getTertiaryBoundary()) {
    748                 return false;
    749             }
    750         }
    751         return true;
    752     }
    753 
    754     private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) {
    755         long p = ce >>> 32;
    756         long secTer = ce & 0xffffffffL;
    757         return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff);
    758     }
    759 
    760     private static class RootElementsIterator {
    761         CollationData data;
    762         long[] elements;
    763         int length;
    764 
    765         long pri;
    766         long secTer;
    767         int index;
    768 
    769         RootElementsIterator(CollationData root) {
    770             data = root;
    771             elements = root.rootElements;
    772             length = elements.length;
    773             pri = 0;
    774             secTer = 0;
    775             index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX];
    776         }
    777 
    778         boolean next() {
    779             if (index >= length) {
    780                 return false;
    781             }
    782             long p = elements[index];
    783             if (p == CollationRootElements.PRIMARY_SENTINEL) {
    784                 return false;
    785             }
    786             if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) {
    787                 ++index;
    788                 secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG;
    789                 return true;
    790             }
    791             if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) {
    792                 // End of a range, enumerate the primaries in the range.
    793                 int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK;
    794                 p &= 0xffffff00;
    795                 if (pri == p) {
    796                     // Finished the range, return the next CE after it.
    797                     ++index;
    798                     return next();
    799                 }
    800                 assert (pri < p);
    801                 // Return the next primary in this range.
    802                 boolean isCompressible = data.isCompressiblePrimary(pri);
    803                 if ((pri & 0xffff) == 0) {
    804                     pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step);
    805                 } else {
    806                     pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step);
    807                 }
    808                 return true;
    809             }
    810             // Simple primary CE.
    811             ++index;
    812             pri = p;
    813             // Does this have an explicit below-common sec/ter unit,
    814             // or does it imply a common one?
    815             if(index == length) {
    816                 secTer = Collation.COMMON_SEC_AND_TER_CE;
    817             } else {
    818                 secTer = elements[index];
    819                 if((secTer & CollationRootElements.SEC_TER_DELTA_FLAG) == 0) {
    820                     // No sec/ter delta.
    821                     secTer = Collation.COMMON_SEC_AND_TER_CE;
    822                 } else {
    823                     secTer &= ~CollationRootElements.SEC_TER_DELTA_FLAG;
    824                     if(secTer > Collation.COMMON_SEC_AND_TER_CE) {
    825                         // Implied sec/ter.
    826                         secTer = Collation.COMMON_SEC_AND_TER_CE;
    827                     } else {
    828                         // Explicit sec/ter below common/common.
    829                         ++index;
    830                     }
    831                 }
    832             }
    833             return true;
    834         }
    835 
    836         long getPrimary() {
    837             return pri;
    838         }
    839 
    840         long getSecTer() {
    841             return secTer;
    842         }
    843     }
    844 
    845     public void TestRootElements() {
    846         CollationData root = CollationRoot.getData();
    847 
    848         CollationRootElements rootElements = new CollationRootElements(root.rootElements);
    849         RootElementsIterator iter = new RootElementsIterator(root);
    850 
    851         // We check each root CE for validity,
    852         // and we also verify that there is a tailoring gap between each two CEs.
    853         CollationWeights cw1c = new CollationWeights(); // compressible primary weights
    854         CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights
    855         CollationWeights cw2 = new CollationWeights();
    856         CollationWeights cw3 = new CollationWeights();
    857 
    858         cw1c.initForPrimary(true);
    859         cw1u.initForPrimary(false);
    860         cw2.initForSecondary();
    861         cw3.initForTertiary();
    862 
    863         // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
    864         // nor the special merge-separator CE for U+FFFE.
    865         long prevPri = 0;
    866         long prevSec = 0;
    867         long prevTer = 0;
    868 
    869         while (iter.next()) {
    870             long pri = iter.getPrimary();
    871             long secTer = iter.getSecTer();
    872             // CollationRootElements CEs must have 0 case and quaternary bits.
    873             if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) {
    874                 errln("CollationRootElements CE has non-zero case and/or quaternary bits: "
    875                         + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
    876             }
    877             long sec = secTer >>> 16;
    878             long ter = secTer & Collation.ONLY_TERTIARY_MASK;
    879             long ctq = ter;
    880             if (pri == 0 && sec == 0 && ter != 0) {
    881                 // Tertiary CEs must have uppercase bits,
    882                 // but they are not stored in the CollationRootElements.
    883                 ctq |= 0x8000;
    884             }
    885             if (!isValidCE(rootElements, root, pri, sec, ctq)) {
    886                 errln("invalid root CE 0x"
    887                         + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
    888             } else {
    889                 if (pri != prevPri) {
    890                     long newWeight = 0;
    891                     if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) {
    892                         // There is currently no tailoring gap after primary ignorables,
    893                         // and we forbid tailoring after U+FFFD and U+FFFF.
    894                     } else if (root.isCompressiblePrimary(prevPri)) {
    895                         if (!cw1c.allocWeights(prevPri, pri, 1)) {
    896                             errln("no primary/compressible tailoring gap between "
    897                                     + "0x" + Utility.hex(prevPri, 8)
    898                                     + " and 0x" + Utility.hex(pri, 8));
    899                         } else {
    900                             newWeight = cw1c.nextWeight();
    901                         }
    902                     } else {
    903                         if (!cw1u.allocWeights(prevPri, pri, 1)) {
    904                             errln("no primary/uncompressible tailoring gap between "
    905                                     + "0x" + Utility.hex(prevPri, 8)
    906                                     + " and 0x" + Utility.hex(pri, 8));
    907                         } else {
    908                             newWeight = cw1u.nextWeight();
    909                         }
    910                     }
    911                     if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
    912                         errln("mis-allocated primary weight, should get "
    913                                 + "0x" + Utility.hex(prevPri, 8)
    914                                 + " < 0x" + Utility.hex(newWeight, 8)
    915                                 + " < 0x" + Utility.hex(pri, 8));
    916                     }
    917                 } else if (sec != prevSec) {
    918                     long lowerLimit = prevSec == 0 ?
    919                             rootElements.getSecondaryBoundary() - 0x100 : prevSec;
    920                     if (!cw2.allocWeights(lowerLimit, sec, 1)) {
    921                         errln("no secondary tailoring gap between "
    922                                 + "0x" + Utility.hex(lowerLimit)
    923                                 + " and 0x" + Utility.hex(sec));
    924                     } else {
    925                         long newWeight = cw2.nextWeight();
    926                         if (!(prevSec < newWeight && newWeight < sec)) {
    927                             errln("mis-allocated secondary weight, should get "
    928                                     + "0x" + Utility.hex(lowerLimit)
    929                                     + " < 0x" + Utility.hex(newWeight)
    930                                     + " < 0x" + Utility.hex(sec));
    931                         }
    932                     }
    933                 } else if (ter != prevTer) {
    934                     long lowerLimit = prevTer == 0 ?
    935                             rootElements.getTertiaryBoundary() - 0x100 : prevTer;
    936                     if (!cw3.allocWeights(lowerLimit, ter, 1)) {
    937                         errln("no tertiary tailoring gap between "
    938                                 + "0x" + Utility.hex(lowerLimit)
    939                                 + " and 0x" + Utility.hex(ter));
    940                     } else {
    941                         long newWeight = cw3.nextWeight();
    942                         if (!(prevTer < newWeight && newWeight < ter)) {
    943                             errln("mis-allocated tertiary weight, should get "
    944                                     + "0x" + Utility.hex(lowerLimit)
    945                                     + " < 0x" + Utility.hex(newWeight)
    946                                     + " < 0x" + Utility.hex(ter));
    947                         }
    948                     }
    949                 } else {
    950                     errln("duplicate root CE 0x"
    951                             + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
    952                 }
    953             }
    954             prevPri = pri;
    955             prevSec = sec;
    956             prevTer = ter;
    957         }
    958     }
    959 
    960     public void TestTailoredElements() {
    961         CollationData root = CollationRoot.getData();
    962         CollationRootElements rootElements = new CollationRootElements(root.rootElements);
    963 
    964         Set<String> prevLocales = new HashSet<String>();
    965         prevLocales.add("");
    966         prevLocales.add("root");
    967         prevLocales.add("root@collation=standard");
    968 
    969         long[] ces;
    970         ULocale[] locales = Collator.getAvailableULocales();
    971         String localeID = "root";
    972         int locIdx = 0;
    973 
    974         for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) {
    975             ULocale locale = new ULocale(localeID);
    976             String[] types = Collator.getKeywordValuesForLocale("collation", locale, false);
    977             for (int typeIdx = 0; typeIdx < types.length; ++typeIdx) {
    978                 String type = types[typeIdx];  // first: default type
    979                 if (type.startsWith("private-")) {
    980                     errln("Collator.getKeywordValuesForLocale(" + localeID +
    981                             ") returns private collation keyword: " + type);
    982                 }
    983                 ULocale localeWithType = locale.setKeywordValue("collation", type);
    984                 Collator coll = Collator.getInstance(localeWithType);
    985                 ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE);
    986                 if (prevLocales.contains(actual.getName())) {
    987                     continue;
    988                 }
    989                 prevLocales.add(actual.getName());
    990                 logln("TestTailoredElements(): requested " + localeWithType.getName()
    991                         + " -> actual " + actual.getName());
    992                 if (!(coll instanceof RuleBasedCollator)) {
    993                     continue;
    994                 }
    995                 RuleBasedCollator rbc = (RuleBasedCollator) coll;
    996 
    997                 // Note: It would be better to get tailored strings such that we can
    998                 // identify the prefix, and only get the CEs for the prefix+string,
    999                 // not also for the prefix.
   1000                 // There is currently no API for that.
   1001                 // It would help in an unusual case where a contraction starting in the prefix
   1002                 // extends past its end, and we do not see the intended mapping.
   1003                 // For example, for a mapping p|st, if there is also a contraction ps,
   1004                 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
   1005                 UnicodeSet tailored = coll.getTailoredSet();
   1006                 UnicodeSetIterator iter = new UnicodeSetIterator(tailored);
   1007                 while (iter.next()) {
   1008                     String s = iter.getString();
   1009                     ces = rbc.internalGetCEs(s);
   1010                     for (int i = 0; i < ces.length; ++i) {
   1011                         long ce = ces[i];
   1012                         if (!isValidCE(rootElements, root, ce)) {
   1013                             logln(prettify(s));
   1014                             errln("invalid tailored CE 0x" + Utility.hex(ce, 16)
   1015                                     + " at CE index " + i + " from string:");
   1016                         }
   1017                     }
   1018                 }
   1019             }
   1020         }
   1021     }
   1022 
   1023     private static boolean isSpace(char c) {
   1024         return (c == 0x09 || c == 0x20 || c == 0x3000);
   1025     }
   1026 
   1027     private static boolean isSectionStarter(char c) {
   1028         return (c == '%' || c == '*' || c == '@');
   1029     }
   1030 
   1031     private int skipSpaces(int i) {
   1032         while (isSpace(fileLine.charAt(i))) {
   1033             ++i;
   1034         }
   1035         return i;
   1036     }
   1037 
   1038     private String printSortKey(byte[] p) {
   1039         StringBuilder s = new StringBuilder();
   1040         for (int i = 0; i < p.length; ++i) {
   1041             if (i > 0) {
   1042                 s.append(' ');
   1043             }
   1044             byte b = p[i];
   1045             if (b == 0) {
   1046                 s.append('.');
   1047             } else if (b == 1) {
   1048                 s.append('|');
   1049             } else {
   1050                 s.append(String.format("%02x", b & 0xff));
   1051             }
   1052         }
   1053         return s.toString();
   1054     }
   1055 
   1056     private String printCollationKey(CollationKey key) {
   1057         byte[] p = key.toByteArray();
   1058         return printSortKey(p);
   1059     }
   1060 
   1061     private boolean readNonEmptyLine(BufferedReader in) throws IOException {
   1062         for (;;) {
   1063             String line = in.readLine();
   1064             if (line == null) {
   1065                 fileLine = null;
   1066                 return false;
   1067             }
   1068             if (fileLineNumber == 0 && line.length() != 0 && line.charAt(0) == '\uFEFF') {
   1069                 line = line.substring(1);  // Remove the BOM.
   1070             }
   1071             ++fileLineNumber;
   1072             // Strip trailing comments and spaces
   1073             int idx = line.indexOf('#');
   1074             if (idx < 0) {
   1075                 idx = line.length();
   1076             }
   1077             while (idx > 0 && isSpace(line.charAt(idx - 1))) {
   1078                 --idx;
   1079             }
   1080             if (idx != 0) {
   1081                 fileLine = idx < line.length() ? line.substring(0, idx) : line;
   1082                 return true;
   1083             }
   1084             // Empty line, continue.
   1085         }
   1086     }
   1087 
   1088     private int parseString(int start, Output<String> prefix, Output<String> s) throws ParseException {
   1089         int length = fileLine.length();
   1090         int i;
   1091         for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) {
   1092         }
   1093         int pipeIndex = fileLine.indexOf('|', start);
   1094         if (pipeIndex >= 0 && pipeIndex < i) {
   1095             String tmpPrefix  = Utility.unescape(fileLine.substring(start, pipeIndex));
   1096             if (tmpPrefix.length() == 0) {
   1097                 prefix.value = null;
   1098                 logln(fileLine);
   1099                 throw new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber);
   1100             }
   1101             prefix.value = tmpPrefix;
   1102             start = pipeIndex + 1;
   1103         } else {
   1104             prefix.value = null;
   1105         }
   1106 
   1107         String tmp = Utility.unescape(fileLine.substring(start, i));
   1108         if (tmp.length() == 0) {
   1109             s.value = null;
   1110             logln(fileLine);
   1111             throw new ParseException("empty string on line " + fileLineNumber, fileLineNumber);
   1112         }
   1113         s.value = tmp;
   1114         return i;
   1115     }
   1116 
   1117     private int parseRelationAndString(Output<String> s) throws ParseException {
   1118         int relation = Collation.NO_LEVEL;
   1119         int start;
   1120         if (fileLine.charAt(0) == '<') {
   1121             char second = fileLine.charAt(1);
   1122             start = 2;
   1123             switch(second) {
   1124             case 0x31:  // <1
   1125                 relation = Collation.PRIMARY_LEVEL;
   1126                 break;
   1127             case 0x32:  // <2
   1128                 relation = Collation.SECONDARY_LEVEL;
   1129                 break;
   1130             case 0x33:  // <3
   1131                 relation = Collation.TERTIARY_LEVEL;
   1132                 break;
   1133             case 0x34:  // <4
   1134                 relation = Collation.QUATERNARY_LEVEL;
   1135                 break;
   1136             case 0x63:  // <c
   1137                 relation = Collation.CASE_LEVEL;
   1138                 break;
   1139             case 0x69:  // <i
   1140                 relation = Collation.IDENTICAL_LEVEL;
   1141                 break;
   1142             default:  // just <
   1143                 relation = Collation.NO_LEVEL;
   1144                 start = 1;
   1145                 break;
   1146             }
   1147         } else if (fileLine.charAt(0) == '=') {
   1148             relation = Collation.ZERO_LEVEL;
   1149             start = 1;
   1150         } else {
   1151             start = 0;
   1152         }
   1153 
   1154         if (start == 0 || !isSpace(fileLine.charAt(start))) {
   1155             logln(fileLine);
   1156             throw new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line "
   1157                                         + fileLineNumber, fileLineNumber);
   1158         }
   1159 
   1160         start = skipSpaces(start);
   1161         Output<String> prefixOut = new Output<String>();
   1162         start = parseString(start, prefixOut, s);
   1163         if (prefixOut.value != null) {
   1164             logln(fileLine);
   1165             throw new ParseException("prefix string not allowed for test string: on line "
   1166                                         + fileLineNumber, fileLineNumber);
   1167         }
   1168         if (start < fileLine.length()) {
   1169             logln(fileLine);
   1170             throw new ParseException("unexpected line contents after test string on line "
   1171                                         + fileLineNumber, fileLineNumber);
   1172         }
   1173 
   1174         return relation;
   1175     }
   1176 
   1177     private void parseAndSetAttribute() throws ParseException {
   1178         // Parse attributes even if the Collator could not be created,
   1179         // in order to report syntax errors.
   1180         int start = skipSpaces(1);
   1181         int equalPos = fileLine.indexOf('=');
   1182         if (equalPos < 0) {
   1183             if (fileLine.regionMatches(start, "reorder", 0, 7)) {
   1184                 parseAndSetReorderCodes(start + 7);
   1185                 return;
   1186             }
   1187             logln(fileLine);
   1188             throw new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber);
   1189         }
   1190 
   1191         String attrString = fileLine.substring(start,  equalPos);
   1192         String valueString = fileLine.substring(equalPos + 1);
   1193         if (attrString.equals("maxVariable")) {
   1194             int max;
   1195             if (valueString.equals("space")) {
   1196                 max = ReorderCodes.SPACE;
   1197             } else if(valueString.equals("punct")) {
   1198                 max = ReorderCodes.PUNCTUATION;
   1199             } else if(valueString.equals("symbol")) {
   1200                 max = ReorderCodes.SYMBOL;
   1201             } else if(valueString.equals("currency")) {
   1202                 max = ReorderCodes.CURRENCY;
   1203             } else {
   1204                 logln(fileLine);
   1205                 throw new ParseException("invalid attribute value name on line "
   1206                                             + fileLineNumber, fileLineNumber);
   1207             }
   1208             if (coll != null) {
   1209                 coll.setMaxVariable(max);
   1210             }
   1211             fileLine = null;
   1212             return;
   1213         }
   1214 
   1215         boolean parsed = true;
   1216         RuleBasedCollator rbc = (RuleBasedCollator)coll;
   1217         if (attrString.equals("backwards")) {
   1218             if (valueString.equals("on")) {
   1219                 if (rbc != null) rbc.setFrenchCollation(true);
   1220             } else if (valueString.equals("off")) {
   1221                 if (rbc != null) rbc.setFrenchCollation(false);
   1222             } else if (valueString.equals("default")) {
   1223                 if (rbc != null) rbc.setFrenchCollationDefault();
   1224             } else {
   1225                 parsed = false;
   1226             }
   1227         } else if (attrString.equals("alternate")) {
   1228             if (valueString.equals("non-ignorable")) {
   1229                 if (rbc != null) rbc.setAlternateHandlingShifted(false);
   1230             } else if (valueString.equals("shifted")) {
   1231                 if (rbc != null) rbc.setAlternateHandlingShifted(true);
   1232             } else if (valueString.equals("default")) {
   1233                 if (rbc != null) rbc.setAlternateHandlingDefault();
   1234             } else {
   1235                 parsed = false;
   1236             }
   1237         } else if (attrString.equals("caseFirst")) {
   1238             if (valueString.equals("upper")) {
   1239                 if (rbc != null) rbc.setUpperCaseFirst(true);
   1240             } else if (valueString.equals("lower")) {
   1241                 if (rbc != null) rbc.setLowerCaseFirst(true);
   1242             } else if (valueString.equals("default")) {
   1243                 if (rbc != null) rbc.setCaseFirstDefault();
   1244             } else {
   1245                 parsed = false;
   1246             }
   1247         } else if (attrString.equals("caseLevel")) {
   1248             if (valueString.equals("on")) {
   1249                 if (rbc != null) rbc.setCaseLevel(true);
   1250             } else if (valueString.equals("off")) {
   1251                 if (rbc != null) rbc.setCaseLevel(false);
   1252             } else if (valueString.equals("default")) {
   1253                 if (rbc != null) rbc.setCaseLevelDefault();
   1254             } else {
   1255                 parsed = false;
   1256             }
   1257         } else if (attrString.equals("strength")) {
   1258             if (valueString.equals("primary")) {
   1259                 if (rbc != null) rbc.setStrength(Collator.PRIMARY);
   1260             } else if (valueString.equals("secondary")) {
   1261                 if (rbc != null) rbc.setStrength(Collator.SECONDARY);
   1262             } else if (valueString.equals("tertiary")) {
   1263                 if (rbc != null) rbc.setStrength(Collator.TERTIARY);
   1264             } else if (valueString.equals("quaternary")) {
   1265                 if (rbc != null) rbc.setStrength(Collator.QUATERNARY);
   1266             } else if (valueString.equals("identical")) {
   1267                 if (rbc != null) rbc.setStrength(Collator.IDENTICAL);
   1268             } else if (valueString.equals("default")) {
   1269                 if (rbc != null) rbc.setStrengthDefault();
   1270             } else {
   1271                 parsed = false;
   1272             }
   1273         } else if (attrString.equals("numeric")) {
   1274             if (valueString.equals("on")) {
   1275                 if (rbc != null) rbc.setNumericCollation(true);
   1276             } else if (valueString.equals("off")) {
   1277                 if (rbc != null) rbc.setNumericCollation(false);
   1278             } else if (valueString.equals("default")) {
   1279                 if (rbc != null) rbc.setNumericCollationDefault();
   1280             } else {
   1281                 parsed = false;
   1282             }
   1283         } else {
   1284             logln(fileLine);
   1285             throw new ParseException("invalid attribute name on line "
   1286                                         + fileLineNumber, fileLineNumber);
   1287         }
   1288         if (!parsed) {
   1289             logln(fileLine);
   1290             throw new ParseException(
   1291                     "invalid attribute value name or attribute=value combination on line "
   1292                     + fileLineNumber, fileLineNumber);
   1293         }
   1294 
   1295         fileLine = null;
   1296     }
   1297 
   1298     private void parseAndSetReorderCodes(int start) throws ParseException {
   1299         UVector32 reorderCodes = new UVector32();
   1300         while (start < fileLine.length()) {
   1301             start = skipSpaces(start);
   1302             int limit = start;
   1303             while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) {
   1304                 ++limit;
   1305             }
   1306             String name = fileLine.substring(start, limit);
   1307             int code = CollationRuleParser.getReorderCode(name);
   1308             if (code < -1) {
   1309                 if (name.equalsIgnoreCase("default")) {
   1310                     code = ReorderCodes.DEFAULT;  // -1
   1311                 } else {
   1312                     logln(fileLine);
   1313                     throw new ParseException("invalid reorder code '" + name + "' on line "
   1314                                                 + fileLineNumber, fileLineNumber);
   1315                 }
   1316             }
   1317             reorderCodes.addElement(code);
   1318             start = limit;
   1319         }
   1320         if (coll != null) {
   1321             int[] reorderCodesArray = new int[reorderCodes.size()];
   1322             System.arraycopy(reorderCodes.getBuffer(), 0,
   1323                     reorderCodesArray, 0, reorderCodes.size());
   1324             coll.setReorderCodes(reorderCodesArray);
   1325         }
   1326 
   1327         fileLine = null;
   1328     }
   1329 
   1330     private void buildTailoring(BufferedReader in) throws IOException {
   1331         StringBuilder rules = new StringBuilder();
   1332         while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) {
   1333             rules.append(Utility.unescape(fileLine));
   1334         }
   1335 
   1336         try {
   1337             coll = new RuleBasedCollator(rules.toString());
   1338         } catch (Exception e) {
   1339             logln(rules.toString());
   1340             // Android patch: Add --omitCollationRules to genrb.
   1341             warnln("RuleBasedCollator(rules) failed - " + e.getMessage());
   1342             // Android patch end.
   1343             coll = null;
   1344         }
   1345     }
   1346 
   1347     private void setRootCollator() {
   1348         coll = Collator.getInstance(ULocale.ROOT);
   1349     }
   1350 
   1351     private void setLocaleCollator() {
   1352         coll = null;
   1353         ULocale locale = null;
   1354         if (fileLine.length() > 9) {
   1355             String localeID = fileLine.substring(9); // "@ locale <langTag>"
   1356             try {
   1357                 locale = new ULocale(localeID);  // either locale ID or language tag
   1358             } catch (IllformedLocaleException e) {
   1359                 locale = null;
   1360             }
   1361         }
   1362         if (locale == null) {
   1363             logln(fileLine);
   1364             errln("invalid language tag on line " + fileLineNumber);
   1365             return;
   1366         }
   1367 
   1368         logln("creating a collator for locale ID " + locale.getName());
   1369         try {
   1370             coll = Collator.getInstance(locale);
   1371         } catch (Exception e) {
   1372             errln("unable to create a collator for locale " + locale +
   1373                     " on line " + fileLineNumber + " - " + e);
   1374         }
   1375     }
   1376 
   1377     private boolean needsNormalization(String s) {
   1378         if (!fcd.isNormalized(s)) {
   1379             return true;
   1380         }
   1381         // In some sequences with Tibetan composite vowel signs,
   1382         // even if the string passes the FCD check,
   1383         // those composites must be decomposed.
   1384         // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
   1385         int index = 0;
   1386         while((index = s.indexOf(0xf71, index)) >= 0) {
   1387             if (++index < s.length()) {
   1388                 char c = s.charAt(index);
   1389                 if (c == 0xf73 || c == 0xf75 || c == 0xf81) {
   1390                     return true;
   1391                 }
   1392             }
   1393         }
   1394         return false;
   1395     }
   1396 
   1397     private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) {
   1398         CollationKey key = coll.getCollationKey(s);
   1399         keyOut.value = key;
   1400 
   1401         byte[] keyBytes = key.toByteArray();
   1402         if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) {
   1403             logln(fileTestName);
   1404             logln(line);
   1405             logln(printCollationKey(key));
   1406             errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key");
   1407             return false;
   1408         }
   1409 
   1410         int numLevels = coll.getStrength();
   1411         if (numLevels < Collator.IDENTICAL) {
   1412             ++numLevels;
   1413         } else {
   1414             numLevels = 5;
   1415         }
   1416         if (((RuleBasedCollator)coll).isCaseLevel()) {
   1417             ++numLevels;
   1418         }
   1419         int numLevelSeparators = 0;
   1420         for (int i = 0; i < (keyBytes.length - 1); ++i) {
   1421             byte b = keyBytes[i];
   1422             if (b == 0) {
   1423                 logln(fileTestName);
   1424                 logln(line);
   1425                 logln(printCollationKey(key));
   1426                 errln("Collator(" + norm + ").getCollationKey() contains a 00 byte");
   1427                 return false;
   1428             }
   1429             if (b == 1) {
   1430                 ++numLevelSeparators;
   1431             }
   1432         }
   1433         if (numLevelSeparators != (numLevels - 1)) {
   1434             logln(fileTestName);
   1435             logln(line);
   1436             logln(printCollationKey(key));
   1437             errln("Collator(" + norm + ").getCollationKey() has "
   1438                     + numLevelSeparators + " level separators for "
   1439                     + numLevels + " levels");
   1440             return false;
   1441         }
   1442 
   1443         // No nextSortKeyPart support in ICU4J
   1444 
   1445         return true;
   1446     }
   1447 
   1448     /**
   1449      * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
   1450      * Leaves key unchanged if s does not contain U+FFFE.
   1451      * @return true if the key was successfully changed
   1452      */
   1453     private boolean getMergedCollationKey(String s, Output<CollationKey> key) {
   1454         CollationKey mergedKey = null;
   1455         int sLength = s.length();
   1456         int segmentStart = 0;
   1457         for (int i = 0;;) {
   1458             if (i == sLength) {
   1459                 if (segmentStart == 0) {
   1460                     // s does not contain any U+FFFE.
   1461                     return false;
   1462                 }
   1463             } else if (s.charAt(i) != '\uFFFE') {
   1464                 ++i;
   1465                 continue;
   1466             }
   1467             // Get the sort key for another segment and merge it into mergedKey.
   1468             CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i));
   1469             if (mergedKey == null) {
   1470                 mergedKey = tmpKey;
   1471             } else {
   1472                 mergedKey = mergedKey.merge(tmpKey);
   1473             }
   1474             if (i == sLength) {
   1475                 break;
   1476             }
   1477             segmentStart = ++i;
   1478         }
   1479         key.value = mergedKey;
   1480         return true;
   1481     }
   1482 
   1483     private static int getDifferenceLevel(CollationKey prevKey, CollationKey key,
   1484             int order, boolean collHasCaseLevel) {
   1485         if (order == Collation.EQUAL) {
   1486             return Collation.NO_LEVEL;
   1487         }
   1488         byte[] prevBytes = prevKey.toByteArray();
   1489         byte[] bytes = key.toByteArray();
   1490         int level = Collation.PRIMARY_LEVEL;
   1491         for (int i = 0;; ++i) {
   1492             byte b = prevBytes[i];
   1493             if (b != bytes[i]) {
   1494                 break;
   1495             }
   1496             if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) {
   1497                 ++level;
   1498                 if (level == Collation.CASE_LEVEL && !collHasCaseLevel) {
   1499                     ++level;
   1500                 }
   1501             }
   1502         }
   1503         return level;
   1504     }
   1505 
   1506     private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s,
   1507                                     int expectedOrder, int expectedLevel) {
   1508         // Get the sort keys first, for error debug output.
   1509         Output<CollationKey> prevKeyOut = new Output<CollationKey>();
   1510         CollationKey prevKey;
   1511         if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) {
   1512             return false;
   1513         }
   1514         prevKey = prevKeyOut.value;
   1515 
   1516         Output<CollationKey> keyOut = new Output<CollationKey>();
   1517         CollationKey key;
   1518         if (!getCollationKey(norm, fileLine, s, keyOut)) {
   1519             return false;
   1520         }
   1521         key = keyOut.value;
   1522 
   1523         int order = coll.compare(prevString, s);
   1524         if (order != expectedOrder) {
   1525             logln(fileTestName);
   1526             logln(prevFileLine);
   1527             logln(fileLine);
   1528             logln(printCollationKey(prevKey));
   1529             logln(printCollationKey(key));
   1530             errln("line " + fileLineNumber
   1531                     + " Collator(" + norm + ").compare(previous, current) wrong order: "
   1532                     + order + " != " + expectedOrder);
   1533             return false;
   1534         }
   1535         order = coll.compare(s, prevString);
   1536         if (order != -expectedOrder) {
   1537             logln(fileTestName);
   1538             logln(prevFileLine);
   1539             logln(fileLine);
   1540             logln(printCollationKey(prevKey));
   1541             logln(printCollationKey(key));
   1542             errln("line " + fileLineNumber
   1543                     + " Collator(" + norm + ").compare(current, previous) wrong order: "
   1544                     + order + " != " + -expectedOrder);
   1545             return false;
   1546         }
   1547 
   1548         order = prevKey.compareTo(key);
   1549         if (order != expectedOrder) {
   1550             logln(fileTestName);
   1551             logln(prevFileLine);
   1552             logln(fileLine);
   1553             logln(printCollationKey(prevKey));
   1554             logln(printCollationKey(key));
   1555             errln("line " + fileLineNumber
   1556                     + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: "
   1557                     + order + " != " + expectedOrder);
   1558             return false;
   1559         }
   1560         boolean collHasCaseLevel = ((RuleBasedCollator)coll).isCaseLevel();
   1561         int level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
   1562         if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) {
   1563             if (level != expectedLevel) {
   1564                 logln(fileTestName);
   1565                 logln(prevFileLine);
   1566                 logln(fileLine);
   1567                 logln(printCollationKey(prevKey));
   1568                 logln(printCollationKey(key));
   1569                 errln("line " + fileLineNumber
   1570                         + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()="
   1571                         + order + " wrong level: " + level + " != " + expectedLevel);
   1572                 return false;
   1573             }
   1574         }
   1575 
   1576         // If either string contains U+FFFE, then their sort keys must compare the same as
   1577         // the merged sort keys of each string's between-FFFE segments.
   1578         //
   1579         // It is not required that
   1580         //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
   1581         // only that those two methods yield the same order.
   1582         //
   1583         // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
   1584         Output<CollationKey> outPrevKey = new Output<CollationKey>(prevKey);
   1585         Output<CollationKey> outKey = new Output<CollationKey>(key);
   1586         if (getMergedCollationKey(prevString, outPrevKey) | getMergedCollationKey(s, outKey)) {
   1587             prevKey = outPrevKey.value;
   1588             key = outKey.value;
   1589             order = prevKey.compareTo(key);
   1590             if (order != expectedOrder) {
   1591                 logln(fileTestName);
   1592                 errln("line " + fileLineNumber
   1593                         + " Collator(" + norm + ").getCollationKey"
   1594                         + "(previous, current segments between U+FFFE)).merge().compareTo() wrong order: "
   1595                         + order + " != " + expectedOrder);
   1596                 logln(prevFileLine);
   1597                 logln(fileLine);
   1598                 logln(printCollationKey(prevKey));
   1599                 logln(printCollationKey(key));
   1600                 return false;
   1601             }
   1602             int mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
   1603             if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) {
   1604                 if(mergedLevel != level) {
   1605                     logln(fileTestName);
   1606                     errln("line " + fileLineNumber
   1607                         + " Collator(" + norm + ").getCollationKey"
   1608                         + "(previous, current segments between U+FFFE)).merge().compareTo()="
   1609                         + order + " wrong level: " + mergedLevel + " != " + level);
   1610                     logln(prevFileLine);
   1611                     logln(fileLine);
   1612                     logln(printCollationKey(prevKey));
   1613                     logln(printCollationKey(key));
   1614                     return false;
   1615                 }
   1616             }
   1617         }
   1618         return true;
   1619     }
   1620 
   1621     private void checkCompareStrings(BufferedReader in) throws IOException {
   1622         String prevFileLine = "(none)";
   1623         String prevString = "";
   1624         Output<String> sOut = new Output<String>();
   1625         while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) {
   1626             // Parse the line even if it will be ignored (when we do not have a Collator)
   1627             // in order to report syntax issues.
   1628             int relation;
   1629             try {
   1630                 relation = parseRelationAndString(sOut);
   1631             } catch (ParseException pe) {
   1632                 errln(pe.toString());
   1633                 break;
   1634             }
   1635             if(coll == null) {
   1636                 // We were unable to create the Collator but continue with tests.
   1637                 // Ignore test data for this Collator.
   1638                 // The next Collator creation might work.
   1639                 continue;
   1640             }
   1641             String s = sOut.value;
   1642             int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS;
   1643             int expectedLevel = relation;
   1644             boolean isOk = true;
   1645             if (!needsNormalization(prevString) && !needsNormalization(s)) {
   1646                 coll.setDecomposition(Collator.NO_DECOMPOSITION);
   1647                 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
   1648                                         expectedOrder, expectedLevel);
   1649             }
   1650             if (isOk) {
   1651                 coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
   1652                 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
   1653                                         expectedOrder, expectedLevel);
   1654             }
   1655             if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) {
   1656                 String pn = nfd.normalize(prevString);
   1657                 String n = nfd.normalize(s);
   1658                 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
   1659                                         expectedOrder, expectedLevel);
   1660             }
   1661             prevFileLine = fileLine;
   1662             prevString = s;
   1663         }
   1664     }
   1665 
   1666     public void TestDataDriven() {
   1667         nfd = Normalizer2.getNFDInstance();
   1668         fcd = Norm2AllModes.getFCDNormalizer2();
   1669 
   1670         BufferedReader in = null;
   1671 
   1672         try {
   1673             in = TestUtil.getDataReader("collationtest.txt", "UTF-8");
   1674 
   1675             // Read a new line if necessary.
   1676             // Sub-parsers leave the first line set that they do not handle.
   1677             while (fileLine != null || readNonEmptyLine(in)) {
   1678                 if (!isSectionStarter(fileLine.charAt(0))) {
   1679                     logln(fileLine);
   1680                     errln("syntax error on line " + fileLineNumber);
   1681                     return;
   1682                 }
   1683                 if (fileLine.startsWith("** test: ")) {
   1684                     fileTestName = fileLine;
   1685                     logln(fileLine);
   1686                     fileLine = null;
   1687                 } else if (fileLine.equals("@ root")) {
   1688                     setRootCollator();
   1689                     fileLine = null;
   1690                 } else if (fileLine.startsWith("@ locale ")) {
   1691                     setLocaleCollator();
   1692                     fileLine = null;
   1693                 } else if (fileLine.equals("@ rules")) {
   1694                     buildTailoring(in);
   1695                 } else if (fileLine.charAt(0) == '%'
   1696                         && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) {
   1697                     parseAndSetAttribute();
   1698                 } else if (fileLine.equals("* compare")) {
   1699                     checkCompareStrings(in);
   1700                 } else {
   1701                     logln(fileLine);
   1702                     errln("syntax error on line " + fileLineNumber);
   1703                     return;
   1704                 }
   1705             }
   1706         } catch (ParseException pe) {
   1707             errln(pe.toString());
   1708         } catch (IOException e) {
   1709             errln(e.getMessage());
   1710         } finally {
   1711             try {
   1712                 if (in != null) {
   1713                     in.close();
   1714                 }
   1715             } catch (IOException e) {
   1716                 e.printStackTrace();
   1717             }
   1718         }
   1719     }
   1720 }
   1721