1 /** 2 ******************************************************************************* 3 * Copyright (C) 2001-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 * CollationTest.java, ported from collationtest.cpp 7 * C++ version created on: 2012apr27 8 * created by: Markus W. Scherer 9 */ 10 package com.ibm.icu.dev.test.collator; 11 12 import java.io.BufferedReader; 13 import java.io.IOException; 14 import java.text.ParseException; 15 import java.util.HashSet; 16 import java.util.Set; 17 18 import com.ibm.icu.dev.test.TestFmwk; 19 import com.ibm.icu.dev.test.TestUtil; 20 import com.ibm.icu.impl.Norm2AllModes; 21 import com.ibm.icu.impl.Utility; 22 import com.ibm.icu.impl.coll.Collation; 23 import com.ibm.icu.impl.coll.CollationData; 24 import com.ibm.icu.impl.coll.CollationFCD; 25 import com.ibm.icu.impl.coll.CollationIterator; 26 import com.ibm.icu.impl.coll.CollationRoot; 27 import com.ibm.icu.impl.coll.CollationRootElements; 28 import com.ibm.icu.impl.coll.CollationRuleParser; 29 import com.ibm.icu.impl.coll.CollationWeights; 30 import com.ibm.icu.impl.coll.FCDIterCollationIterator; 31 import com.ibm.icu.impl.coll.FCDUTF16CollationIterator; 32 import com.ibm.icu.impl.coll.UTF16CollationIterator; 33 import com.ibm.icu.impl.coll.UVector32; 34 import com.ibm.icu.text.CollationElementIterator; 35 import com.ibm.icu.text.CollationKey; 36 import com.ibm.icu.text.Collator; 37 import com.ibm.icu.text.Collator.ReorderCodes; 38 import com.ibm.icu.text.Normalizer2; 39 import com.ibm.icu.text.RawCollationKey; 40 import com.ibm.icu.text.RuleBasedCollator; 41 import com.ibm.icu.text.UCharacterIterator; 42 import com.ibm.icu.text.UTF16; 43 import com.ibm.icu.text.UnicodeSet; 44 import com.ibm.icu.text.UnicodeSetIterator; 45 import com.ibm.icu.util.IllformedLocaleException; 46 import com.ibm.icu.util.Output; 47 import com.ibm.icu.util.ULocale; 48 49 public class CollationTest extends TestFmwk { 50 public static void main(String[] args) throws Exception{ 51 new CollationTest().run(args); 52 } 53 54 public CollationTest() { 55 } 56 57 // Fields 58 Normalizer2 fcd, nfd; 59 Collator coll; 60 String fileLine; 61 int fileLineNumber; 62 String fileTestName; 63 64 // package private methods ---------------------------------------------- 65 66 static void doTest(TestFmwk test, RuleBasedCollator col, String source, 67 String target, int result) 68 { 69 doTestVariant(test, col, source, target, result); 70 if (result == -1) { 71 doTestVariant(test, col, target, source, 1); 72 } 73 else if (result == 1) { 74 doTestVariant(test, col, target, source, -1); 75 } 76 else { 77 doTestVariant(test, col, target, source, 0); 78 } 79 80 CollationElementIterator iter = col.getCollationElementIterator(source); 81 backAndForth(test, iter); 82 iter.setText(target); 83 backAndForth(test, iter); 84 } 85 86 /** 87 * Return an integer array containing all of the collation orders 88 * returned by calls to next on the specified iterator 89 */ 90 static int[] getOrders(CollationElementIterator iter) 91 { 92 int maxSize = 100; 93 int size = 0; 94 int[] orders = new int[maxSize]; 95 96 int order; 97 while ((order = iter.next()) != CollationElementIterator.NULLORDER) { 98 if (size == maxSize) { 99 maxSize *= 2; 100 int[] temp = new int[maxSize]; 101 System.arraycopy(orders, 0, temp, 0, size); 102 orders = temp; 103 } 104 orders[size++] = order; 105 } 106 107 if (maxSize > size) { 108 int[] temp = new int[size]; 109 System.arraycopy(orders, 0, temp, 0, size); 110 orders = temp; 111 } 112 return orders; 113 } 114 115 static void backAndForth(TestFmwk test, CollationElementIterator iter) 116 { 117 // Run through the iterator forwards and stick it into an array 118 iter.reset(); 119 int[] orders = getOrders(iter); 120 121 // Now go through it backwards and make sure we get the same values 122 int index = orders.length; 123 int o; 124 125 // reset the iterator 126 iter.reset(); 127 128 while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { 129 if (o != orders[--index]) { 130 if (o == 0) { 131 index ++; 132 } else { 133 while (index > 0 && orders[index] == 0) { 134 index --; 135 } 136 if (o != orders[index]) { 137 test.errln("Mismatch at index " + index + ": 0x" 138 + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o)); 139 break; 140 } 141 } 142 } 143 } 144 145 while (index != 0 && orders[index - 1] == 0) { 146 index --; 147 } 148 149 if (index != 0) { 150 String msg = "Didn't get back to beginning - index is "; 151 test.errln(msg + index); 152 153 iter.reset(); 154 test.err("next: "); 155 while ((o = iter.next()) != CollationElementIterator.NULLORDER) { 156 String hexString = "0x" + Utility.hex(o) + " "; 157 test.err(hexString); 158 } 159 test.errln(""); 160 test.err("prev: "); 161 while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { 162 String hexString = "0x" + Utility.hex(o) + " "; 163 test.err(hexString); 164 } 165 test.errln(""); 166 } 167 } 168 169 static final String appendCompareResult(int result, String target){ 170 if (result == -1) { 171 target += "LESS"; 172 } else if (result == 0) { 173 target += "EQUAL"; 174 } else if (result == 1) { 175 target += "GREATER"; 176 } else { 177 String huh = "?"; 178 target += huh + result; 179 } 180 return target; 181 } 182 183 static final String prettify(CollationKey key) { 184 byte[] bytes = key.toByteArray(); 185 return prettify(bytes, bytes.length); 186 } 187 188 static final String prettify(RawCollationKey key) { 189 return prettify(key.bytes, key.size); 190 } 191 192 static final String prettify(byte[] skBytes, int length) { 193 StringBuilder target = new StringBuilder(length * 3 + 2).append('['); 194 195 for (int i = 0; i < length; i++) { 196 String numStr = Integer.toHexString(skBytes[i] & 0xff); 197 if (numStr.length() < 2) { 198 target.append('0'); 199 } 200 target.append(numStr).append(' '); 201 } 202 target.append(']'); 203 return target.toString(); 204 } 205 206 private static void doTestVariant(TestFmwk test, 207 RuleBasedCollator myCollation, 208 String source, String target, int result) 209 { 210 boolean printInfo = false; 211 int compareResult = myCollation.compare(source, target); 212 if (compareResult != result) { 213 214 // !!! if not mod build, error, else nothing. 215 // warnln if not build, error, else always print warning. 216 // do we need a 'quiet warning?' (err or log). Hmmm, 217 // would it work to have the 'verbose' flag let you 218 // suppress warnings? Are there ever some warnings you 219 // want to suppress, and others you don't? 220 if(!test.isModularBuild()){ 221 test.errln("Comparing \"" + Utility.hex(source) + "\" with \"" 222 + Utility.hex(target) + "\" expected " + result 223 + " but got " + compareResult); 224 }else{ 225 printInfo = true; 226 } 227 } 228 CollationKey ssk = myCollation.getCollationKey(source); 229 CollationKey tsk = myCollation.getCollationKey(target); 230 compareResult = ssk.compareTo(tsk); 231 if (compareResult != result) { 232 233 if(!test.isModularBuild()){ 234 test.errln("Comparing CollationKeys of \"" + Utility.hex(source) 235 + "\" with \"" + Utility.hex(target) 236 + "\" expected " + result + " but got " 237 + compareResult); 238 }else{ 239 printInfo = true; 240 } 241 } 242 RawCollationKey srsk = new RawCollationKey(); 243 myCollation.getRawCollationKey(source, srsk); 244 RawCollationKey trsk = new RawCollationKey(); 245 myCollation.getRawCollationKey(target, trsk); 246 compareResult = ssk.compareTo(tsk); 247 if (compareResult != result) { 248 249 if(!test.isModularBuild()){ 250 test.errln("Comparing RawCollationKeys of \"" 251 + Utility.hex(source) 252 + "\" with \"" + Utility.hex(target) 253 + "\" expected " + result + " but got " 254 + compareResult); 255 }else{ 256 printInfo = true; 257 } 258 } 259 // hmmm, but here we issue a warning 260 // only difference is, one warning or two, and detailed info or not? 261 // hmmm, does seem preferable to omit detail if we know it is due to missing resource data. 262 // well, if we label the errors as warnings, we can let people know the details, but 263 // also know they may be due to missing resource data. basically this code is asserting 264 // that the errors are due to missing resource data, which may or may not be true. 265 if (printInfo) { 266 test.warnln("Could not load locale data skipping."); 267 } 268 } 269 270 public void TestMinMax() { 271 setRootCollator(); 272 RuleBasedCollator rbc = (RuleBasedCollator)coll; 273 274 final String s = "\uFFFE\uFFFF"; 275 long[] ces; 276 277 ces = rbc.internalGetCEs(s); 278 if (ces.length != 2) { 279 errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length); 280 return; 281 } 282 283 long ce = ces[0]; 284 long expected = Collation.makeCE(Collation.MERGE_SEPARATOR_PRIMARY); 285 if (ce != expected) { 286 errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02.."); 287 } 288 289 ce = ces[1]; 290 expected = Collation.makeCE(Collation.MAX_PRIMARY); 291 if (ce != expected) { 292 errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max.."); 293 } 294 } 295 296 public void TestImplicits() { 297 CollationData cd = CollationRoot.getData(); 298 299 // Implicit primary weights should be assigned for the following sets, 300 // and sort in ascending order by set and then code point. 301 // See http://www.unicode.org/reports/tr10/#Implicit_Weights 302 // core Han Unified Ideographs 303 UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&" 304 + "[\\p{Block=CJK_Unified_Ideographs}" 305 + "\\p{Block=CJK_Compatibility_Ideographs}]]"); 306 // all other Unified Han ideographs 307 UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-" 308 + "[\\p{Block=CJK_Unified_Ideographs}" 309 + "\\p{Block=CJK_Compatibility_Ideographs}]]"); 310 311 UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]"); 312 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings. 313 314 // Starting with CLDR 26/ICU 54, the root Han order may instead be 315 // the Unihan radical-stroke order. 316 // The tests should pass either way, so we only test the order of a small set of Han characters 317 // whose radical-stroke order is the same as their code point order. 318 UnicodeSet someHanInCPOrder = new UnicodeSet( 319 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" + 320 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]"); 321 UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder); 322 inOrder.addAll(unassigned).freeze(); 323 324 UnicodeSet[] sets = { coreHan, otherHan, unassigned }; 325 int prev = 0; 326 long prevPrimary = 0; 327 UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0); 328 for (int i = 0; i < sets.length; ++i) { 329 UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]); 330 while (iter.next()) { 331 String s = iter.getString(); 332 int c = s.codePointAt(0); 333 ci.setText(false, s, 0); 334 long ce = ci.nextCE(); 335 long ce2 = ci.nextCE(); 336 if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) { 337 errln("CollationIterator.nextCE(0x" + Utility.hex(c) 338 + ") did not yield exactly one CE"); 339 continue; 340 341 } 342 if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) { 343 errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4) 344 + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8)); 345 continue; 346 } 347 long primary = ce >>> 32; 348 if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) { 349 errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary) 350 + ".. not greater than CE(U+" + Utility.hex(prev) 351 + ")=0x" + Utility.hex(prevPrimary) + ".."); 352 353 } 354 prev = c; 355 prevPrimary = primary; 356 } 357 } 358 } 359 360 // ICU4C: TestNulTerminated / renamed for ICU4J 361 public void TestSubSequence() { 362 CollationData data = CollationRoot.getData(); 363 final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 } 364 365 UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0); 366 UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2); 367 368 for (int i = 0; i < 2; ++i) { 369 long ce1 = ci1.nextCE(); 370 long ce2 = ci2.nextCE(); 371 372 if (ce1 != ce2) { 373 errln("CollationIterator.nextCE(with start position at 0) != " 374 + "nextCE(with start position at 2) at CE " + i); 375 } 376 } 377 } 378 379 380 // ICU4C: TestIllegalUTF8 / not applicable to ICU4J 381 382 383 private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) { 384 for(int c = 0x10000; c < 0x110000;) { 385 int next = c + 0x400; 386 if(src.containsSome(c, next - 1)) { 387 dest.add(UTF16.getLeadSurrogate(c)); 388 } 389 c = next; 390 } 391 } 392 393 public void TestShortFCDData() { 394 UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]"); 395 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates 396 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); 397 398 UnicodeSet lccc = new UnicodeSet(); // actual 399 for (int c = 0; c <= 0xffff; ++c) { 400 if (CollationFCD.hasLccc(c)) { 401 lccc.add(c); 402 } 403 } 404 405 UnicodeSet diff = new UnicodeSet(expectedLccc); 406 diff.removeAll(lccc); 407 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP 408 409 String empty = "[]"; 410 String diffString; 411 412 diffString = diff.toPattern(true); 413 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); 414 415 diff = lccc; 416 diff.removeAll(expectedLccc); 417 diffString = diff.toPattern(true); 418 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString); 419 420 UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]"); 421 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); 422 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); 423 424 UnicodeSet tccc = new UnicodeSet(); // actual 425 for(int c = 0; c <= 0xffff; ++c) { 426 if (CollationFCD.hasTccc(c)) { 427 tccc.add(c); 428 } 429 } 430 431 diff = new UnicodeSet(expectedTccc); 432 diff.removeAll(tccc); 433 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP 434 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString); 435 436 diff = tccc; 437 diff.removeAll(expectedTccc); 438 diffString = diff.toPattern(true); 439 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString); 440 } 441 442 private static class CodePointIterator { 443 int[] cp; 444 int length; 445 int pos; 446 447 CodePointIterator(int[] cp) { 448 this.cp = cp; 449 this.length = cp.length; 450 this.pos = 0; 451 } 452 453 void resetToStart() { 454 pos = 0; 455 } 456 457 int next() { 458 return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP; 459 } 460 461 int previous() { 462 return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP; 463 } 464 465 int getLength() { 466 return length; 467 } 468 469 int getIndex() { 470 return pos; 471 } 472 } 473 474 private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) { 475 // Iterate forward to the limit. 476 for (;;) { 477 int c1 = ci.nextCodePoint(); 478 int c2 = cpi.next(); 479 if (c1 != c2) { 480 errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1) 481 + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex()); 482 return; 483 } 484 if (c1 < 0) { 485 break; 486 } 487 } 488 489 // Iterate backward most of the way. 490 for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) { 491 int c1 = ci.previousCodePoint(); 492 int c2 = cpi.previous(); 493 if (c1 != c2) { 494 errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) + 495 " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 496 return; 497 } 498 } 499 500 // Forward again. 501 for (;;) { 502 int c1 = ci.nextCodePoint(); 503 int c2 = cpi.next(); 504 if (c1 != c2) { 505 errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1) 506 + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 507 return; 508 } 509 if (c1 < 0) { 510 break; 511 } 512 } 513 514 // Iterate backward to the start. 515 for (;;) { 516 int c1 = ci.previousCodePoint(); 517 int c2 = cpi.previous(); 518 if (c1 != c2) { 519 errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1) 520 + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 521 return; 522 } 523 if (c1 < 0) { 524 break; 525 } 526 } 527 } 528 529 public void TestFCD() { 530 CollationData data = CollationRoot.getData(); 531 532 // Input string, not FCD. 533 StringBuilder buf = new StringBuilder(); 534 buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062") 535 .appendCodePoint(0x1D15F) // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216 536 .append("\u0327\u0308") // ccc=202, 230 537 .appendCodePoint(0x1D16D) // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226 538 .appendCodePoint(0x1D15F) 539 .appendCodePoint(0x1D16D) 540 .append("\uac01") 541 .append("\u00e7") // Character with tccc!=0 decomposed together with mis-ordered sequence. 542 .appendCodePoint(0x1D16D).appendCodePoint(0x1D165) 543 .append("\u00e1") // Character with tccc!=0 decomposed together with decomposed sequence. 544 .append("\u0f73\u0f75") // Tibetan composite vowels must be decomposed. 545 .append("\u4e00\u0f81"); 546 String s = buf.toString(); 547 548 // Expected code points. 549 int[] cp = { 550 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, 551 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, 552 0x1D15F, 0x1D16D, 553 0xac01, 554 0x63, 0x327, 0x1D165, 0x1D16D, 555 0x61, 556 0xf71, 0xf71, 0xf72, 0xf74, 0x301, 557 0x4e00, 0xf71, 0xf80 558 }; 559 560 FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0); 561 CodePointIterator cpi = new CodePointIterator(cp); 562 checkFCD("FCDUTF16CollationIterator", u16ci, cpi); 563 564 cpi.resetToStart(); 565 UCharacterIterator iter = UCharacterIterator.getInstance(s); 566 FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0); 567 checkFCD("FCDIterCollationIterator", uici, cpi); 568 } 569 570 private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit, 571 int n, int someLength, int minCount) { 572 573 if (!cw.allocWeights(lowerLimit, upperLimit, n)) { 574 errln("CollationWeights::allocWeights(0x" 575 + Utility.hex(lowerLimit) + ",0x" 576 + Utility.hex(upperLimit) + "," 577 + n + ") = false"); 578 return; 579 } 580 long previous = lowerLimit; 581 int count = 0; // number of weights that have someLength 582 for (int i = 0; i < n; ++i) { 583 long w = cw.nextWeight(); 584 if (w == 0xffffffffL) { 585 errln("CollationWeights::allocWeights(0x" 586 + Utility.hex(lowerLimit) + ",0x" 587 + Utility.hex(upperLimit) + ",0x" 588 + n + ").nextWeight() returns only " 589 + i + " weights"); 590 return; 591 } 592 if (!(previous < w && w < upperLimit)) { 593 errln("CollationWeights::allocWeights(0x" 594 + Utility.hex(lowerLimit) + ",0x" 595 + Utility.hex(upperLimit) + "," 596 + n + ").nextWeight() number " 597 + (i + 1) + " -> 0x" + Utility.hex(w) 598 + " not between " 599 + Utility.hex(previous) + " and " 600 + Utility.hex(upperLimit)); 601 return; 602 } 603 if (CollationWeights.lengthOfWeight(w) == someLength) { 604 ++count; 605 } 606 } 607 if (count < minCount) { 608 errln("CollationWeights::allocWeights(0x" 609 + Utility.hex(lowerLimit) + ",0x" 610 + Utility.hex(upperLimit) + "," 611 + n + ").nextWeight() returns only " 612 + count + " < " + minCount + " weights of length " 613 + someLength); 614 615 } 616 } 617 618 public void TestCollationWeights() { 619 CollationWeights cw = new CollationWeights(); 620 621 // Non-compressible primaries use 254 second bytes 02..FF. 622 logln("CollationWeights.initForPrimary(non-compressible)"); 623 cw.initForPrimary(false); 624 // Expect 1 weight 11 and 254 weights 12xx. 625 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1); 626 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254); 627 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. 628 checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255); 629 // Expect 254 two-byte weights from the ranges 10ff and 11xx. 630 checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254); 631 // Expect 254^2=64516 three-byte weights. 632 // During computation, there should be 3 three-byte ranges 633 // 10ffff, 11xxxx, 120202. 634 // The middle one should be split 64515:1, 635 // and the newly-split-off range and the last ranged lengthened. 636 checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516); 637 // Expect weights 1102 & 1103. 638 checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2); 639 // Expect weights 102102 & 102103. 640 checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); 641 642 // Compressible primaries use 251 second bytes 04..FE. 643 logln("CollationWeights.initForPrimary(compressible)"); 644 cw.initForPrimary(true); 645 // Expect 1 weight 11 and 251 weights 12xx. 646 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1); 647 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251); 648 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. 649 checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252); 650 // Expect weights 1104 & 1105. 651 checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2); 652 // Expect weights 102102 & 102103. 653 checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); 654 655 // Secondary and tertiary weights use only bytes 3 & 4. 656 logln("CollationWeights.initForSecondary()"); 657 cw.initForSecondary(); 658 // Expect weights fbxx and all four fc..ff. 659 checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4); 660 661 logln("CollationWeights.initForTertiary()"); 662 cw.initForTertiary(); 663 // Expect weights 3dxx and both 3e & 3f. 664 checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2); 665 } 666 667 private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) { 668 long p1 = p >>> 24; 669 long p2 = (p >>> 16) & 0xff; 670 long p3 = (p >>> 8) & 0xff; 671 long p4 = p & 0xff; 672 long s1 = s >>> 8; 673 long s2 = s & 0xff; 674 // ctq = Case, Tertiary, Quaternary 675 long c = (ctq & Collation.CASE_MASK) >>> 14; 676 long t = ctq & Collation.ONLY_TERTIARY_MASK; 677 long t1 = t >>> 8; 678 long t2 = t & 0xff; 679 long q = ctq & Collation.QUATERNARY_MASK; 680 // No leading zero bytes. 681 if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { 682 return false; 683 } 684 // No intermediate zero bytes. 685 if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { 686 return false; 687 } 688 if (p2 != 0 && p3 == 0 && p4 != 0) { 689 return false; 690 } 691 // Minimum & maximum lead bytes. 692 if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE) 693 || s1 == Collation.LEVEL_SEPARATOR_BYTE 694 || t1 == Collation.LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { 695 return false; 696 } 697 if (c > 2) { 698 return false; 699 } 700 // The valid byte range for the second primary byte depends on compressibility. 701 if (p2 != 0) { 702 if (data.isCompressibleLeadByte((int)p1)) { 703 if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE 704 || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { 705 return false; 706 } 707 } else { 708 if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) { 709 return false; 710 } 711 } 712 } 713 // Other bytes just need to avoid the level separator. 714 // Trailing zeros are ok. 715 // assert (Collation.LEVEL_SEPARATOR_BYTE == 1); 716 if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE 717 || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) { 718 return false; 719 } 720 // Well-formed CEs. 721 if (p == 0) { 722 if (s == 0) { 723 if (t == 0) { 724 // Completely ignorable CE. 725 // Quaternary CEs are not supported. 726 if (c != 0 || q != 0) { 727 return false; 728 } 729 } else { 730 // Tertiary CE. 731 if (t < re.getTertiaryBoundary() || c != 2) { 732 return false; 733 } 734 } 735 } else { 736 // Secondary CE. 737 if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) { 738 return false; 739 } 740 } 741 } else { 742 // Primary CE. 743 if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) 744 || s >= re.getSecondaryBoundary()) { 745 return false; 746 } 747 if (t == 0 || t >= re.getTertiaryBoundary()) { 748 return false; 749 } 750 } 751 return true; 752 } 753 754 private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) { 755 long p = ce >>> 32; 756 long secTer = ce & 0xffffffffL; 757 return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff); 758 } 759 760 private static class RootElementsIterator { 761 CollationData data; 762 long[] elements; 763 int length; 764 765 long pri; 766 long secTer; 767 int index; 768 769 RootElementsIterator(CollationData root) { 770 data = root; 771 elements = root.rootElements; 772 length = elements.length; 773 pri = 0; 774 secTer = 0; 775 index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX]; 776 } 777 778 boolean next() { 779 if (index >= length) { 780 return false; 781 } 782 long p = elements[index]; 783 if (p == CollationRootElements.PRIMARY_SENTINEL) { 784 return false; 785 } 786 if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) { 787 ++index; 788 secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG; 789 return true; 790 } 791 if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) { 792 // End of a range, enumerate the primaries in the range. 793 int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK; 794 p &= 0xffffff00; 795 if (pri == p) { 796 // Finished the range, return the next CE after it. 797 ++index; 798 return next(); 799 } 800 assert (pri < p); 801 // Return the next primary in this range. 802 boolean isCompressible = data.isCompressiblePrimary(pri); 803 if ((pri & 0xffff) == 0) { 804 pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step); 805 } else { 806 pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step); 807 } 808 return true; 809 } 810 // Simple primary CE. 811 ++index; 812 pri = p; 813 // Does this have an explicit below-common sec/ter unit, 814 // or does it imply a common one? 815 if(index == length) { 816 secTer = Collation.COMMON_SEC_AND_TER_CE; 817 } else { 818 secTer = elements[index]; 819 if((secTer & CollationRootElements.SEC_TER_DELTA_FLAG) == 0) { 820 // No sec/ter delta. 821 secTer = Collation.COMMON_SEC_AND_TER_CE; 822 } else { 823 secTer &= ~CollationRootElements.SEC_TER_DELTA_FLAG; 824 if(secTer > Collation.COMMON_SEC_AND_TER_CE) { 825 // Implied sec/ter. 826 secTer = Collation.COMMON_SEC_AND_TER_CE; 827 } else { 828 // Explicit sec/ter below common/common. 829 ++index; 830 } 831 } 832 } 833 return true; 834 } 835 836 long getPrimary() { 837 return pri; 838 } 839 840 long getSecTer() { 841 return secTer; 842 } 843 } 844 845 public void TestRootElements() { 846 CollationData root = CollationRoot.getData(); 847 848 CollationRootElements rootElements = new CollationRootElements(root.rootElements); 849 RootElementsIterator iter = new RootElementsIterator(root); 850 851 // We check each root CE for validity, 852 // and we also verify that there is a tailoring gap between each two CEs. 853 CollationWeights cw1c = new CollationWeights(); // compressible primary weights 854 CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights 855 CollationWeights cw2 = new CollationWeights(); 856 CollationWeights cw3 = new CollationWeights(); 857 858 cw1c.initForPrimary(true); 859 cw1u.initForPrimary(false); 860 cw2.initForSecondary(); 861 cw3.initForTertiary(); 862 863 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs, 864 // nor the special merge-separator CE for U+FFFE. 865 long prevPri = 0; 866 long prevSec = 0; 867 long prevTer = 0; 868 869 while (iter.next()) { 870 long pri = iter.getPrimary(); 871 long secTer = iter.getSecTer(); 872 // CollationRootElements CEs must have 0 case and quaternary bits. 873 if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) { 874 errln("CollationRootElements CE has non-zero case and/or quaternary bits: " 875 + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 876 } 877 long sec = secTer >>> 16; 878 long ter = secTer & Collation.ONLY_TERTIARY_MASK; 879 long ctq = ter; 880 if (pri == 0 && sec == 0 && ter != 0) { 881 // Tertiary CEs must have uppercase bits, 882 // but they are not stored in the CollationRootElements. 883 ctq |= 0x8000; 884 } 885 if (!isValidCE(rootElements, root, pri, sec, ctq)) { 886 errln("invalid root CE 0x" 887 + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 888 } else { 889 if (pri != prevPri) { 890 long newWeight = 0; 891 if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) { 892 // There is currently no tailoring gap after primary ignorables, 893 // and we forbid tailoring after U+FFFD and U+FFFF. 894 } else if (root.isCompressiblePrimary(prevPri)) { 895 if (!cw1c.allocWeights(prevPri, pri, 1)) { 896 errln("no primary/compressible tailoring gap between " 897 + "0x" + Utility.hex(prevPri, 8) 898 + " and 0x" + Utility.hex(pri, 8)); 899 } else { 900 newWeight = cw1c.nextWeight(); 901 } 902 } else { 903 if (!cw1u.allocWeights(prevPri, pri, 1)) { 904 errln("no primary/uncompressible tailoring gap between " 905 + "0x" + Utility.hex(prevPri, 8) 906 + " and 0x" + Utility.hex(pri, 8)); 907 } else { 908 newWeight = cw1u.nextWeight(); 909 } 910 } 911 if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) { 912 errln("mis-allocated primary weight, should get " 913 + "0x" + Utility.hex(prevPri, 8) 914 + " < 0x" + Utility.hex(newWeight, 8) 915 + " < 0x" + Utility.hex(pri, 8)); 916 } 917 } else if (sec != prevSec) { 918 long lowerLimit = prevSec == 0 ? 919 rootElements.getSecondaryBoundary() - 0x100 : prevSec; 920 if (!cw2.allocWeights(lowerLimit, sec, 1)) { 921 errln("no secondary tailoring gap between " 922 + "0x" + Utility.hex(lowerLimit) 923 + " and 0x" + Utility.hex(sec)); 924 } else { 925 long newWeight = cw2.nextWeight(); 926 if (!(prevSec < newWeight && newWeight < sec)) { 927 errln("mis-allocated secondary weight, should get " 928 + "0x" + Utility.hex(lowerLimit) 929 + " < 0x" + Utility.hex(newWeight) 930 + " < 0x" + Utility.hex(sec)); 931 } 932 } 933 } else if (ter != prevTer) { 934 long lowerLimit = prevTer == 0 ? 935 rootElements.getTertiaryBoundary() - 0x100 : prevTer; 936 if (!cw3.allocWeights(lowerLimit, ter, 1)) { 937 errln("no tertiary tailoring gap between " 938 + "0x" + Utility.hex(lowerLimit) 939 + " and 0x" + Utility.hex(ter)); 940 } else { 941 long newWeight = cw3.nextWeight(); 942 if (!(prevTer < newWeight && newWeight < ter)) { 943 errln("mis-allocated tertiary weight, should get " 944 + "0x" + Utility.hex(lowerLimit) 945 + " < 0x" + Utility.hex(newWeight) 946 + " < 0x" + Utility.hex(ter)); 947 } 948 } 949 } else { 950 errln("duplicate root CE 0x" 951 + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 952 } 953 } 954 prevPri = pri; 955 prevSec = sec; 956 prevTer = ter; 957 } 958 } 959 960 public void TestTailoredElements() { 961 CollationData root = CollationRoot.getData(); 962 CollationRootElements rootElements = new CollationRootElements(root.rootElements); 963 964 Set<String> prevLocales = new HashSet<String>(); 965 prevLocales.add(""); 966 prevLocales.add("root"); 967 prevLocales.add("root@collation=standard"); 968 969 long[] ces; 970 ULocale[] locales = Collator.getAvailableULocales(); 971 String localeID = "root"; 972 int locIdx = 0; 973 974 for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) { 975 ULocale locale = new ULocale(localeID); 976 String[] types = Collator.getKeywordValuesForLocale("collation", locale, false); 977 for (int typeIdx = 0; typeIdx < types.length; ++typeIdx) { 978 String type = types[typeIdx]; // first: default type 979 if (type.startsWith("private-")) { 980 errln("Collator.getKeywordValuesForLocale(" + localeID + 981 ") returns private collation keyword: " + type); 982 } 983 ULocale localeWithType = locale.setKeywordValue("collation", type); 984 Collator coll = Collator.getInstance(localeWithType); 985 ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE); 986 if (prevLocales.contains(actual.getName())) { 987 continue; 988 } 989 prevLocales.add(actual.getName()); 990 logln("TestTailoredElements(): requested " + localeWithType.getName() 991 + " -> actual " + actual.getName()); 992 if (!(coll instanceof RuleBasedCollator)) { 993 continue; 994 } 995 RuleBasedCollator rbc = (RuleBasedCollator) coll; 996 997 // Note: It would be better to get tailored strings such that we can 998 // identify the prefix, and only get the CEs for the prefix+string, 999 // not also for the prefix. 1000 // There is currently no API for that. 1001 // It would help in an unusual case where a contraction starting in the prefix 1002 // extends past its end, and we do not see the intended mapping. 1003 // For example, for a mapping p|st, if there is also a contraction ps, 1004 // then we get CEs(ps)+CEs(t), rather than CEs(p|st). 1005 UnicodeSet tailored = coll.getTailoredSet(); 1006 UnicodeSetIterator iter = new UnicodeSetIterator(tailored); 1007 while (iter.next()) { 1008 String s = iter.getString(); 1009 ces = rbc.internalGetCEs(s); 1010 for (int i = 0; i < ces.length; ++i) { 1011 long ce = ces[i]; 1012 if (!isValidCE(rootElements, root, ce)) { 1013 logln(prettify(s)); 1014 errln("invalid tailored CE 0x" + Utility.hex(ce, 16) 1015 + " at CE index " + i + " from string:"); 1016 } 1017 } 1018 } 1019 } 1020 } 1021 } 1022 1023 private static boolean isSpace(char c) { 1024 return (c == 0x09 || c == 0x20 || c == 0x3000); 1025 } 1026 1027 private static boolean isSectionStarter(char c) { 1028 return (c == '%' || c == '*' || c == '@'); 1029 } 1030 1031 private int skipSpaces(int i) { 1032 while (isSpace(fileLine.charAt(i))) { 1033 ++i; 1034 } 1035 return i; 1036 } 1037 1038 private String printSortKey(byte[] p) { 1039 StringBuilder s = new StringBuilder(); 1040 for (int i = 0; i < p.length; ++i) { 1041 if (i > 0) { 1042 s.append(' '); 1043 } 1044 byte b = p[i]; 1045 if (b == 0) { 1046 s.append('.'); 1047 } else if (b == 1) { 1048 s.append('|'); 1049 } else { 1050 s.append(String.format("%02x", b & 0xff)); 1051 } 1052 } 1053 return s.toString(); 1054 } 1055 1056 private String printCollationKey(CollationKey key) { 1057 byte[] p = key.toByteArray(); 1058 return printSortKey(p); 1059 } 1060 1061 private boolean readNonEmptyLine(BufferedReader in) throws IOException { 1062 for (;;) { 1063 String line = in.readLine(); 1064 if (line == null) { 1065 fileLine = null; 1066 return false; 1067 } 1068 if (fileLineNumber == 0 && line.length() != 0 && line.charAt(0) == '\uFEFF') { 1069 line = line.substring(1); // Remove the BOM. 1070 } 1071 ++fileLineNumber; 1072 // Strip trailing comments and spaces 1073 int idx = line.indexOf('#'); 1074 if (idx < 0) { 1075 idx = line.length(); 1076 } 1077 while (idx > 0 && isSpace(line.charAt(idx - 1))) { 1078 --idx; 1079 } 1080 if (idx != 0) { 1081 fileLine = idx < line.length() ? line.substring(0, idx) : line; 1082 return true; 1083 } 1084 // Empty line, continue. 1085 } 1086 } 1087 1088 private int parseString(int start, Output<String> prefix, Output<String> s) throws ParseException { 1089 int length = fileLine.length(); 1090 int i; 1091 for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) { 1092 } 1093 int pipeIndex = fileLine.indexOf('|', start); 1094 if (pipeIndex >= 0 && pipeIndex < i) { 1095 String tmpPrefix = Utility.unescape(fileLine.substring(start, pipeIndex)); 1096 if (tmpPrefix.length() == 0) { 1097 prefix.value = null; 1098 logln(fileLine); 1099 throw new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber); 1100 } 1101 prefix.value = tmpPrefix; 1102 start = pipeIndex + 1; 1103 } else { 1104 prefix.value = null; 1105 } 1106 1107 String tmp = Utility.unescape(fileLine.substring(start, i)); 1108 if (tmp.length() == 0) { 1109 s.value = null; 1110 logln(fileLine); 1111 throw new ParseException("empty string on line " + fileLineNumber, fileLineNumber); 1112 } 1113 s.value = tmp; 1114 return i; 1115 } 1116 1117 private int parseRelationAndString(Output<String> s) throws ParseException { 1118 int relation = Collation.NO_LEVEL; 1119 int start; 1120 if (fileLine.charAt(0) == '<') { 1121 char second = fileLine.charAt(1); 1122 start = 2; 1123 switch(second) { 1124 case 0x31: // <1 1125 relation = Collation.PRIMARY_LEVEL; 1126 break; 1127 case 0x32: // <2 1128 relation = Collation.SECONDARY_LEVEL; 1129 break; 1130 case 0x33: // <3 1131 relation = Collation.TERTIARY_LEVEL; 1132 break; 1133 case 0x34: // <4 1134 relation = Collation.QUATERNARY_LEVEL; 1135 break; 1136 case 0x63: // <c 1137 relation = Collation.CASE_LEVEL; 1138 break; 1139 case 0x69: // <i 1140 relation = Collation.IDENTICAL_LEVEL; 1141 break; 1142 default: // just < 1143 relation = Collation.NO_LEVEL; 1144 start = 1; 1145 break; 1146 } 1147 } else if (fileLine.charAt(0) == '=') { 1148 relation = Collation.ZERO_LEVEL; 1149 start = 1; 1150 } else { 1151 start = 0; 1152 } 1153 1154 if (start == 0 || !isSpace(fileLine.charAt(start))) { 1155 logln(fileLine); 1156 throw new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line " 1157 + fileLineNumber, fileLineNumber); 1158 } 1159 1160 start = skipSpaces(start); 1161 Output<String> prefixOut = new Output<String>(); 1162 start = parseString(start, prefixOut, s); 1163 if (prefixOut.value != null) { 1164 logln(fileLine); 1165 throw new ParseException("prefix string not allowed for test string: on line " 1166 + fileLineNumber, fileLineNumber); 1167 } 1168 if (start < fileLine.length()) { 1169 logln(fileLine); 1170 throw new ParseException("unexpected line contents after test string on line " 1171 + fileLineNumber, fileLineNumber); 1172 } 1173 1174 return relation; 1175 } 1176 1177 private void parseAndSetAttribute() throws ParseException { 1178 // Parse attributes even if the Collator could not be created, 1179 // in order to report syntax errors. 1180 int start = skipSpaces(1); 1181 int equalPos = fileLine.indexOf('='); 1182 if (equalPos < 0) { 1183 if (fileLine.regionMatches(start, "reorder", 0, 7)) { 1184 parseAndSetReorderCodes(start + 7); 1185 return; 1186 } 1187 logln(fileLine); 1188 throw new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber); 1189 } 1190 1191 String attrString = fileLine.substring(start, equalPos); 1192 String valueString = fileLine.substring(equalPos + 1); 1193 if (attrString.equals("maxVariable")) { 1194 int max; 1195 if (valueString.equals("space")) { 1196 max = ReorderCodes.SPACE; 1197 } else if(valueString.equals("punct")) { 1198 max = ReorderCodes.PUNCTUATION; 1199 } else if(valueString.equals("symbol")) { 1200 max = ReorderCodes.SYMBOL; 1201 } else if(valueString.equals("currency")) { 1202 max = ReorderCodes.CURRENCY; 1203 } else { 1204 logln(fileLine); 1205 throw new ParseException("invalid attribute value name on line " 1206 + fileLineNumber, fileLineNumber); 1207 } 1208 if (coll != null) { 1209 coll.setMaxVariable(max); 1210 } 1211 fileLine = null; 1212 return; 1213 } 1214 1215 boolean parsed = true; 1216 RuleBasedCollator rbc = (RuleBasedCollator)coll; 1217 if (attrString.equals("backwards")) { 1218 if (valueString.equals("on")) { 1219 if (rbc != null) rbc.setFrenchCollation(true); 1220 } else if (valueString.equals("off")) { 1221 if (rbc != null) rbc.setFrenchCollation(false); 1222 } else if (valueString.equals("default")) { 1223 if (rbc != null) rbc.setFrenchCollationDefault(); 1224 } else { 1225 parsed = false; 1226 } 1227 } else if (attrString.equals("alternate")) { 1228 if (valueString.equals("non-ignorable")) { 1229 if (rbc != null) rbc.setAlternateHandlingShifted(false); 1230 } else if (valueString.equals("shifted")) { 1231 if (rbc != null) rbc.setAlternateHandlingShifted(true); 1232 } else if (valueString.equals("default")) { 1233 if (rbc != null) rbc.setAlternateHandlingDefault(); 1234 } else { 1235 parsed = false; 1236 } 1237 } else if (attrString.equals("caseFirst")) { 1238 if (valueString.equals("upper")) { 1239 if (rbc != null) rbc.setUpperCaseFirst(true); 1240 } else if (valueString.equals("lower")) { 1241 if (rbc != null) rbc.setLowerCaseFirst(true); 1242 } else if (valueString.equals("default")) { 1243 if (rbc != null) rbc.setCaseFirstDefault(); 1244 } else { 1245 parsed = false; 1246 } 1247 } else if (attrString.equals("caseLevel")) { 1248 if (valueString.equals("on")) { 1249 if (rbc != null) rbc.setCaseLevel(true); 1250 } else if (valueString.equals("off")) { 1251 if (rbc != null) rbc.setCaseLevel(false); 1252 } else if (valueString.equals("default")) { 1253 if (rbc != null) rbc.setCaseLevelDefault(); 1254 } else { 1255 parsed = false; 1256 } 1257 } else if (attrString.equals("strength")) { 1258 if (valueString.equals("primary")) { 1259 if (rbc != null) rbc.setStrength(Collator.PRIMARY); 1260 } else if (valueString.equals("secondary")) { 1261 if (rbc != null) rbc.setStrength(Collator.SECONDARY); 1262 } else if (valueString.equals("tertiary")) { 1263 if (rbc != null) rbc.setStrength(Collator.TERTIARY); 1264 } else if (valueString.equals("quaternary")) { 1265 if (rbc != null) rbc.setStrength(Collator.QUATERNARY); 1266 } else if (valueString.equals("identical")) { 1267 if (rbc != null) rbc.setStrength(Collator.IDENTICAL); 1268 } else if (valueString.equals("default")) { 1269 if (rbc != null) rbc.setStrengthDefault(); 1270 } else { 1271 parsed = false; 1272 } 1273 } else if (attrString.equals("numeric")) { 1274 if (valueString.equals("on")) { 1275 if (rbc != null) rbc.setNumericCollation(true); 1276 } else if (valueString.equals("off")) { 1277 if (rbc != null) rbc.setNumericCollation(false); 1278 } else if (valueString.equals("default")) { 1279 if (rbc != null) rbc.setNumericCollationDefault(); 1280 } else { 1281 parsed = false; 1282 } 1283 } else { 1284 logln(fileLine); 1285 throw new ParseException("invalid attribute name on line " 1286 + fileLineNumber, fileLineNumber); 1287 } 1288 if (!parsed) { 1289 logln(fileLine); 1290 throw new ParseException( 1291 "invalid attribute value name or attribute=value combination on line " 1292 + fileLineNumber, fileLineNumber); 1293 } 1294 1295 fileLine = null; 1296 } 1297 1298 private void parseAndSetReorderCodes(int start) throws ParseException { 1299 UVector32 reorderCodes = new UVector32(); 1300 while (start < fileLine.length()) { 1301 start = skipSpaces(start); 1302 int limit = start; 1303 while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) { 1304 ++limit; 1305 } 1306 String name = fileLine.substring(start, limit); 1307 int code = CollationRuleParser.getReorderCode(name); 1308 if (code < -1) { 1309 if (name.equalsIgnoreCase("default")) { 1310 code = ReorderCodes.DEFAULT; // -1 1311 } else { 1312 logln(fileLine); 1313 throw new ParseException("invalid reorder code '" + name + "' on line " 1314 + fileLineNumber, fileLineNumber); 1315 } 1316 } 1317 reorderCodes.addElement(code); 1318 start = limit; 1319 } 1320 if (coll != null) { 1321 int[] reorderCodesArray = new int[reorderCodes.size()]; 1322 System.arraycopy(reorderCodes.getBuffer(), 0, 1323 reorderCodesArray, 0, reorderCodes.size()); 1324 coll.setReorderCodes(reorderCodesArray); 1325 } 1326 1327 fileLine = null; 1328 } 1329 1330 private void buildTailoring(BufferedReader in) throws IOException { 1331 StringBuilder rules = new StringBuilder(); 1332 while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) { 1333 rules.append(Utility.unescape(fileLine)); 1334 } 1335 1336 try { 1337 coll = new RuleBasedCollator(rules.toString()); 1338 } catch (Exception e) { 1339 logln(rules.toString()); 1340 // Android patch: Add --omitCollationRules to genrb. 1341 warnln("RuleBasedCollator(rules) failed - " + e.getMessage()); 1342 // Android patch end. 1343 coll = null; 1344 } 1345 } 1346 1347 private void setRootCollator() { 1348 coll = Collator.getInstance(ULocale.ROOT); 1349 } 1350 1351 private void setLocaleCollator() { 1352 coll = null; 1353 ULocale locale = null; 1354 if (fileLine.length() > 9) { 1355 String localeID = fileLine.substring(9); // "@ locale <langTag>" 1356 try { 1357 locale = new ULocale(localeID); // either locale ID or language tag 1358 } catch (IllformedLocaleException e) { 1359 locale = null; 1360 } 1361 } 1362 if (locale == null) { 1363 logln(fileLine); 1364 errln("invalid language tag on line " + fileLineNumber); 1365 return; 1366 } 1367 1368 logln("creating a collator for locale ID " + locale.getName()); 1369 try { 1370 coll = Collator.getInstance(locale); 1371 } catch (Exception e) { 1372 errln("unable to create a collator for locale " + locale + 1373 " on line " + fileLineNumber + " - " + e); 1374 } 1375 } 1376 1377 private boolean needsNormalization(String s) { 1378 if (!fcd.isNormalized(s)) { 1379 return true; 1380 } 1381 // In some sequences with Tibetan composite vowel signs, 1382 // even if the string passes the FCD check, 1383 // those composites must be decomposed. 1384 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. 1385 int index = 0; 1386 while((index = s.indexOf(0xf71, index)) >= 0) { 1387 if (++index < s.length()) { 1388 char c = s.charAt(index); 1389 if (c == 0xf73 || c == 0xf75 || c == 0xf81) { 1390 return true; 1391 } 1392 } 1393 } 1394 return false; 1395 } 1396 1397 private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) { 1398 CollationKey key = coll.getCollationKey(s); 1399 keyOut.value = key; 1400 1401 byte[] keyBytes = key.toByteArray(); 1402 if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) { 1403 logln(fileTestName); 1404 logln(line); 1405 logln(printCollationKey(key)); 1406 errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key"); 1407 return false; 1408 } 1409 1410 int numLevels = coll.getStrength(); 1411 if (numLevels < Collator.IDENTICAL) { 1412 ++numLevels; 1413 } else { 1414 numLevels = 5; 1415 } 1416 if (((RuleBasedCollator)coll).isCaseLevel()) { 1417 ++numLevels; 1418 } 1419 int numLevelSeparators = 0; 1420 for (int i = 0; i < (keyBytes.length - 1); ++i) { 1421 byte b = keyBytes[i]; 1422 if (b == 0) { 1423 logln(fileTestName); 1424 logln(line); 1425 logln(printCollationKey(key)); 1426 errln("Collator(" + norm + ").getCollationKey() contains a 00 byte"); 1427 return false; 1428 } 1429 if (b == 1) { 1430 ++numLevelSeparators; 1431 } 1432 } 1433 if (numLevelSeparators != (numLevels - 1)) { 1434 logln(fileTestName); 1435 logln(line); 1436 logln(printCollationKey(key)); 1437 errln("Collator(" + norm + ").getCollationKey() has " 1438 + numLevelSeparators + " level separators for " 1439 + numLevels + " levels"); 1440 return false; 1441 } 1442 1443 // No nextSortKeyPart support in ICU4J 1444 1445 return true; 1446 } 1447 1448 /** 1449 * Changes the key to the merged segments of the U+FFFE-separated substrings of s. 1450 * Leaves key unchanged if s does not contain U+FFFE. 1451 * @return true if the key was successfully changed 1452 */ 1453 private boolean getMergedCollationKey(String s, Output<CollationKey> key) { 1454 CollationKey mergedKey = null; 1455 int sLength = s.length(); 1456 int segmentStart = 0; 1457 for (int i = 0;;) { 1458 if (i == sLength) { 1459 if (segmentStart == 0) { 1460 // s does not contain any U+FFFE. 1461 return false; 1462 } 1463 } else if (s.charAt(i) != '\uFFFE') { 1464 ++i; 1465 continue; 1466 } 1467 // Get the sort key for another segment and merge it into mergedKey. 1468 CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i)); 1469 if (mergedKey == null) { 1470 mergedKey = tmpKey; 1471 } else { 1472 mergedKey = mergedKey.merge(tmpKey); 1473 } 1474 if (i == sLength) { 1475 break; 1476 } 1477 segmentStart = ++i; 1478 } 1479 key.value = mergedKey; 1480 return true; 1481 } 1482 1483 private static int getDifferenceLevel(CollationKey prevKey, CollationKey key, 1484 int order, boolean collHasCaseLevel) { 1485 if (order == Collation.EQUAL) { 1486 return Collation.NO_LEVEL; 1487 } 1488 byte[] prevBytes = prevKey.toByteArray(); 1489 byte[] bytes = key.toByteArray(); 1490 int level = Collation.PRIMARY_LEVEL; 1491 for (int i = 0;; ++i) { 1492 byte b = prevBytes[i]; 1493 if (b != bytes[i]) { 1494 break; 1495 } 1496 if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) { 1497 ++level; 1498 if (level == Collation.CASE_LEVEL && !collHasCaseLevel) { 1499 ++level; 1500 } 1501 } 1502 } 1503 return level; 1504 } 1505 1506 private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s, 1507 int expectedOrder, int expectedLevel) { 1508 // Get the sort keys first, for error debug output. 1509 Output<CollationKey> prevKeyOut = new Output<CollationKey>(); 1510 CollationKey prevKey; 1511 if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) { 1512 return false; 1513 } 1514 prevKey = prevKeyOut.value; 1515 1516 Output<CollationKey> keyOut = new Output<CollationKey>(); 1517 CollationKey key; 1518 if (!getCollationKey(norm, fileLine, s, keyOut)) { 1519 return false; 1520 } 1521 key = keyOut.value; 1522 1523 int order = coll.compare(prevString, s); 1524 if (order != expectedOrder) { 1525 logln(fileTestName); 1526 logln(prevFileLine); 1527 logln(fileLine); 1528 logln(printCollationKey(prevKey)); 1529 logln(printCollationKey(key)); 1530 errln("line " + fileLineNumber 1531 + " Collator(" + norm + ").compare(previous, current) wrong order: " 1532 + order + " != " + expectedOrder); 1533 return false; 1534 } 1535 order = coll.compare(s, prevString); 1536 if (order != -expectedOrder) { 1537 logln(fileTestName); 1538 logln(prevFileLine); 1539 logln(fileLine); 1540 logln(printCollationKey(prevKey)); 1541 logln(printCollationKey(key)); 1542 errln("line " + fileLineNumber 1543 + " Collator(" + norm + ").compare(current, previous) wrong order: " 1544 + order + " != " + -expectedOrder); 1545 return false; 1546 } 1547 1548 order = prevKey.compareTo(key); 1549 if (order != expectedOrder) { 1550 logln(fileTestName); 1551 logln(prevFileLine); 1552 logln(fileLine); 1553 logln(printCollationKey(prevKey)); 1554 logln(printCollationKey(key)); 1555 errln("line " + fileLineNumber 1556 + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: " 1557 + order + " != " + expectedOrder); 1558 return false; 1559 } 1560 boolean collHasCaseLevel = ((RuleBasedCollator)coll).isCaseLevel(); 1561 int level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); 1562 if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { 1563 if (level != expectedLevel) { 1564 logln(fileTestName); 1565 logln(prevFileLine); 1566 logln(fileLine); 1567 logln(printCollationKey(prevKey)); 1568 logln(printCollationKey(key)); 1569 errln("line " + fileLineNumber 1570 + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()=" 1571 + order + " wrong level: " + level + " != " + expectedLevel); 1572 return false; 1573 } 1574 } 1575 1576 // If either string contains U+FFFE, then their sort keys must compare the same as 1577 // the merged sort keys of each string's between-FFFE segments. 1578 // 1579 // It is not required that 1580 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2)) 1581 // only that those two methods yield the same order. 1582 // 1583 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings. 1584 Output<CollationKey> outPrevKey = new Output<CollationKey>(prevKey); 1585 Output<CollationKey> outKey = new Output<CollationKey>(key); 1586 if (getMergedCollationKey(prevString, outPrevKey) | getMergedCollationKey(s, outKey)) { 1587 prevKey = outPrevKey.value; 1588 key = outKey.value; 1589 order = prevKey.compareTo(key); 1590 if (order != expectedOrder) { 1591 logln(fileTestName); 1592 errln("line " + fileLineNumber 1593 + " Collator(" + norm + ").getCollationKey" 1594 + "(previous, current segments between U+FFFE)).merge().compareTo() wrong order: " 1595 + order + " != " + expectedOrder); 1596 logln(prevFileLine); 1597 logln(fileLine); 1598 logln(printCollationKey(prevKey)); 1599 logln(printCollationKey(key)); 1600 return false; 1601 } 1602 int mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); 1603 if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { 1604 if(mergedLevel != level) { 1605 logln(fileTestName); 1606 errln("line " + fileLineNumber 1607 + " Collator(" + norm + ").getCollationKey" 1608 + "(previous, current segments between U+FFFE)).merge().compareTo()=" 1609 + order + " wrong level: " + mergedLevel + " != " + level); 1610 logln(prevFileLine); 1611 logln(fileLine); 1612 logln(printCollationKey(prevKey)); 1613 logln(printCollationKey(key)); 1614 return false; 1615 } 1616 } 1617 } 1618 return true; 1619 } 1620 1621 private void checkCompareStrings(BufferedReader in) throws IOException { 1622 String prevFileLine = "(none)"; 1623 String prevString = ""; 1624 Output<String> sOut = new Output<String>(); 1625 while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) { 1626 // Parse the line even if it will be ignored (when we do not have a Collator) 1627 // in order to report syntax issues. 1628 int relation; 1629 try { 1630 relation = parseRelationAndString(sOut); 1631 } catch (ParseException pe) { 1632 errln(pe.toString()); 1633 break; 1634 } 1635 if(coll == null) { 1636 // We were unable to create the Collator but continue with tests. 1637 // Ignore test data for this Collator. 1638 // The next Collator creation might work. 1639 continue; 1640 } 1641 String s = sOut.value; 1642 int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS; 1643 int expectedLevel = relation; 1644 boolean isOk = true; 1645 if (!needsNormalization(prevString) && !needsNormalization(s)) { 1646 coll.setDecomposition(Collator.NO_DECOMPOSITION); 1647 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s, 1648 expectedOrder, expectedLevel); 1649 } 1650 if (isOk) { 1651 coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION); 1652 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s, 1653 expectedOrder, expectedLevel); 1654 } 1655 if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) { 1656 String pn = nfd.normalize(prevString); 1657 String n = nfd.normalize(s); 1658 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, 1659 expectedOrder, expectedLevel); 1660 } 1661 prevFileLine = fileLine; 1662 prevString = s; 1663 } 1664 } 1665 1666 public void TestDataDriven() { 1667 nfd = Normalizer2.getNFDInstance(); 1668 fcd = Norm2AllModes.getFCDNormalizer2(); 1669 1670 BufferedReader in = null; 1671 1672 try { 1673 in = TestUtil.getDataReader("collationtest.txt", "UTF-8"); 1674 1675 // Read a new line if necessary. 1676 // Sub-parsers leave the first line set that they do not handle. 1677 while (fileLine != null || readNonEmptyLine(in)) { 1678 if (!isSectionStarter(fileLine.charAt(0))) { 1679 logln(fileLine); 1680 errln("syntax error on line " + fileLineNumber); 1681 return; 1682 } 1683 if (fileLine.startsWith("** test: ")) { 1684 fileTestName = fileLine; 1685 logln(fileLine); 1686 fileLine = null; 1687 } else if (fileLine.equals("@ root")) { 1688 setRootCollator(); 1689 fileLine = null; 1690 } else if (fileLine.startsWith("@ locale ")) { 1691 setLocaleCollator(); 1692 fileLine = null; 1693 } else if (fileLine.equals("@ rules")) { 1694 buildTailoring(in); 1695 } else if (fileLine.charAt(0) == '%' 1696 && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) { 1697 parseAndSetAttribute(); 1698 } else if (fileLine.equals("* compare")) { 1699 checkCompareStrings(in); 1700 } else { 1701 logln(fileLine); 1702 errln("syntax error on line " + fileLineNumber); 1703 return; 1704 } 1705 } 1706 } catch (ParseException pe) { 1707 errln(pe.toString()); 1708 } catch (IOException e) { 1709 errln(e.getMessage()); 1710 } finally { 1711 try { 1712 if (in != null) { 1713 in.close(); 1714 } 1715 } catch (IOException e) { 1716 e.printStackTrace(); 1717 } 1718 } 1719 } 1720 } 1721