1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.dev.test.lang; 8 9 import java.text.NumberFormat; 10 import java.text.ParsePosition; 11 import java.util.ArrayList; 12 import java.util.Arrays; 13 import java.util.Collection; 14 import java.util.Comparator; 15 import java.util.HashMap; 16 import java.util.HashSet; 17 import java.util.Iterator; 18 import java.util.LinkedHashSet; 19 import java.util.List; 20 import java.util.Set; 21 import java.util.SortedSet; 22 import java.util.TreeSet; 23 24 import com.ibm.icu.dev.test.TestFmwk; 25 import com.ibm.icu.dev.util.CollectionUtilities; 26 import com.ibm.icu.impl.SortedSetRelation; 27 import com.ibm.icu.impl.Utility; 28 import com.ibm.icu.lang.UCharacter; 29 import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; 30 import com.ibm.icu.lang.UProperty; 31 import com.ibm.icu.lang.UScript; 32 import com.ibm.icu.text.SymbolTable; 33 import com.ibm.icu.text.UTF16; 34 import com.ibm.icu.text.UnicodeMatcher; 35 import com.ibm.icu.text.UnicodeSet; 36 import com.ibm.icu.text.UnicodeSet.ComparisonStyle; 37 import com.ibm.icu.text.UnicodeSet.EntryRange; 38 import com.ibm.icu.text.UnicodeSet.SpanCondition; 39 import com.ibm.icu.text.UnicodeSetIterator; 40 import com.ibm.icu.text.UnicodeSetSpanner; 41 import com.ibm.icu.text.UnicodeSetSpanner.CountMethod; 42 import com.ibm.icu.text.UnicodeSetSpanner.TrimOption; 43 import com.ibm.icu.util.OutputInt; 44 45 /** 46 * @test 47 * @summary General test of UnicodeSet 48 */ 49 public class UnicodeSetTest extends TestFmwk { 50 51 static final String NOT = "%%%%"; 52 53 public static void main(String[] args) throws Exception { 54 new UnicodeSetTest().run(args); 55 } 56 57 private static final boolean isCccValue(int ccc) { 58 switch (ccc) { 59 case 0: 60 case 1: 61 case 7: 62 case 8: 63 case 9: 64 case 200: 65 case 202: 66 case 216: 67 case 218: 68 case 220: 69 case 222: 70 case 224: 71 case 226: 72 case 228: 73 case 230: 74 case 232: 75 case 233: 76 case 234: 77 case 240: 78 return true; 79 default: 80 return false; 81 } 82 } 83 84 public void TestPropertyAccess() { 85 int count = 0; 86 // test to see that all of the names work 87 for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) { 88 count++; 89 //Skipping tests in the non-exhaustive mode to shorten the test time ticket#6475 90 if(getInclusion()<=5 && count%5!=0){ 91 continue; 92 } 93 if (propNum >= UProperty.BINARY_LIMIT && propNum < UProperty.INT_START) { // skip the gap 94 propNum = UProperty.INT_START; 95 } 96 for (int nameChoice = UProperty.NameChoice.SHORT; nameChoice <= UProperty.NameChoice.LONG; ++nameChoice) { 97 String propName; 98 try { 99 propName = UCharacter.getPropertyName(propNum, nameChoice); 100 if (propName == null) { 101 if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names 102 throw new NullPointerException(); 103 } 104 } catch (RuntimeException e1) { 105 errln("Can't get property name for: " 106 + "Property (" + propNum + ")" 107 + ", NameChoice: " + nameChoice + ", " 108 + e1.getClass().getName()); 109 continue; 110 } 111 logln("Property (" + propNum + "): " + propName); 112 for (int valueNum = UCharacter.getIntPropertyMinValue(propNum); valueNum <= UCharacter.getIntPropertyMaxValue(propNum); ++valueNum) { 113 String valueName; 114 try { 115 valueName = UCharacter.getPropertyValueName(propNum, valueNum, nameChoice); 116 if (valueName == null) { 117 if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names 118 if ((propNum == UProperty.CANONICAL_COMBINING_CLASS || 119 propNum == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 120 propNum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) && 121 !isCccValue(valueNum)) { 122 // Only a few of the canonical combining classes have names. 123 // Otherwise they are just integer values. 124 continue; 125 } else { 126 throw new NullPointerException(); 127 } 128 } 129 } catch (RuntimeException e1) { 130 errln("Can't get property value name for: " 131 + "Property (" + propNum + "): " + propName + ", " 132 + "Value (" + valueNum + ") " 133 + ", NameChoice: " + nameChoice + ", " 134 + e1.getClass().getName()); 135 continue; 136 } 137 logln("Value (" + valueNum + "): " + valueName); 138 UnicodeSet testSet; 139 try { 140 testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]"); 141 } catch (RuntimeException e) { 142 errln("Can't create UnicodeSet for: " 143 + "Property (" + propNum + "): " + propName + ", " 144 + "Value (" + valueNum + "): " + valueName + ", " 145 + e.getClass().getName()); 146 continue; 147 } 148 UnicodeSet collectedErrors = new UnicodeSet(); 149 for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) { 150 int value = UCharacter.getIntPropertyValue(it.codepoint, propNum); 151 if (value != valueNum) { 152 collectedErrors.add(it.codepoint); 153 } 154 } 155 if (collectedErrors.size() != 0) { 156 errln("Property Value Differs: " 157 + "Property (" + propNum + "): " + propName + ", " 158 + "Value (" + valueNum + "): " + valueName + ", " 159 + "Differing values: " + collectedErrors.toPattern(true)); 160 } 161 } 162 } 163 } 164 } 165 166 167 /** 168 * Test toPattern(). 169 */ 170 public void TestToPattern() throws Exception { 171 // Test that toPattern() round trips with syntax characters 172 // and whitespace. 173 for (int i = 0; i < OTHER_TOPATTERN_TESTS.length; ++i) { 174 checkPat(OTHER_TOPATTERN_TESTS[i], new UnicodeSet(OTHER_TOPATTERN_TESTS[i])); 175 } 176 for (int i = 0; i <= 0x10FFFF; ++i) { 177 if ((i <= 0xFF && !UCharacter.isLetter(i)) || UCharacter.isWhitespace(i)) { 178 // check various combinations to make sure they all work. 179 if (i != 0 && !toPatternAux(i, i)) continue; 180 if (!toPatternAux(0, i)) continue; 181 if (!toPatternAux(i, 0xFFFF)) continue; 182 } 183 } 184 185 // Test pattern behavior of multicharacter strings. 186 UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]"); 187 expectToPattern(s, "[a-z{aa}{ab}]", 188 new String[] {"aa", "ab", NOT, "ac"}); 189 s.add("ac"); 190 expectToPattern(s, "[a-z{aa}{ab}{ac}]", 191 new String[] {"aa", "ab", "ac", NOT, "xy"}); 192 193 s.applyPattern("[a-z {\\{l} {r\\}}]"); 194 expectToPattern(s, "[a-z{r\\}}{\\{l}]", 195 new String[] {"{l", "r}", NOT, "xy"}); 196 s.add("[]"); 197 expectToPattern(s, "[a-z{\\[\\]}{r\\}}{\\{l}]", 198 new String[] {"{l", "r}", "[]", NOT, "xy"}); 199 200 s.applyPattern("[a-z {\u4E01\u4E02}{\\n\\r}]"); 201 expectToPattern(s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", 202 new String[] {"\u4E01\u4E02", "\n\r"}); 203 204 s.clear(); 205 s.add("abc"); 206 s.add("abc"); 207 expectToPattern(s, "[{abc}]", 208 new String[] {"abc", NOT, "ab"}); 209 210 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 211 s.clear(); 212 s.add('a', 'b'); 213 expectToPattern(s, "[ab]", null); 214 215 // Cover applyPattern, applyPropertyAlias 216 s.clear(); 217 s.applyPattern("[ab ]", true); 218 expectToPattern(s, "[ab]", new String[] {"a", NOT, "ab", " "}); 219 s.clear(); 220 s.applyPattern("[ab ]", false); 221 expectToPattern(s, "[\\ ab]", new String[] {"a", "\u0020", NOT, "ab"}); 222 223 s.clear(); 224 s.applyPropertyAlias("nv", "0.5"); 225 s.retainAll(new UnicodeSet("[:age=6.0:]")); // stabilize this test 226 expectToPattern(s, "[\\u00BD\\u0B73\\u0D74\\u0F2A\\u2CFD\\uA831\\U00010141\\U00010175\\U00010176\\U00010E7B]", null); 227 // Unicode 5.1 adds Malayalam 1/2 (\u0D74) 228 // Unicode 5.2 adds U+A831 NORTH INDIC FRACTION ONE HALF and U+10E7B RUMI FRACTION ONE HALF 229 // Unicode 6.0 adds U+0B73 ORIYA FRACTION ONE HALF 230 231 s.clear(); 232 s.applyPropertyAlias("gc", "Lu"); 233 // TODO expectToPattern(s, what?) 234 235 // RemoveAllStrings() 236 s.clear(); 237 s.applyPattern("[a-z{abc}{def}]"); 238 expectToPattern(s, "[a-z{abc}{def}]", null); 239 s.removeAllStrings(); 240 expectToPattern(s, "[a-z]", null); 241 } 242 243 static String[] OTHER_TOPATTERN_TESTS = { 244 "[[:latin:]&[:greek:]]", 245 "[[:latin:]-[:greek:]]", 246 "[:nonspacing mark:]" 247 }; 248 249 250 public boolean toPatternAux(int start, int end) { 251 // use Integer.toString because Utility.hex doesn't handle ints 252 String source = "0x" + Integer.toString(start,16).toUpperCase(); 253 if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 254 UnicodeSet testSet = new UnicodeSet(); 255 testSet.add(start, end); 256 return checkPat(source, testSet); 257 } 258 259 boolean checkPat (String source, UnicodeSet testSet) { 260 String pat = ""; 261 try { 262 // What we want to make sure of is that a pattern generated 263 // by toPattern(), with or without escaped unprintables, can 264 // be passed back into the UnicodeSet constructor. 265 String pat0 = testSet.toPattern(true); 266 if (!checkPat(source + " (escaped)", testSet, pat0)) return false; 267 268 //String pat1 = unescapeLeniently(pat0); 269 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 270 271 String pat2 = testSet.toPattern(false); 272 if (!checkPat(source, testSet, pat2)) return false; 273 274 //String pat3 = unescapeLeniently(pat2); 275 //if (!checkPat(source + " (in code)", testSet, pat3)) return false; 276 277 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 278 logln(source + " => " + pat0 + ", " + pat2); 279 } catch (Exception e) { 280 errln("EXCEPTION in toPattern: " + source + " => " + pat); 281 return false; 282 } 283 return true; 284 } 285 286 boolean checkPat (String source, UnicodeSet testSet, String pat) { 287 UnicodeSet testSet2 = new UnicodeSet(pat); 288 if (!testSet2.equals(testSet)) { 289 errln("Fail toPattern: " + source + "; " + pat + " => " + 290 testSet2.toPattern(false) + ", expected " + 291 testSet.toPattern(false)); 292 return false; 293 } 294 return true; 295 } 296 297 // NOTE: copied the following from Utility. There ought to be a version in there with a flag 298 // that does the Java stuff 299 300 public static int unescapeAt(String s, int[] offset16) { 301 int c; 302 int result = 0; 303 int n = 0; 304 int minDig = 0; 305 int maxDig = 0; 306 int bitsPerDigit = 4; 307 int dig; 308 int i; 309 310 /* Check that offset is in range */ 311 int offset = offset16[0]; 312 int length = s.length(); 313 if (offset < 0 || offset >= length) { 314 return -1; 315 } 316 317 /* Fetch first UChar after '\\' */ 318 c = UTF16.charAt(s, offset); 319 offset += UTF16.getCharCount(c); 320 321 /* Convert hexadecimal and octal escapes */ 322 switch (c) { 323 case 'u': 324 minDig = maxDig = 4; 325 break; 326 /* 327 case 'U': 328 minDig = maxDig = 8; 329 break; 330 case 'x': 331 minDig = 1; 332 maxDig = 2; 333 break; 334 */ 335 default: 336 dig = UCharacter.digit(c, 8); 337 if (dig >= 0) { 338 minDig = 1; 339 maxDig = 3; 340 n = 1; /* Already have first octal digit */ 341 bitsPerDigit = 3; 342 result = dig; 343 } 344 break; 345 } 346 if (minDig != 0) { 347 while (offset < length && n < maxDig) { 348 // TEMPORARY 349 // TODO: Restore the char32-based code when UCharacter.digit 350 // is working (Bug 66). 351 352 //c = UTF16.charAt(s, offset); 353 //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 354 c = s.charAt(offset); 355 dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16); 356 if (dig < 0) { 357 break; 358 } 359 result = (result << bitsPerDigit) | dig; 360 //offset += UTF16.getCharCount(c); 361 ++offset; 362 ++n; 363 } 364 if (n < minDig) { 365 return -1; 366 } 367 offset16[0] = offset; 368 return result; 369 } 370 371 /* Convert C-style escapes in table */ 372 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 373 if (c == UNESCAPE_MAP[i]) { 374 offset16[0] = offset; 375 return UNESCAPE_MAP[i+1]; 376 } else if (c < UNESCAPE_MAP[i]) { 377 break; 378 } 379 } 380 381 /* If no special forms are recognized, then consider 382 * the backslash to generically escape the next character. */ 383 offset16[0] = offset; 384 return c; 385 } 386 387 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 388 static private final char[] UNESCAPE_MAP = { 389 /*" 0x22, 0x22 */ 390 /*' 0x27, 0x27 */ 391 /*? 0x3F, 0x3F */ 392 /*\ 0x5C, 0x5C */ 393 /*a*/ 0x61, 0x07, 394 /*b*/ 0x62, 0x08, 395 /*f*/ 0x66, 0x0c, 396 /*n*/ 0x6E, 0x0a, 397 /*r*/ 0x72, 0x0d, 398 /*t*/ 0x74, 0x09, 399 /*v*/ 0x76, 0x0b 400 }; 401 402 /** 403 * Convert all escapes in a given string using unescapeAt(). 404 * Leave invalid escape sequences unchanged. 405 */ 406 public static String unescapeLeniently(String s) { 407 StringBuffer buf = new StringBuffer(); 408 int[] pos = new int[1]; 409 for (int i=0; i<s.length(); ) { 410 char c = s.charAt(i++); 411 if (c == '\\') { 412 pos[0] = i; 413 int e = unescapeAt(s, pos); 414 if (e < 0) { 415 buf.append(c); 416 } else { 417 UTF16.append(buf, e); 418 i = pos[0]; 419 } 420 } else { 421 buf.append(c); 422 } 423 } 424 return buf.toString(); 425 } 426 427 public void TestPatterns() { 428 UnicodeSet set = new UnicodeSet(); 429 expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km"); 430 expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz"); 431 expectPattern(set, "[a\\-z]", "--aazz"); 432 expectPattern(set, "[-az]", "--aazz"); 433 expectPattern(set, "[az-]", "--aazz"); 434 expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz"); 435 436 // Throw in a test of complement 437 set.complement(); 438 String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF'; 439 expectPairs(set, exp); 440 } 441 442 public void TestCategories() { 443 int failures = 0; 444 UnicodeSet set = new UnicodeSet("[:Lu:]"); 445 expectContainment(set, "ABC", "abc"); 446 447 // Make sure generation of L doesn't pollute cached Lu set 448 // First generate L, then Lu 449 // not used int TOP = 0x200; // Don't need to go over the whole range: 450 set = new UnicodeSet("[:L:]"); 451 for (int i=0; i<0x200; ++i) { 452 boolean l = UCharacter.isLetter(i); 453 if (l != set.contains((char)i)) { 454 errln("FAIL: L contains " + (char)i + " = " + 455 set.contains((char)i)); 456 if (++failures == 10) break; 457 } 458 } 459 460 set = new UnicodeSet("[:Lu:]"); 461 for (int i=0; i<0x200; ++i) { 462 boolean lu = (UCharacter.getType(i) == ECharacterCategory.UPPERCASE_LETTER); 463 if (lu != set.contains((char)i)) { 464 errln("FAIL: Lu contains " + (char)i + " = " + 465 set.contains((char)i)); 466 if (++failures == 20) break; 467 } 468 } 469 } 470 471 public void TestAddRemove() { 472 UnicodeSet set = new UnicodeSet(); 473 set.add('a', 'z'); 474 expectPairs(set, "az"); 475 set.remove('m', 'p'); 476 expectPairs(set, "alqz"); 477 set.remove('e', 'g'); 478 expectPairs(set, "adhlqz"); 479 set.remove('d', 'i'); 480 expectPairs(set, "acjlqz"); 481 set.remove('c', 'r'); 482 expectPairs(set, "absz"); 483 set.add('f', 'q'); 484 expectPairs(set, "abfqsz"); 485 set.remove('a', 'g'); 486 expectPairs(set, "hqsz"); 487 set.remove('a', 'z'); 488 expectPairs(set, ""); 489 490 // Try removing an entire set from another set 491 expectPattern(set, "[c-x]", "cx"); 492 UnicodeSet set2 = new UnicodeSet(); 493 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 494 set.removeAll(set2); 495 expectPairs(set, "deluxx"); 496 497 // Try adding an entire set to another set 498 expectPattern(set, "[jackiemclean]", "aacceein"); 499 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 500 set.addAll(set2); 501 expectPairs(set, "aacehort"); 502 503 // Test commutativity 504 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 505 expectPattern(set2, "[jackiemclean]", "aacceein"); 506 set.addAll(set2); 507 expectPairs(set, "aacehort"); 508 } 509 510 /** 511 * Make sure minimal representation is maintained. 512 */ 513 public void TestMinimalRep() { 514 // This is pretty thoroughly tested by checkCanonicalRep() 515 // run against the exhaustive operation results. Use the code 516 // here for debugging specific spot problems. 517 518 // 1 overlap against 2 519 UnicodeSet set = new UnicodeSet("[h-km-q]"); 520 UnicodeSet set2 = new UnicodeSet("[i-o]"); 521 set.addAll(set2); 522 expectPairs(set, "hq"); 523 // right 524 set.applyPattern("[a-m]"); 525 set2.applyPattern("[e-o]"); 526 set.addAll(set2); 527 expectPairs(set, "ao"); 528 // left 529 set.applyPattern("[e-o]"); 530 set2.applyPattern("[a-m]"); 531 set.addAll(set2); 532 expectPairs(set, "ao"); 533 // 1 overlap against 3 534 set.applyPattern("[a-eg-mo-w]"); 535 set2.applyPattern("[d-q]"); 536 set.addAll(set2); 537 expectPairs(set, "aw"); 538 } 539 540 public void TestAPI() { 541 // default ct 542 UnicodeSet set = new UnicodeSet(); 543 if (!set.isEmpty() || set.getRangeCount() != 0) { 544 errln("FAIL, set should be empty but isn't: " + 545 set); 546 } 547 548 // clear(), isEmpty() 549 set.add('a'); 550 if (set.isEmpty()) { 551 errln("FAIL, set shouldn't be empty but is: " + 552 set); 553 } 554 set.clear(); 555 if (!set.isEmpty()) { 556 errln("FAIL, set should be empty but isn't: " + 557 set); 558 } 559 560 // size() 561 set.clear(); 562 if (set.size() != 0) { 563 errln("FAIL, size should be 0, but is " + set.size() + 564 ": " + set); 565 } 566 set.add('a'); 567 if (set.size() != 1) { 568 errln("FAIL, size should be 1, but is " + set.size() + 569 ": " + set); 570 } 571 set.add('1', '9'); 572 if (set.size() != 10) { 573 errln("FAIL, size should be 10, but is " + set.size() + 574 ": " + set); 575 } 576 set.clear(); 577 set.complement(); 578 if (set.size() != 0x110000) { 579 errln("FAIL, size should be 0x110000, but is" + set.size()); 580 } 581 582 // contains(first, last) 583 set.clear(); 584 set.applyPattern("[A-Y 1-8 b-d l-y]"); 585 for (int i = 0; i<set.getRangeCount(); ++i) { 586 int a = set.getRangeStart(i); 587 int b = set.getRangeEnd(i); 588 if (!set.contains(a, b)) { 589 errln("FAIL, should contain " + (char)a + '-' + (char)b + 590 " but doesn't: " + set); 591 } 592 if (set.contains((char)(a-1), b)) { 593 errln("FAIL, shouldn't contain " + 594 (char)(a-1) + '-' + (char)b + 595 " but does: " + set); 596 } 597 if (set.contains(a, (char)(b+1))) { 598 errln("FAIL, shouldn't contain " + 599 (char)a + '-' + (char)(b+1) + 600 " but does: " + set); 601 } 602 } 603 604 // Ported InversionList test. 605 UnicodeSet a = new UnicodeSet((char)3,(char)10); 606 UnicodeSet b = new UnicodeSet((char)7,(char)15); 607 UnicodeSet c = new UnicodeSet(); 608 609 logln("a [3-10]: " + a); 610 logln("b [7-15]: " + b); 611 c.set(a); c.addAll(b); 612 UnicodeSet exp = new UnicodeSet((char)3,(char)15); 613 if (c.equals(exp)) { 614 logln("c.set(a).add(b): " + c); 615 } else { 616 errln("FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 617 } 618 c.complement(); 619 exp.set((char)0, (char)2); 620 exp.add((char)16, UnicodeSet.MAX_VALUE); 621 if (c.equals(exp)) { 622 logln("c.complement(): " + c); 623 } else { 624 errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp)); 625 } 626 c.complement(); 627 exp.set((char)3, (char)15); 628 if (c.equals(exp)) { 629 logln("c.complement(): " + c); 630 } else { 631 errln("FAIL: c.complement() = " + c + ", expect " + exp); 632 } 633 c.set(a); c.complementAll(b); 634 exp.set((char)3,(char)6); 635 exp.add((char)11,(char) 15); 636 if (c.equals(exp)) { 637 logln("c.set(a).complement(b): " + c); 638 } else { 639 errln("FAIL: c.set(a).complement(b) = " + c + ", expect " + exp); 640 } 641 642 exp.set(c); 643 c = bitsToSet(setToBits(c)); 644 if (c.equals(exp)) { 645 logln("bitsToSet(setToBits(c)): " + c); 646 } else { 647 errln("FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 648 } 649 650 // Additional tests for coverage JB#2118 651 //UnicodeSet::complement(class UnicodeString const &) 652 //UnicodeSet::complementAll(class UnicodeString const &) 653 //UnicodeSet::containsNone(class UnicodeSet const &) 654 //UnicodeSet::containsNone(long,long) 655 //UnicodeSet::containsSome(class UnicodeSet const &) 656 //UnicodeSet::containsSome(long,long) 657 //UnicodeSet::removeAll(class UnicodeString const &) 658 //UnicodeSet::retain(long) 659 //UnicodeSet::retainAll(class UnicodeString const &) 660 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 661 //UnicodeSetIterator::getString(void) 662 set.clear(); 663 set.complement("ab"); 664 exp.applyPattern("[{ab}]"); 665 if (!set.equals(exp)) { errln("FAIL: complement(\"ab\")"); return; } 666 667 UnicodeSetIterator iset = new UnicodeSetIterator(set); 668 if (!iset.next() || iset.codepoint != UnicodeSetIterator.IS_STRING) { 669 errln("FAIL: UnicodeSetIterator.next/IS_STRING"); 670 } else if (!iset.string.equals("ab")) { 671 errln("FAIL: UnicodeSetIterator.string"); 672 } 673 674 set.add((char)0x61, (char)0x7A); 675 set.complementAll("alan"); 676 exp.applyPattern("[{ab}b-kmo-z]"); 677 if (!set.equals(exp)) { errln("FAIL: complementAll(\"alan\")"); return; } 678 679 exp.applyPattern("[a-z]"); 680 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 681 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 682 exp.applyPattern("[aln]"); 683 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 684 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 685 686 if (set.containsNone((char)0x61, (char)0x7A)) { 687 errln("FAIL: containsNone(char, char)"); 688 } 689 if (!set.containsSome((char)0x61, (char)0x7A)) { 690 errln("FAIL: containsSome(char, char)"); 691 } 692 if (!set.containsNone((char)0x41, (char)0x5A)) { 693 errln("FAIL: containsNone(char, char)"); 694 } 695 if (set.containsSome((char)0x41, (char)0x5A)) { 696 errln("FAIL: containsSome(char, char)"); 697 } 698 699 set.removeAll("liu"); 700 exp.applyPattern("[{ab}b-hj-kmo-tv-z]"); 701 if (!set.equals(exp)) { errln("FAIL: removeAll(\"liu\")"); return; } 702 703 set.retainAll("star"); 704 exp.applyPattern("[rst]"); 705 if (!set.equals(exp)) { errln("FAIL: retainAll(\"star\")"); return; } 706 707 set.retain((char)0x73); 708 exp.applyPattern("[s]"); 709 if (!set.equals(exp)) { errln("FAIL: retain('s')"); return; } 710 711 // ICU 2.6 coverage tests 712 // public final UnicodeSet retain(String s); 713 // public final UnicodeSet remove(int c); 714 // public final UnicodeSet remove(String s); 715 // public int hashCode(); 716 set.applyPattern("[a-z{ab}{cd}]"); 717 set.retain("cd"); 718 exp.applyPattern("[{cd}]"); 719 if (!set.equals(exp)) { errln("FAIL: retain(\"cd\")"); return; } 720 721 set.applyPattern("[a-z{ab}{cd}]"); 722 set.remove((char)0x63); 723 exp.applyPattern("[abd-z{ab}{cd}]"); 724 if (!set.equals(exp)) { errln("FAIL: remove('c')"); return; } 725 726 set.remove("cd"); 727 exp.applyPattern("[abd-z{ab}]"); 728 if (!set.equals(exp)) { errln("FAIL: remove(\"cd\")"); return; } 729 730 if (set.hashCode() != exp.hashCode()) { 731 errln("FAIL: hashCode() unequal"); 732 } 733 exp.clear(); 734 if (set.hashCode() == exp.hashCode()) { 735 errln("FAIL: hashCode() equal"); 736 } 737 738 { 739 //Cover addAll(Collection) and addAllTo(Collection) 740 // Seems that there is a bug in addAll(Collection) operation 741 // Ram also add a similar test to UtilityTest.java 742 logln("Testing addAll(Collection) ... "); 743 String[] array = {"a", "b", "c", "de"}; 744 List list = Arrays.asList(array); 745 Set aset = new HashSet(list); 746 logln(" *** The source set's size is: " + aset.size()); 747 748 set.clear(); 749 set.addAll(aset); 750 if (set.size() != aset.size()) { 751 errln("FAIL: After addAll, the UnicodeSet size expected " + aset.size() + 752 ", " + set.size() + " seen instead!"); 753 } else { 754 logln("OK: After addAll, the UnicodeSet size got " + set.size()); 755 } 756 757 List list2 = new ArrayList(); 758 set.addAllTo(list2); 759 760 //verify the result 761 log(" *** The elements are: "); 762 String s = set.toPattern(true); 763 logln(s); 764 Iterator myiter = list2.iterator(); 765 while(myiter.hasNext()) { 766 log(myiter.next().toString() + " "); 767 } 768 logln(""); // a new line 769 } 770 771 } 772 773 public void TestStrings() { 774 // Object[][] testList = { 775 // {I_EQUALS, UnicodeSet.fromAll("abc"), 776 // new UnicodeSet("[a-c]")}, 777 // 778 // {I_EQUALS, UnicodeSet.from("ch").add('a','z').add("ll"), 779 // new UnicodeSet("[{ll}{ch}a-z]")}, 780 // 781 // {I_EQUALS, UnicodeSet.from("ab}c"), 782 // new UnicodeSet("[{ab\\}c}]")}, 783 // 784 // {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'), 785 // new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")}, 786 // }; 787 // 788 // for (int i = 0; i < testList.length; ++i) { 789 // expectRelation(testList[i][0], testList[i][1], testList[i][2], "(" + i + ")"); 790 // } 791 792 UnicodeSet[][] testList = { 793 {UnicodeSet.fromAll("abc"), 794 new UnicodeSet("[a-c]")}, 795 796 {UnicodeSet.from("ch").add('a','z').add("ll"), 797 new UnicodeSet("[{ll}{ch}a-z]")}, 798 799 {UnicodeSet.from("ab}c"), 800 new UnicodeSet("[{ab\\}c}]")}, 801 802 {new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'), 803 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")}, 804 }; 805 806 for (int i = 0; i < testList.length; ++i) { 807 if (!testList[i][0].equals(testList[i][1])) { 808 errln("FAIL: sets unequal; see source code (" + i + ")"); 809 } 810 } 811 } 812 813 static final Integer 814 I_ANY = new Integer(SortedSetRelation.ANY), 815 I_CONTAINS = new Integer(SortedSetRelation.CONTAINS), 816 I_DISJOINT = new Integer(SortedSetRelation.DISJOINT), 817 I_NO_B = new Integer(SortedSetRelation.NO_B), 818 I_ISCONTAINED = new Integer(SortedSetRelation.ISCONTAINED), 819 I_EQUALS = new Integer(SortedSetRelation.EQUALS), 820 I_NO_A = new Integer(SortedSetRelation.NO_A), 821 I_NONE = new Integer(SortedSetRelation.NONE); 822 823 public void TestSetRelation() { 824 825 String[] choices = {"a", "b", "cd", "ef"}; 826 int limit = 1 << choices.length; 827 828 SortedSet iset = new TreeSet(); 829 SortedSet jset = new TreeSet(); 830 831 for (int i = 0; i < limit; ++i) { 832 pick(i, choices, iset); 833 for (int j = 0; j < limit; ++j) { 834 pick(j, choices, jset); 835 checkSetRelation(iset, jset, "(" + i + ")"); 836 } 837 } 838 } 839 840 public void TestSetSpeed() { 841 // skip unless verbose 842 if (!isVerbose()) return; 843 844 SetSpeed2(100); 845 SetSpeed2(1000); 846 } 847 848 public void SetSpeed2(int size) { 849 850 SortedSet iset = new TreeSet(); 851 SortedSet jset = new TreeSet(); 852 853 for (int i = 0; i < size*2; i += 2) { // only even values 854 iset.add(new Integer(i)); 855 jset.add(new Integer(i)); 856 } 857 858 int iterations = 1000000 / size; 859 860 logln("Timing comparison of Java vs Utility"); 861 logln("For about " + size + " objects that are almost all the same."); 862 863 CheckSpeed(iset, jset, "when a = b", iterations); 864 865 iset.add(new Integer(size + 1)); // add odd value in middle 866 867 CheckSpeed(iset, jset, "when a contains b", iterations); 868 CheckSpeed(jset, iset, "when b contains a", iterations); 869 870 jset.add(new Integer(size - 1)); // add different odd value in middle 871 872 CheckSpeed(jset, iset, "when a, b are disjoint", iterations); 873 } 874 875 void CheckSpeed(SortedSet iset, SortedSet jset, String message, int iterations) { 876 CheckSpeed2(iset, jset, message, iterations); 877 CheckSpeed3(iset, jset, message, iterations); 878 } 879 880 void CheckSpeed2(SortedSet iset, SortedSet jset, String message, int iterations) { 881 boolean x; 882 boolean y; 883 884 // make sure code is loaded: 885 x = iset.containsAll(jset); 886 y = SortedSetRelation.hasRelation(iset, SortedSetRelation.CONTAINS, jset); 887 if (x != y) errln("FAIL contains comparison"); 888 889 double start = System.currentTimeMillis(); 890 for (int i = 0; i < iterations; ++i) { 891 x |= iset.containsAll(jset); 892 } 893 double middle = System.currentTimeMillis(); 894 for (int i = 0; i < iterations; ++i) { 895 y |= SortedSetRelation.hasRelation(iset, SortedSetRelation.CONTAINS, jset); 896 } 897 double end = System.currentTimeMillis(); 898 899 double jtime = (middle - start)/iterations; 900 double utime = (end - middle)/iterations; 901 902 NumberFormat nf = NumberFormat.getPercentInstance(); 903 logln("Test contains: " + message + ": Java: " + jtime 904 + ", Utility: " + utime + ", u:j: " + nf.format(utime/jtime)); 905 } 906 907 void CheckSpeed3(SortedSet iset, SortedSet jset, String message, int iterations) { 908 boolean x; 909 boolean y; 910 911 // make sure code is loaded: 912 x = iset.equals(jset); 913 y = SortedSetRelation.hasRelation(iset, SortedSetRelation.EQUALS, jset); 914 if (x != y) errln("FAIL equality comparison"); 915 916 917 double start = System.currentTimeMillis(); 918 for (int i = 0; i < iterations; ++i) { 919 x |= iset.equals(jset); 920 } 921 double middle = System.currentTimeMillis(); 922 for (int i = 0; i < iterations; ++i) { 923 y |= SortedSetRelation.hasRelation(iset, SortedSetRelation.EQUALS, jset); 924 } 925 double end = System.currentTimeMillis(); 926 927 double jtime = (middle - start)/iterations; 928 double utime = (end - middle)/iterations; 929 930 NumberFormat nf = NumberFormat.getPercentInstance(); 931 logln("Test equals: " + message + ": Java: " + jtime 932 + ", Utility: " + utime + ", u:j: " + nf.format(utime/jtime)); 933 } 934 935 void pick(int bits, Object[] examples, SortedSet output) { 936 output.clear(); 937 for (int k = 0; k < 32; ++k) { 938 if (((1<<k) & bits) != 0) output.add(examples[k]); 939 } 940 } 941 942 public static final String[] RELATION_NAME = { 943 "both-are-null", 944 "a-is-null", 945 "equals", 946 "is-contained-in", 947 "b-is-null", 948 "is-disjoint_with", 949 "contains", 950 "any", }; 951 952 boolean dumbHasRelation(Collection A, int filter, Collection B) { 953 Collection ab = new TreeSet(A); 954 ab.retainAll(B); 955 if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false; 956 957 // A - B size == A.size - A&B.size 958 if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false; 959 960 // B - A size == B.size - A&B.size 961 if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false; 962 963 964 return true; 965 } 966 967 void checkSetRelation(SortedSet a, SortedSet b, String message) { 968 for (int i = 0; i < 8; ++i) { 969 970 boolean hasRelation = SortedSetRelation.hasRelation(a, i, b); 971 boolean dumbHasRelation = dumbHasRelation(a, i, b); 972 973 logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); 974 975 if (hasRelation != dumbHasRelation) { 976 errln("FAIL: " + 977 message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); 978 } 979 } 980 logln(""); 981 } 982 983 /** 984 * Test the [:Latin:] syntax. 985 */ 986 public void TestScriptSet() { 987 988 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); 989 990 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 991 992 /* Jitterbug 1423 */ 993 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 994 995 } 996 997 /** 998 * Test the [:Latin:] syntax. 999 */ 1000 public void TestPropertySet() { 1001 String[] DATA = { 1002 // Pattern, Chars IN, Chars NOT in 1003 1004 "[:Latin:]", 1005 "aA", 1006 "\u0391\u03B1", 1007 1008 "[\\p{Greek}]", 1009 "\u0391\u03B1", 1010 "aA", 1011 1012 "\\P{ GENERAL Category = upper case letter }", 1013 "abc", 1014 "ABC", 1015 1016 // Combining class: @since ICU 2.2 1017 // Check both symbolic and numeric 1018 "\\p{ccc=Nukta}", 1019 "\u0ABC", 1020 "abc", 1021 1022 "\\p{Canonical Combining Class = 11}", 1023 "\u05B1", 1024 "\u05B2", 1025 1026 "[:c c c = iota subscript :]", 1027 "\u0345", 1028 "xyz", 1029 1030 // Bidi class: @since ICU 2.2 1031 "\\p{bidiclass=lefttoright}", 1032 "abc", 1033 "\u0671\u0672", 1034 1035 // Binary properties: @since ICU 2.2 1036 "\\p{ideographic}", 1037 "\u4E0A", 1038 "x", 1039 1040 "[:math=false:]", 1041 "q)*(", // )(and * were removed from math in Unicode 4.0.1 1042 "+<>^", 1043 1044 // JB#1767 \N{}, \p{ASCII} 1045 "[:Ascii:]", 1046 "abc\u0000\u007F", 1047 "\u0080\u4E00", 1048 1049 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 1050 "az", 1051 "qrs", 1052 1053 // JB#2015 1054 "[:any:]", 1055 "a\\U0010FFFF", 1056 "", 1057 1058 "[:nv=0.5:]", 1059 "\u00BD\u0F2A", 1060 "\u00BC", 1061 1062 // JB#2653: Age 1063 "[:Age=1.1:]", 1064 "\u03D6", // 1.1 1065 "\u03D8\u03D9", // 3.2 1066 1067 "[:Age=3.1:]", 1068 "\\u1800\\u3400\\U0002f800", 1069 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 1070 1071 // JB#2350: Case_Sensitive 1072 "[:Case Sensitive:]", 1073 "A\u1FFC\\U00010410", 1074 ";\u00B4\\U00010500", 1075 1076 1077 // Regex compatibility test 1078 "[-b]", // leading '-' is literal 1079 "-b", 1080 "ac", 1081 1082 "[^-b]", // leading '-' is literal 1083 "ac", 1084 "-b", 1085 1086 "[b-]", // trailing '-' is literal 1087 "-b", 1088 "ac", 1089 1090 "[^b-]", // trailing '-' is literal 1091 "ac", 1092 "-b", 1093 1094 "[a-b-]", // trailing '-' is literal 1095 "ab-", 1096 "c=", 1097 1098 "[[a-q]&[p-z]-]", // trailing '-' is literal 1099 "pq-", 1100 "or=", 1101 1102 "[\\s|\\)|:|$|\\>]", // from regex tests 1103 "s|):$>", 1104 "\\abc", 1105 1106 "[\uDC00cd]", // JB#2906: isolated trail at start 1107 "cd\uDC00", 1108 "ab\uD800\\U00010000", 1109 1110 "[ab\uD800]", // JB#2906: isolated trail at start 1111 "ab\uD800", 1112 "cd\uDC00\\U00010000", 1113 1114 "[ab\uD800cd]", // JB#2906: isolated lead in middle 1115 "abcd\uD800", 1116 "ef\uDC00\\U00010000", 1117 1118 "[ab\uDC00cd]", // JB#2906: isolated trail in middle 1119 "abcd\uDC00", 1120 "ef\uD800\\U00010000", 1121 1122 "[:^lccc=0:]", // Lead canonical class 1123 "\u0300\u0301", 1124 "abcd\u00c0\u00c5", 1125 1126 "[:^tccc=0:]", // Trail canonical class 1127 "\u0300\u0301\u00c0\u00c5", 1128 "abcd", 1129 1130 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 1131 "\u0300\u0301\u00c0\u00c5", 1132 "abcd", 1133 1134 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1135 "", 1136 "abcd\u0300\u0301\u00c0\u00c5", 1137 1138 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not 1139 "\u0F73\u0F75\u0F81", 1140 "abcd\u0300\u0301\u00c0\u00c5", 1141 1142 "[:Assigned:]", 1143 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1144 "\\u0888\\uFDD3\\uFFFE\\U00050005", 1145 1146 // Script_Extensions, new in Unicode 6.0 1147 "[:scx=Arab:]", 1148 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3", 1149 "\\u061D\\uFDEF\\uFDFE", 1150 1151 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions, 1152 // so scx-sc is missing U+FDF2. 1153 "[[:Script_Extensions=Arabic:]-[:Arab:]]", 1154 "\\u0640\\u064B\\u0650\\u0655", 1155 "\\uFDF2" 1156 }; 1157 1158 for (int i=0; i<DATA.length; i+=3) { 1159 expectContainment(DATA[i], DATA[i+1], DATA[i+2]); 1160 } 1161 } 1162 1163 public void TestUnicodeSetStrings() { 1164 UnicodeSet uset = new UnicodeSet("[a{bc}{cd}pqr\u0000]"); 1165 logln(uset + " ~ " + uset.getRegexEquivalent()); 1166 String[][] testStrings = {{"x", "none"}, 1167 {"bc", "all"}, 1168 {"cdbca", "all"}, 1169 {"a", "all"}, 1170 {"bcx", "some"}, 1171 {"ab", "some"}, 1172 {"acb", "some"}, 1173 {"bcda", "some"}, 1174 {"dccbx", "none"}, 1175 }; 1176 for (int i = 0; i < testStrings.length; ++i) { 1177 check(uset, testStrings[i][0], testStrings[i][1]); 1178 } 1179 } 1180 1181 1182 private void check(UnicodeSet uset, String string, String desiredStatus) { 1183 boolean shouldContainAll = desiredStatus.equals("all"); 1184 boolean shouldContainNone = desiredStatus.equals("none"); 1185 if (uset.containsAll(string) != shouldContainAll) { 1186 errln("containsAll " + string + " should be " + shouldContainAll); 1187 } else { 1188 logln("containsAll " + string + " = " + shouldContainAll); 1189 } 1190 if (uset.containsNone(string) != shouldContainNone) { 1191 errln("containsNone " + string + " should be " + shouldContainNone); 1192 } else { 1193 logln("containsNone " + string + " = " + shouldContainNone); 1194 } 1195 } 1196 1197 /** 1198 * Test cloning of UnicodeSet 1199 */ 1200 public void TestClone() { 1201 UnicodeSet s = new UnicodeSet("[abcxyz]"); 1202 UnicodeSet t = (UnicodeSet) s.clone(); 1203 expectContainment(t, "abc", "def"); 1204 } 1205 1206 /** 1207 * Test the indexOf() and charAt() methods. 1208 */ 1209 public void TestIndexOf() { 1210 UnicodeSet set = new UnicodeSet("[a-cx-y3578]"); 1211 for (int i=0; i<set.size(); ++i) { 1212 int c = set.charAt(i); 1213 if (set.indexOf(c) != i) { 1214 errln("FAIL: charAt(" + i + ") = " + c + 1215 " => indexOf() => " + set.indexOf(c)); 1216 } 1217 } 1218 int c = set.charAt(set.size()); 1219 if (c != -1) { 1220 errln("FAIL: charAt(<out of range>) = " + 1221 Utility.escape(String.valueOf(c))); 1222 } 1223 int j = set.indexOf('q'); 1224 if (j != -1) { 1225 errln("FAIL: indexOf('q') = " + j); 1226 } 1227 } 1228 1229 public void TestContainsString() { 1230 UnicodeSet x = new UnicodeSet("[a{bc}]"); 1231 if (x.contains("abc")) errln("FAIL"); 1232 } 1233 1234 public void TestExhaustive() { 1235 // exhaustive tests. Simulate UnicodeSets with integers. 1236 // That gives us very solid tests (except for large memory tests). 1237 1238 char limit = (char)128; 1239 1240 for (char i = 0; i < limit; ++i) { 1241 logln("Testing " + i + ", " + bitsToSet(i)); 1242 _testComplement(i); 1243 1244 // AS LONG AS WE ARE HERE, check roundtrip 1245 checkRoundTrip(bitsToSet(i)); 1246 1247 for (char j = 0; j < limit; ++j) { 1248 _testAdd(i,j); 1249 _testXor(i,j); 1250 _testRetain(i,j); 1251 _testRemove(i,j); 1252 } 1253 } 1254 } 1255 1256 /** 1257 * Make sure each script name and abbreviated name can be used 1258 * to construct a UnicodeSet. 1259 */ 1260 public void TestScriptNames() { 1261 for (int i=0; i<UScript.CODE_LIMIT; ++i) { 1262 for (int j=0; j<2; ++j) { 1263 String pat = ""; 1264 try { 1265 String name = 1266 (j==0) ? UScript.getName(i) : UScript.getShortName(i); 1267 pat = "[:" + name + ":]"; 1268 UnicodeSet set = new UnicodeSet(pat); 1269 logln("Ok: " + pat + " -> " + set.toPattern(false)); 1270 } catch (IllegalArgumentException e) { 1271 if (pat.length() == 0) { 1272 errln("FAIL (in UScript): No name for script " + i); 1273 } else { 1274 errln("FAIL: Couldn't create " + pat); 1275 } 1276 } 1277 } 1278 } 1279 } 1280 1281 /** 1282 * Test closure API. 1283 */ 1284 public void TestCloseOver() { 1285 String CASE = String.valueOf(UnicodeSet.CASE); 1286 String[] DATA = { 1287 // selector, input, output 1288 CASE, 1289 "[aq\u00DF{Bc}{bC}{Fi}]", 1290 "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1291 1292 CASE, 1293 "[\u01F1]", // 'DZ' 1294 "[\u01F1\u01F2\u01F3]", 1295 1296 CASE, 1297 "[\u1FB4]", 1298 "[\u1FB4{\u03AC\u03B9}]", 1299 1300 CASE, 1301 "[{F\uFB01}]", 1302 "[\uFB03{ffi}]", 1303 1304 CASE, 1305 "[a-z]","[A-Za-z\u017F\u212A]", 1306 CASE, 1307 "[abc]","[A-Ca-c]", 1308 CASE, 1309 "[ABC]","[A-Ca-c]", 1310 }; 1311 1312 UnicodeSet s = new UnicodeSet(); 1313 UnicodeSet t = new UnicodeSet(); 1314 for (int i=0; i<DATA.length; i+=3) { 1315 int selector = Integer.parseInt(DATA[i]); 1316 String pat = DATA[i+1]; 1317 String exp = DATA[i+2]; 1318 s.applyPattern(pat); 1319 s.closeOver(selector); 1320 t.applyPattern(exp); 1321 if (s.equals(t)) { 1322 logln("Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1323 } else { 1324 errln("FAIL: " + pat + ".closeOver(" + selector + ") => " + 1325 s.toPattern(true) + ", expected " + exp); 1326 } 1327 } 1328 1329 // Test the pattern API 1330 s.applyPattern("[abc]", UnicodeSet.CASE); 1331 expectContainment(s, "abcABC", "defDEF"); 1332 s = new UnicodeSet("[^abc]", UnicodeSet.CASE); 1333 expectContainment(s, "defDEF", "abcABC"); 1334 } 1335 1336 public void TestEscapePattern() { 1337 // The following pattern must contain at least one range "c-d" 1338 // where c or d is a Pattern_White_Space. 1339 String pattern = 1340 "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1341 String exp = 1342 "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1343 // We test this with two passes; in the second pass we 1344 // pre-unescape the pattern. Since U+200E is Pattern_White_Space, 1345 // this fails -- which is what we expect. 1346 for (int pass=1; pass<=2; ++pass) { 1347 String pat = pattern; 1348 if (pass==2) { 1349 pat = Utility.unescape(pat); 1350 } 1351 // Pattern is only good for pass 1 1352 boolean isPatternValid = (pass==1); 1353 1354 UnicodeSet set = null; 1355 try { 1356 set = new UnicodeSet(pat); 1357 } catch (IllegalArgumentException e) { 1358 set = null; 1359 } 1360 if ((set != null) != isPatternValid){ 1361 errln("FAIL: applyPattern(" + 1362 Utility.escape(pat) + ") => " + set); 1363 continue; 1364 } 1365 if (set == null) { 1366 continue; 1367 } 1368 if (set.contains((char)0x0644)){ 1369 errln("FAIL: " + Utility.escape(pat) + " contains(U+0664)"); 1370 } 1371 1372 String newpat = set.toPattern(true); 1373 if (newpat.equals(exp)) { 1374 logln(Utility.escape(pat) + " => " + newpat); 1375 } else { 1376 errln("FAIL: " + Utility.escape(pat) + " => " + newpat); 1377 } 1378 1379 for (int i=0; i<set.getRangeCount(); ++i) { 1380 StringBuffer str = new StringBuffer("Range "); 1381 str.append((char)(0x30 + i)) 1382 .append(": "); 1383 UTF16.append(str, set.getRangeStart(i)); 1384 str.append(" - "); 1385 UTF16.append(str, set.getRangeEnd(i)); 1386 String s = Utility.escape(str.toString() + " (" + set.getRangeStart(i) + " - " + 1387 set.getRangeEnd(i) + ")"); 1388 if (set.getRangeStart(i) < 0) { 1389 errln("FAIL: " + s); 1390 } else { 1391 logln(s); 1392 } 1393 } 1394 } 1395 } 1396 1397 public void TestSymbolTable() { 1398 // Multiple test cases can be set up here. Each test case 1399 // is terminated by null: 1400 // var, value, var, value,..., input pat., exp. output pat., null 1401 String DATA[] = { 1402 "us", "a-z", "[0-1$us]", "[0-1a-z]", null, 1403 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", null, 1404 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", null 1405 }; 1406 1407 for (int i=0; i<DATA.length; ++i) { 1408 TokenSymbolTable sym = new TokenSymbolTable(); 1409 1410 // Set up variables 1411 while (DATA[i+2] != null) { 1412 sym.add(DATA[i], DATA[i+1]); 1413 i += 2; 1414 } 1415 1416 // Input pattern and expected output pattern 1417 String inpat = DATA[i], exppat = DATA[i+1]; 1418 i += 2; 1419 1420 ParsePosition pos = new ParsePosition(0); 1421 UnicodeSet us = new UnicodeSet(inpat, pos, sym); 1422 1423 // results 1424 if (pos.getIndex() != inpat.length()) { 1425 errln("Failed to read to end of string \"" 1426 + inpat + "\": read to " 1427 + pos.getIndex() + ", length is " 1428 + inpat.length()); 1429 } 1430 1431 UnicodeSet us2 = new UnicodeSet(exppat); 1432 if (!us.equals(us2)) { 1433 errln("Failed, got " + us + ", expected " + us2); 1434 } else { 1435 logln("Ok, got " + us); 1436 } 1437 1438 //cover Unicode(String,ParsePosition,SymbolTable,int) 1439 ParsePosition inpos = new ParsePosition(0); 1440 UnicodeSet inSet = new UnicodeSet(inpat, inpos, sym, UnicodeSet.IGNORE_SPACE); 1441 UnicodeSet expSet = new UnicodeSet(exppat); 1442 if (!inSet.equals(expSet)) { 1443 errln("FAIL: Failed, got " + inSet + ", expected " + expSet); 1444 } else { 1445 logln("OK: got " + inSet); 1446 } 1447 } 1448 } 1449 1450 /** 1451 * Test that Posix style character classes [:digit:], etc. 1452 * have the Unicode definitions from TR 18. 1453 */ 1454 public void TestPosixClasses() { 1455 expectEqual("POSIX alpha", "[:alpha:]", "\\p{Alphabetic}"); 1456 expectEqual("POSIX lower", "[:lower:]", "\\p{lowercase}"); 1457 expectEqual("POSIX upper", "[:upper:]", "\\p{Uppercase}"); 1458 expectEqual("POSIX punct", "[:punct:]", "\\p{gc=Punctuation}"); 1459 expectEqual("POSIX digit", "[:digit:]", "\\p{gc=DecimalNumber}"); 1460 expectEqual("POSIX xdigit", "[:xdigit:]", "[\\p{DecimalNumber}\\p{HexDigit}]"); 1461 expectEqual("POSIX alnum", "[:alnum:]", "[\\p{Alphabetic}\\p{DecimalNumber}]"); 1462 expectEqual("POSIX space", "[:space:]", "\\p{Whitespace}"); 1463 expectEqual("POSIX blank", "[:blank:]", "[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"); 1464 expectEqual("POSIX cntrl", "[:cntrl:]", "\\p{Control}"); 1465 expectEqual("POSIX graph", "[:graph:]", "[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"); 1466 expectEqual("POSIX print", "[:print:]", "[[:graph:][:blank:]-[\\p{Control}]]"); 1467 } 1468 1469 public void TestHangulSyllable() { 1470 final UnicodeSet lvt = new UnicodeSet("[:Hangul_Syllable_Type=LVT_Syllable:]"); 1471 assertNotEquals("LVT count", new UnicodeSet(), lvt); 1472 logln(lvt + ": " + lvt.size()); 1473 final UnicodeSet lv = new UnicodeSet("[:Hangul_Syllable_Type=LV_Syllable:]"); 1474 assertNotEquals("LV count", new UnicodeSet(), lv); 1475 logln(lv + ": " + lv.size()); 1476 } 1477 1478 /** 1479 * Test that frozen classes disallow changes. For 4217 1480 */ 1481 public void TestFrozen() { 1482 UnicodeSet test = new UnicodeSet("[[:whitespace:]A]"); 1483 test.freeze(); 1484 checkModification(test, true); 1485 checkModification(test, false); 1486 } 1487 1488 /** 1489 * Test Generic support 1490 */ 1491 public void TestGenerics() { 1492 UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); 1493 UnicodeSet set2 = new UnicodeSet("[e-f {ch}]").freeze(); 1494 UnicodeSet set3 = new UnicodeSet("[d m-n {dh}]").freeze(); 1495 // A useful range of sets for testing, including both characters and strings 1496 // set 1 contains set2 1497 // set 1 is overlaps with set 3 1498 // set 2 is disjoint with set 3 1499 1500 //public Iterator<String> iterator() { 1501 1502 ArrayList<String> oldList = new ArrayList<String>(); 1503 for (UnicodeSetIterator it = new UnicodeSetIterator(set1); it.next();) { 1504 oldList.add(it.getString()); 1505 } 1506 1507 ArrayList<String> list1 = new ArrayList<String>(); 1508 for (String s : set1) { 1509 list1.add(s); 1510 } 1511 assertEquals("iteration test", oldList, list1); 1512 1513 //addAllTo(Iterable<T>, U) 1514 list1.clear(); 1515 set1.addAllTo(list1); 1516 assertEquals("iteration test", oldList, list1); 1517 1518 list1 = set1.addAllTo(new ArrayList<String>()); 1519 assertEquals("addAllTo", oldList, list1); 1520 1521 ArrayList<String> list2 = set2.addAllTo(new ArrayList<String>()); 1522 ArrayList<String> list3 = set3.addAllTo(new ArrayList<String>()); 1523 1524 // put them into different order, to check that order doesn't matter 1525 TreeSet sorted1 = set1.addAllTo(new TreeSet<String>()); 1526 TreeSet sorted2 = set2.addAllTo(new TreeSet<String>()); 1527 TreeSet sorted3 = set3.addAllTo(new TreeSet<String>()); 1528 1529 //containsAll(Collection<String> collection) 1530 assertTrue("containsAll", set1.containsAll(list1)); 1531 assertTrue("containsAll", set1.containsAll(sorted1)); 1532 assertTrue("containsAll", set1.containsAll(list2)); 1533 assertTrue("containsAll", set1.containsAll(sorted2)); 1534 assertFalse("containsAll", set1.containsAll(list3)); 1535 assertFalse("containsAll", set1.containsAll(sorted3)); 1536 assertFalse("containsAll", set2.containsAll(list3)); 1537 assertFalse("containsAll", set2.containsAll(sorted3)); 1538 1539 //containsSome(Collection<String>) 1540 assertTrue("containsSome", set1.containsSome(list1)); 1541 assertTrue("containsSome", set1.containsSome(sorted1)); 1542 assertTrue("containsSome", set1.containsSome(list2)); 1543 assertTrue("containsSome", set1.containsSome(sorted2)); 1544 assertTrue("containsSome", set1.containsSome(list3)); 1545 assertTrue("containsSome", set1.containsSome(sorted3)); 1546 assertFalse("containsSome", set2.containsSome(list3)); 1547 assertFalse("containsSome", set2.containsSome(sorted3)); 1548 1549 //containsNone(Collection<String>) 1550 assertFalse("containsNone", set1.containsNone(list1)); 1551 assertFalse("containsNone", set1.containsNone(sorted1)); 1552 assertFalse("containsNone", set1.containsNone(list2)); 1553 assertFalse("containsNone", set1.containsNone(sorted2)); 1554 assertFalse("containsNone", set1.containsNone(list3)); 1555 assertFalse("containsNone", set1.containsNone(sorted3)); 1556 assertTrue("containsNone", set2.containsNone(list3)); 1557 assertTrue("containsNone", set2.containsNone(sorted3)); 1558 1559 //addAll(String...) 1560 UnicodeSet other3 = new UnicodeSet().addAll("d", "m", "n", "dh"); 1561 assertEquals("addAll", set3, other3); 1562 1563 //removeAll(Collection<String>) 1564 UnicodeSet mod1 = new UnicodeSet(set1).removeAll(set2); 1565 UnicodeSet mod2 = new UnicodeSet(set1).removeAll(list2); 1566 assertEquals("remove all", mod1, mod2); 1567 1568 //retainAll(Collection<String>) 1569 mod1 = new UnicodeSet(set1).retainAll(set2); 1570 mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet<String>())); 1571 assertEquals("remove all", mod1, mod2); 1572 } 1573 1574 public void TestComparison() { 1575 UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); 1576 UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze(); 1577 UnicodeSet set3 = new UnicodeSet("[d m-n z {dh}]").freeze(); 1578 1579 //compareTo(UnicodeSet) 1580 // do indirectly, by sorting 1581 List<UnicodeSet> unsorted = Arrays.asList(set3, set2, set1); 1582 List<UnicodeSet> goalShortest = Arrays.asList(set2, set3, set1); 1583 List<UnicodeSet> goalLongest = Arrays.asList(set1, set3, set2); 1584 List<UnicodeSet> goalLex = Arrays.asList(set1, set2, set3); 1585 1586 List<UnicodeSet> sorted = new ArrayList(new TreeSet<UnicodeSet>(unsorted)); 1587 assertNotEquals("compareTo-shorter-first", unsorted, sorted); 1588 assertEquals("compareTo-shorter-first", goalShortest, sorted); 1589 1590 TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){ 1591 public int compare(UnicodeSet o1, UnicodeSet o2) { 1592 // TODO Auto-generated method stub 1593 return o1.compareTo(o2, ComparisonStyle.LONGER_FIRST); 1594 }}); 1595 sorted1.addAll(unsorted); 1596 sorted = new ArrayList(sorted1); 1597 assertNotEquals("compareTo-longer-first", unsorted, sorted); 1598 assertEquals("compareTo-longer-first", goalLongest, sorted); 1599 1600 sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){ 1601 public int compare(UnicodeSet o1, UnicodeSet o2) { 1602 // TODO Auto-generated method stub 1603 return o1.compareTo(o2, ComparisonStyle.LEXICOGRAPHIC); 1604 }}); 1605 sorted1.addAll(unsorted); 1606 sorted = new ArrayList(sorted1); 1607 assertNotEquals("compareTo-lex", unsorted, sorted); 1608 assertEquals("compareTo-lex", goalLex, sorted); 1609 1610 //compare(String, int) 1611 // make a list of interesting combinations 1612 List<String> sources = Arrays.asList("\u0000", "a", "b", "\uD7FF", "\uD800", "\uDBFF", "\uDC00", "\uDFFF", "\uE000", "\uFFFD", "\uFFFF"); 1613 TreeSet<String> target = new TreeSet<String>(); 1614 for (String s : sources) { 1615 target.add(s); 1616 for (String t : sources) { 1617 target.add(s + t); 1618 for (String u : sources) { 1619 target.add(s + t + u); 1620 } 1621 } 1622 } 1623 // now compare all the combinations. If any of them is a code point, use it. 1624 int maxErrorCount = 0; 1625 compare: 1626 for (String last : target) { 1627 for (String curr : target) { 1628 int lastCount = Character.codePointCount(last, 0, last.length()); 1629 int currCount = Character.codePointCount(curr, 0, curr.length()); 1630 int comparison; 1631 if (lastCount == 1) { 1632 comparison = UnicodeSet.compare(last.codePointAt(0), curr); 1633 } else if (currCount == 1) { 1634 comparison = UnicodeSet.compare(last, curr.codePointAt(0)); 1635 } else { 1636 continue; 1637 } 1638 if (comparison != last.compareTo(curr)) { 1639 // repeat for debugging 1640 if (lastCount == 1) { 1641 comparison = UnicodeSet.compare(last.codePointAt(0), curr); 1642 } else if (currCount == 1) { 1643 comparison = UnicodeSet.compare(last, curr.codePointAt(0)); 1644 } 1645 if (maxErrorCount++ > 10) { 1646 errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others..."); 1647 break compare; 1648 } 1649 errln(maxErrorCount + " Failure in comparing " + last + " & " + curr); 1650 } 1651 } 1652 } 1653 1654 //compare(Iterable<T>, Iterable<T>) 1655 int max = 10; 1656 List<String> test1 = new ArrayList<String>(max); 1657 List<String> test2 = new ArrayList<String>(max); 1658 for (int i = 0; i <= max; ++i) { 1659 test1.add("a" + i); 1660 test2.add("a" + (max - i)); // add in reverse order 1661 } 1662 assertNotEquals("compare iterable test", test1, test2); 1663 TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1); 1664 TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2); 1665 assertEquals("compare iterable test", sortedTest1, sortedTest2); 1666 } 1667 1668 public void TestRangeConstructor() { 1669 UnicodeSet w = new UnicodeSet().addAll(3,5); 1670 UnicodeSet s = new UnicodeSet(3,5); 1671 assertEquals("new constructor", w, s); 1672 1673 w = new UnicodeSet().addAll(3,5).addAll(7,7); 1674 UnicodeSet t = new UnicodeSet(3,5, 7,7); 1675 assertEquals("new constructor", w, t); 1676 // check to make sure right exceptions are thrown 1677 Class expected = IllegalArgumentException.class; 1678 Class actual; 1679 1680 try { 1681 actual = null; 1682 @SuppressWarnings("unused") 1683 UnicodeSet u = new UnicodeSet(5); 1684 } catch (IllegalArgumentException e) { 1685 actual = e.getClass(); 1686 } 1687 assertEquals("exception if odd", expected, actual); 1688 1689 try { 1690 actual = null; 1691 @SuppressWarnings("unused") 1692 UnicodeSet u = new UnicodeSet(3, 2, 7, 9); 1693 } catch (IllegalArgumentException e) { 1694 actual = e.getClass(); 1695 } 1696 assertEquals("exception for start/end problem", expected, actual); 1697 1698 try { 1699 actual = null; 1700 @SuppressWarnings("unused") 1701 UnicodeSet u = new UnicodeSet(3, 5, 6, 9); 1702 } catch (IllegalArgumentException e) { 1703 actual = e.getClass(); 1704 } 1705 assertEquals("exception for end/start problem", expected, actual); 1706 1707 CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]")); 1708 CheckRangeSpeed(1000, new UnicodeSet("[:letter:]")); 1709 } 1710 1711 /** 1712 * @param iterations 1713 * @param testSet 1714 */ 1715 private void CheckRangeSpeed(int iterations, UnicodeSet testSet) { 1716 testSet.complement().complement(); 1717 String testPattern = testSet.toString(); 1718 // fill a set of pairs from the pattern 1719 int[] pairs = new int[testSet.getRangeCount()*2]; 1720 int j = 0; 1721 for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.nextRange();) { 1722 pairs[j++] = it.codepoint; 1723 pairs[j++] = it.codepointEnd; 1724 } 1725 UnicodeSet fromRange = new UnicodeSet(testSet); 1726 assertEquals("from range vs pattern", testSet, fromRange); 1727 1728 double start = System.currentTimeMillis(); 1729 for (int i = 0; i < iterations; ++i) { 1730 fromRange = new UnicodeSet(testSet); 1731 } 1732 double middle = System.currentTimeMillis(); 1733 for (int i = 0; i < iterations; ++i) { 1734 new UnicodeSet(testPattern); 1735 } 1736 double end = System.currentTimeMillis(); 1737 1738 double rangeConstructorTime = (middle - start)/iterations; 1739 double patternConstructorTime = (end - middle)/iterations; 1740 String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t" 1741 + percent.format(rangeConstructorTime/patternConstructorTime-1); 1742 if (rangeConstructorTime < 2*patternConstructorTime) { 1743 logln(message); 1744 } else { 1745 errln(message); 1746 } 1747 } 1748 1749 NumberFormat percent = NumberFormat.getPercentInstance(); 1750 { 1751 percent.setMaximumFractionDigits(2); 1752 } 1753 // **************************************** 1754 // UTILITIES 1755 // **************************************** 1756 1757 public void checkModification(UnicodeSet original, boolean isFrozen) { 1758 main: 1759 for (int i = 0; ;++i) { 1760 UnicodeSet test = (UnicodeSet) (isFrozen ? original.clone() : original.cloneAsThawed()); 1761 boolean gotException = true; 1762 boolean checkEquals = true; 1763 try { 1764 switch(i) { 1765 case 0: test.add(0); break; 1766 case 1: test.add(0,1); break; 1767 case 2: test.add("a"); break; 1768 case 3: List a = new ArrayList(); a.add("a"); test.addAll(a); break; 1769 case 4: test.addAll("ab"); break; 1770 case 5: test.addAll(new UnicodeSet("[ab]")); break; 1771 case 6: test.applyIntPropertyValue(0,0); break; 1772 case 7: test.applyPattern("[ab]"); break; 1773 case 8: test.applyPattern("[ab]", true); break; 1774 case 9: test.applyPattern("[ab]", 0); break; 1775 case 10: test.applyPropertyAlias("hex","true"); break; 1776 case 11: test.applyPropertyAlias("hex", "true", null); break; 1777 case 12: test.closeOver(UnicodeSet.CASE); break; 1778 case 13: test.compact(); checkEquals = false; break; 1779 case 14: test.complement(0); break; 1780 case 15: test.complement(0,0); break; 1781 case 16: test.complement("ab"); break; 1782 case 17: test.complementAll("ab"); break; 1783 case 18: test.complementAll(new UnicodeSet("[ab]")); break; 1784 case 19: test.remove(' '); break; 1785 case 20: test.remove(' ','a'); break; 1786 case 21: test.remove(" "); break; 1787 case 22: test.removeAll(" a"); break; 1788 case 23: test.removeAll(new UnicodeSet("[\\ a]")); break; 1789 case 24: test.retain(' '); break; 1790 case 25: test.retain(' ','a'); break; 1791 case 26: test.retain(" "); break; 1792 case 27: test.retainAll(" a"); break; 1793 case 28: test.retainAll(new UnicodeSet("[\\ a]")); break; 1794 case 29: test.set(0,1); break; 1795 case 30: test.set(new UnicodeSet("[ab]")); break; 1796 1797 default: continue main; // so we don't keep having to change the endpoint, and gaps are not skipped. 1798 case 35: return; 1799 } 1800 gotException = false; 1801 } catch (UnsupportedOperationException e) { 1802 // do nothing 1803 } 1804 if (isFrozen && !gotException) errln(i + ") attempt to modify frozen object didn't result in an exception"); 1805 if (!isFrozen && gotException) errln(i + ") attempt to modify thawed object did result in an exception"); 1806 if (checkEquals) { 1807 if (test.equals(original)) { 1808 if (!isFrozen) errln(i + ") attempt to modify thawed object didn't change the object"); 1809 } else { // unequal 1810 if (isFrozen) errln(i + ") attempt to modify frozen object changed the object"); 1811 } 1812 } 1813 } 1814 } 1815 1816 // Following cod block is commented out to eliminate PrettyPrinter depenencies 1817 1818 // String[] prettyData = { 1819 // "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case 1820 // "[:any:]", 1821 // "[:whitespace:]", 1822 // "[:linebreak=AL:]", 1823 // }; 1824 // 1825 // public void TestPrettyPrinting() { 1826 // try{ 1827 // PrettyPrinter pp = new PrettyPrinter(); 1828 // 1829 // int i = 0; 1830 // for (; i < prettyData.length; ++i) { 1831 // UnicodeSet test = new UnicodeSet(prettyData[i]); 1832 // checkPrettySet(pp, i, test); 1833 // } 1834 // Random random = new Random(0); 1835 // UnicodeSet test = new UnicodeSet(); 1836 // 1837 // // To keep runtimes under control, make the number of random test cases 1838 // // to try depends on the test framework exhaustive setting. 1839 // // params.inclusions = 5: default exhaustive value 1840 // // params.inclusions = 10: max exhaustive value. 1841 // int iterations = 50; 1842 // if (params.inclusion > 5) { 1843 // iterations = (params.inclusion-5) * 200; 1844 // } 1845 // for (; i < iterations; ++i) { 1846 // double start = random.nextGaussian() * 0x10000; 1847 // if (start < 0) start = - start; 1848 // if (start > 0x10FFFF) { 1849 // start = 0x10FFFF; 1850 // } 1851 // double end = random.nextGaussian() * 0x100; 1852 // if (end < 0) end = -end; 1853 // end = start + end; 1854 // if (end > 0x10FFFF) { 1855 // end = 0x10FFFF; 1856 // } 1857 // test.complement((int)start, (int)end); 1858 // checkPrettySet(pp, i, test); 1859 // } 1860 // }catch(RuntimeException ex){ 1861 // warnln("Could not load Collator"); 1862 // } 1863 // } 1864 // 1865 // private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { 1866 // String pretty = pp.toPattern(test); 1867 // UnicodeSet retry = new UnicodeSet(pretty); 1868 // if (!test.equals(retry)) { 1869 // errln(i + ". Failed test: " + test + " != " + pretty); 1870 // } else { 1871 // logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); 1872 // } 1873 // } 1874 // 1875 // private String truncate(String string) { 1876 // if (string.length() <= 100) return string; 1877 // return string.substring(0,97) + "..."; 1878 // } 1879 1880 public class TokenSymbolTable implements SymbolTable { 1881 HashMap contents = new HashMap(); 1882 1883 /** 1884 * (Non-SymbolTable API) Add the given variable and value to 1885 * the table. Variable should NOT contain leading '$'. 1886 */ 1887 public void add(String var, String value) { 1888 char[] buffer = new char[value.length()]; 1889 value.getChars(0, value.length(), buffer, 0); 1890 add(var, buffer); 1891 } 1892 1893 /** 1894 * (Non-SymbolTable API) Add the given variable and value to 1895 * the table. Variable should NOT contain leading '$'. 1896 */ 1897 public void add(String var, char[] body) { 1898 logln("TokenSymbolTable: add \"" + var + "\" => \"" + 1899 new String(body) + "\""); 1900 contents.put(var, body); 1901 } 1902 1903 /* (non-Javadoc) 1904 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) 1905 */ 1906 public char[] lookup(String s) { 1907 logln("TokenSymbolTable: lookup \"" + s + "\" => \"" + 1908 new String((char[]) contents.get(s)) + "\""); 1909 return (char[])contents.get(s); 1910 } 1911 1912 /* (non-Javadoc) 1913 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) 1914 */ 1915 public UnicodeMatcher lookupMatcher(int ch) { 1916 return null; 1917 } 1918 1919 /* (non-Javadoc) 1920 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, 1921 java.text.ParsePosition, int) 1922 */ 1923 public String parseReference(String text, ParsePosition pos, int 1924 limit) { 1925 int cp; 1926 int start = pos.getIndex(); 1927 int i; 1928 for (i = start; i < limit; i += UTF16.getCharCount(cp)) { 1929 cp = UTF16.charAt(text, i); 1930 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { 1931 break; 1932 } 1933 } 1934 logln("TokenSymbolTable: parse \"" + text + "\" from " + 1935 start + " to " + i + 1936 " => \"" + text.substring(start,i) + "\""); 1937 pos.setIndex(i); 1938 return text.substring(start,i); 1939 } 1940 } 1941 1942 public void TestSurrogate() { 1943 String DATA[] = { 1944 // These should all behave identically 1945 "[abc\\uD800\\uDC00]", 1946 "[abc\uD800\uDC00]", 1947 "[abc\\U00010000]", 1948 }; 1949 for (int i=0; i<DATA.length; ++i) { 1950 logln("Test pattern " + i + " :" + Utility.escape(DATA[i])); 1951 UnicodeSet set = new UnicodeSet(DATA[i]); 1952 expectContainment(set, 1953 CharsToUnicodeString("abc\\U00010000"), 1954 "\uD800;\uDC00"); // split apart surrogate-pair 1955 if (set.size() != 4) { 1956 errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " + 1957 set.size() + ", expected 4")); 1958 } 1959 } 1960 } 1961 1962 public void TestContains() { 1963 int limit = 256; // combinations to test 1964 for (int i = 0; i < limit; ++i) { 1965 logln("Trying: " + i); 1966 UnicodeSet x = bitsToSet(i); 1967 for (int j = 0; j < limit; ++j) { 1968 UnicodeSet y = bitsToSet(j); 1969 boolean containsNone = (i & j) == 0; 1970 boolean containsAll = (i & j) == j; 1971 boolean equals = i == j; 1972 if (containsNone != x.containsNone(y)) { 1973 x.containsNone(y); // repeat for debugging 1974 errln("FAILED: " + x + " containsSome " + y); 1975 } 1976 if (containsAll != x.containsAll(y)) { 1977 x.containsAll(y); // repeat for debugging 1978 errln("FAILED: " + x + " containsAll " + y); 1979 } 1980 if (equals != x.equals(y)) { 1981 x.equals(y); // repeat for debugging 1982 errln("FAILED: " + x + " equals " + y); 1983 } 1984 } 1985 } 1986 } 1987 1988 void _testComplement(int a) { 1989 UnicodeSet x = bitsToSet(a); 1990 UnicodeSet z = bitsToSet(a); 1991 z.complement(); 1992 int c = setToBits(z); 1993 if (c != (~a)) { 1994 errln("FAILED: add: ~" + x + " != " + z); 1995 errln("FAILED: add: ~" + a + " != " + c); 1996 } 1997 checkCanonicalRep(z, "complement " + a); 1998 } 1999 2000 void _testAdd(int a, int b) { 2001 UnicodeSet x = bitsToSet(a); 2002 UnicodeSet y = bitsToSet(b); 2003 UnicodeSet z = bitsToSet(a); 2004 z.addAll(y); 2005 int c = setToBits(z); 2006 if (c != (a | b)) { 2007 errln(Utility.escape("FAILED: add: " + x + " | " + y + " != " + z)); 2008 errln("FAILED: add: " + a + " | " + b + " != " + c); 2009 } 2010 checkCanonicalRep(z, "add " + a + "," + b); 2011 } 2012 2013 void _testRetain(int a, int b) { 2014 UnicodeSet x = bitsToSet(a); 2015 UnicodeSet y = bitsToSet(b); 2016 UnicodeSet z = bitsToSet(a); 2017 z.retainAll(y); 2018 int c = setToBits(z); 2019 if (c != (a & b)) { 2020 errln("FAILED: retain: " + x + " & " + y + " != " + z); 2021 errln("FAILED: retain: " + a + " & " + b + " != " + c); 2022 } 2023 checkCanonicalRep(z, "retain " + a + "," + b); 2024 } 2025 2026 void _testRemove(int a, int b) { 2027 UnicodeSet x = bitsToSet(a); 2028 UnicodeSet y = bitsToSet(b); 2029 UnicodeSet z = bitsToSet(a); 2030 z.removeAll(y); 2031 int c = setToBits(z); 2032 if (c != (a &~ b)) { 2033 errln("FAILED: remove: " + x + " &~ " + y + " != " + z); 2034 errln("FAILED: remove: " + a + " &~ " + b + " != " + c); 2035 } 2036 checkCanonicalRep(z, "remove " + a + "," + b); 2037 } 2038 2039 void _testXor(int a, int b) { 2040 UnicodeSet x = bitsToSet(a); 2041 UnicodeSet y = bitsToSet(b); 2042 UnicodeSet z = bitsToSet(a); 2043 z.complementAll(y); 2044 int c = setToBits(z); 2045 if (c != (a ^ b)) { 2046 errln("FAILED: complement: " + x + " ^ " + y + " != " + z); 2047 errln("FAILED: complement: " + a + " ^ " + b + " != " + c); 2048 } 2049 checkCanonicalRep(z, "complement " + a + "," + b); 2050 } 2051 2052 /** 2053 * Check that ranges are monotonically increasing and non- 2054 * overlapping. 2055 */ 2056 void checkCanonicalRep(UnicodeSet set, String msg) { 2057 int n = set.getRangeCount(); 2058 if (n < 0) { 2059 errln("FAIL result of " + msg + 2060 ": range count should be >= 0 but is " + 2061 n + " for " + Utility.escape(set.toString())); 2062 return; 2063 } 2064 int last = 0; 2065 for (int i=0; i<n; ++i) { 2066 int start = set.getRangeStart(i); 2067 int end = set.getRangeEnd(i); 2068 if (start > end) { 2069 errln("FAIL result of " + msg + 2070 ": range " + (i+1) + 2071 " start > end: " + start + ", " + end + 2072 " for " + Utility.escape(set.toString())); 2073 } 2074 if (i > 0 && start <= last) { 2075 errln("FAIL result of " + msg + 2076 ": range " + (i+1) + 2077 " overlaps previous range: " + start + ", " + end + 2078 " for " + Utility.escape(set.toString())); 2079 } 2080 last = end; 2081 } 2082 } 2083 2084 /** 2085 * Convert a bitmask to a UnicodeSet. 2086 */ 2087 UnicodeSet bitsToSet(int a) { 2088 UnicodeSet result = new UnicodeSet(); 2089 for (int i = 0; i < 32; ++i) { 2090 if ((a & (1<<i)) != 0) { 2091 result.add((char)i,(char)i); 2092 } 2093 } 2094 2095 return result; 2096 } 2097 2098 /** 2099 * Convert a UnicodeSet to a bitmask. Only the characters 2100 * U+0000 to U+0020 are represented in the bitmask. 2101 */ 2102 static int setToBits(UnicodeSet x) { 2103 int result = 0; 2104 for (int i = 0; i < 32; ++i) { 2105 if (x.contains((char)i)) { 2106 result |= (1<<i); 2107 } 2108 } 2109 return result; 2110 } 2111 2112 /** 2113 * Return the representation of an inversion list based UnicodeSet 2114 * as a pairs list. Ranges are listed in ascending Unicode order. 2115 * For example, the set [a-zA-M3] is represented as "33AMaz". 2116 */ 2117 static String getPairs(UnicodeSet set) { 2118 StringBuffer pairs = new StringBuffer(); 2119 for (int i=0; i<set.getRangeCount(); ++i) { 2120 int start = set.getRangeStart(i); 2121 int end = set.getRangeEnd(i); 2122 if (end > 0xFFFF) { 2123 end = 0xFFFF; 2124 i = set.getRangeCount(); // Should be unnecessary 2125 } 2126 pairs.append((char)start).append((char)end); 2127 } 2128 return pairs.toString(); 2129 } 2130 2131 /** 2132 * Test function. Make sure that the sets have the right relation 2133 */ 2134 2135 void expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message) { 2136 int relation = ((Integer) relationObj).intValue(); 2137 UnicodeSet set1 = (UnicodeSet) set1Obj; 2138 UnicodeSet set2 = (UnicodeSet) set2Obj; 2139 2140 // by-the-by, check the iterator 2141 checkRoundTrip(set1); 2142 checkRoundTrip(set2); 2143 2144 boolean contains = set1.containsAll(set2); 2145 boolean isContained = set2.containsAll(set1); 2146 boolean disjoint = set1.containsNone(set2); 2147 boolean equals = set1.equals(set2); 2148 2149 UnicodeSet intersection = new UnicodeSet(set1).retainAll(set2); 2150 UnicodeSet minus12 = new UnicodeSet(set1).removeAll(set2); 2151 UnicodeSet minus21 = new UnicodeSet(set2).removeAll(set1); 2152 2153 // test basic properties 2154 2155 if (contains != (intersection.size() == set2.size())) { 2156 errln("FAIL contains1" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2157 } 2158 2159 if (contains != (intersection.equals(set2))) { 2160 errln("FAIL contains2" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2161 } 2162 2163 if (isContained != (intersection.size() == set1.size())) { 2164 errln("FAIL isContained1" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2165 } 2166 2167 if (isContained != (intersection.equals(set1))) { 2168 errln("FAIL isContained2" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2169 } 2170 2171 if ((contains && isContained) != equals) { 2172 errln("FAIL equals" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2173 } 2174 2175 if (disjoint != (intersection.size() == 0)) { 2176 errln("FAIL disjoint" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2177 } 2178 2179 // Now see if the expected relation is true 2180 int status = (minus12.size() != 0 ? 4 : 0) 2181 | (intersection.size() != 0 ? 2 : 0) 2182 | (minus21.size() != 0 ? 1 : 0); 2183 2184 if (status != relation) { 2185 errln("FAIL relation incorrect" + message 2186 + "; desired = " + RELATION_NAME[relation] 2187 + "; found = " + RELATION_NAME[status] 2188 + "; set1 = " + set1.toPattern(true) 2189 + "; set2 = " + set2.toPattern(true) 2190 ); 2191 } 2192 } 2193 2194 /** 2195 * Basic consistency check for a few items. 2196 * That the iterator works, and that we can create a pattern and 2197 * get the same thing back 2198 */ 2199 2200 void checkRoundTrip(UnicodeSet s) { 2201 String pat = s.toPattern(false); 2202 UnicodeSet t = copyWithIterator(s, false); 2203 checkEqual(s, t, "iterator roundtrip"); 2204 2205 t = copyWithIterator(s, true); // try range 2206 checkEqual(s, t, "iterator roundtrip"); 2207 2208 t = new UnicodeSet(pat); 2209 checkEqual(s, t, "toPattern(false)"); 2210 2211 pat = s.toPattern(true); 2212 t = new UnicodeSet(pat); 2213 checkEqual(s, t, "toPattern(true)"); 2214 } 2215 2216 UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) { 2217 UnicodeSet t = new UnicodeSet(); 2218 UnicodeSetIterator it = new UnicodeSetIterator(s); 2219 if (withRange) { 2220 while (it.nextRange()) { 2221 if (it.codepoint == UnicodeSetIterator.IS_STRING) { 2222 t.add(it.string); 2223 } else { 2224 t.add(it.codepoint, it.codepointEnd); 2225 } 2226 } 2227 } else { 2228 while (it.next()) { 2229 if (it.codepoint == UnicodeSetIterator.IS_STRING) { 2230 t.add(it.string); 2231 } else { 2232 t.add(it.codepoint); 2233 } 2234 } 2235 } 2236 return t; 2237 } 2238 2239 boolean checkEqual(UnicodeSet s, UnicodeSet t, String message) { 2240 if (!s.equals(t)) { 2241 errln("FAIL " + message 2242 + "; source = " + s.toPattern(true) 2243 + "; result = " + t.toPattern(true) 2244 ); 2245 return false; 2246 } 2247 return true; 2248 } 2249 2250 void expectEqual(String name, String pat1, String pat2) { 2251 UnicodeSet set1, set2; 2252 try { 2253 set1 = new UnicodeSet(pat1); 2254 set2 = new UnicodeSet(pat2); 2255 } catch (IllegalArgumentException e) { 2256 errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage()); 2257 return; 2258 } 2259 if(!set1.equals(set2)) { 2260 errln("FAIL: Sets built from patterns differ for \"" + name + "\""); 2261 } 2262 } 2263 2264 /** 2265 * Expect the given set to contain the characters in charsIn and 2266 * to not contain those in charsOut. 2267 */ 2268 void expectContainment(String pat, String charsIn, String charsOut) { 2269 UnicodeSet set; 2270 try { 2271 set = new UnicodeSet(pat); 2272 } catch (IllegalArgumentException e) { 2273 errln("FAIL: Couldn't create UnicodeSet from pattern \"" + 2274 pat + "\": " + e.getMessage()); 2275 return; 2276 } 2277 expectContainment(set, charsIn, charsOut); 2278 } 2279 2280 /** 2281 * Expect the given set to contain the characters in charsIn and 2282 * to not contain those in charsOut. 2283 */ 2284 void expectContainment(UnicodeSet set, String charsIn, String charsOut) { 2285 StringBuffer bad = new StringBuffer(); 2286 if (charsIn != null) { 2287 charsIn = Utility.unescape(charsIn); 2288 for (int i=0; i<charsIn.length(); ) { 2289 int c = UTF16.charAt(charsIn,i); 2290 i += UTF16.getCharCount(c); 2291 if (!set.contains(c)) { 2292 UTF16.append(bad,c); 2293 } 2294 } 2295 if (bad.length() > 0) { 2296 errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + 2297 ", expected containment of " + charsIn)); 2298 } else { 2299 logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); 2300 } 2301 } 2302 if (charsOut != null) { 2303 charsOut = Utility.unescape(charsOut); 2304 bad.setLength(0); 2305 for (int i=0; i<charsOut.length(); ) { 2306 int c = UTF16.charAt(charsOut,i); 2307 i += UTF16.getCharCount(c); 2308 if (set.contains(c)) { 2309 UTF16.append(bad, c); 2310 } 2311 } 2312 if (bad.length() > 0) { 2313 errln(Utility.escape("FAIL: set " + set + " contains " + bad + 2314 ", expected non-containment of " + charsOut)); 2315 } else { 2316 logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); 2317 } 2318 } 2319 } 2320 2321 void expectPattern(UnicodeSet set, 2322 String pattern, 2323 String expectedPairs) { 2324 set.applyPattern(pattern); 2325 if (!getPairs(set).equals(expectedPairs)) { 2326 errln("FAIL: applyPattern(\"" + pattern + 2327 "\") => pairs \"" + 2328 Utility.escape(getPairs(set)) + "\", expected \"" + 2329 Utility.escape(expectedPairs) + "\""); 2330 } else { 2331 logln("Ok: applyPattern(\"" + pattern + 2332 "\") => pairs \"" + 2333 Utility.escape(getPairs(set)) + "\""); 2334 } 2335 } 2336 2337 void expectToPattern(UnicodeSet set, 2338 String expPat, 2339 String[] expStrings) { 2340 String pat = set.toPattern(true); 2341 if (pat.equals(expPat)) { 2342 logln("Ok: toPattern() => \"" + pat + "\""); 2343 } else { 2344 errln("FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2345 return; 2346 } 2347 if (expStrings == null) { 2348 return; 2349 } 2350 boolean in = true; 2351 for (int i=0; i<expStrings.length; ++i) { 2352 if (expStrings[i] == NOT) { // sic; pointer comparison 2353 in = false; 2354 continue; 2355 } 2356 boolean contained = set.contains(expStrings[i]); 2357 if (contained == in) { 2358 logln("Ok: " + expPat + 2359 (contained ? " contains {" : " does not contain {") + 2360 Utility.escape(expStrings[i]) + "}"); 2361 } else { 2362 errln("FAIL: " + expPat + 2363 (contained ? " contains {" : " does not contain {") + 2364 Utility.escape(expStrings[i]) + "}"); 2365 } 2366 } 2367 } 2368 2369 void expectPairs(UnicodeSet set, String expectedPairs) { 2370 if (!getPairs(set).equals(expectedPairs)) { 2371 errln("FAIL: Expected pair list \"" + 2372 Utility.escape(expectedPairs) + "\", got \"" + 2373 Utility.escape(getPairs(set)) + "\""); 2374 } 2375 } 2376 static final String CharsToUnicodeString(String s) { 2377 return Utility.unescape(s); 2378 } 2379 2380 /* Test the method public UnicodeSet getSet() */ 2381 public void TestGetSet() { 2382 UnicodeSetIterator us = new UnicodeSetIterator(); 2383 try { 2384 us.getSet(); 2385 } catch (Exception e) { 2386 errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception."); 2387 } 2388 } 2389 2390 /* Tests the method public UnicodeSet add(Collection<?> source) */ 2391 public void TestAddCollection() { 2392 UnicodeSet us = new UnicodeSet(); 2393 Collection<?> s = null; 2394 try { 2395 us.add(s); 2396 errln("UnicodeSet.add(Collection<?>) was suppose to return an exception for a null parameter."); 2397 } catch (Exception e) { 2398 } 2399 } 2400 2401 public void TestConstants() { 2402 assertEquals("Empty", new UnicodeSet(), UnicodeSet.EMPTY); 2403 assertEquals("All", new UnicodeSet(0,0x10FFFF), UnicodeSet.ALL_CODE_POINTS); 2404 } 2405 2406 public void TestIteration() { 2407 UnicodeSet us1 = new UnicodeSet("[abcM{xy}]"); 2408 assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", ")); 2409 2410 // Sample code 2411 for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) { 2412 // do something with code points between range.codepointEnd and range.codepointEnd; 2413 } 2414 for (@SuppressWarnings("unused") String s : us1.strings()) { 2415 // do something with each string; 2416 } 2417 2418 String[] tests = { 2419 "[M-Qzab{XY}{ZW}]", 2420 "[]", 2421 "[a]", 2422 "[a-c]", 2423 "[{XY}]", 2424 }; 2425 for (String test : tests) { 2426 UnicodeSet us = new UnicodeSet(test); 2427 UnicodeSetIterator it = new UnicodeSetIterator(us); 2428 for (EntryRange range : us.ranges()) { 2429 final String title = range.toString(); 2430 logln(title); 2431 it.nextRange(); 2432 assertEquals(title, it.codepoint, range.codepoint); 2433 assertEquals(title, it.codepointEnd, range.codepointEnd); 2434 } 2435 for (String s : us.strings()) { 2436 it.nextRange(); 2437 assertEquals("strings", it.string, s); 2438 } 2439 assertFalse("", it.next()); 2440 } 2441 } 2442 2443 public void TestReplaceAndDelete() { 2444 UnicodeSetSpanner m; 2445 2446 m = new UnicodeSetSpanner(new UnicodeSet("[._]")); 2447 assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._")); 2448 assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED)); 2449 2450 assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._")); 2451 assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING)); 2452 assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING)); 2453 2454 assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", CountMethod.WHOLE_SPAN)); 2455 assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", CountMethod.WHOLE_SPAN)); 2456 assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY")); 2457 assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", CountMethod.WHOLE_SPAN)); 2458 2459 m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}")); 2460 assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); 2461 2462 m = new UnicodeSetSpanner(m.getUnicodeSet().addAll(new UnicodeSet("\\p{lowercase}"))); 2463 assertEquals("", "TheQuickBrownFox", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); 2464 2465 m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]")); 2466 assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X")); 2467 assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN)); 2468 assertEquals("", "ababX", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN, SpanCondition.NOT_CONTAINED)); 2469 } 2470 2471 public void TestCodePoints() { 2472 // test supplemental code points and strings clusters 2473 checkCodePoints("x\u0308", "z\u0308", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2474 checkCodePoints("", "", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2475 checkCodePoints("", "", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2476 } 2477 2478 private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition, 2479 String expectedReplaced, int expectedCount) { 2480 final String ab = a+b; 2481 UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]")); 2482 assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")", 2483 expectedCount, 2484 callCountIn(m, ab, quantifier, spanCondition) 2485 ); 2486 2487 if (expectedReplaced == null) { 2488 expectedReplaced = "-" + b; 2489 } 2490 assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")", 2491 expectedReplaced, m.replaceFrom(ab, "-", quantifier)); 2492 } 2493 2494 public void TestCountIn() { 2495 UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[ab]")); 2496 checkCountIn(m, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, "abc", 2); 2497 checkCountIn(m, CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE, "abc", 1); 2498 checkCountIn(m, CountMethod.MIN_ELEMENTS, SpanCondition.NOT_CONTAINED, "acccb", 3); 2499 } 2500 2501 public void checkCountIn(UnicodeSetSpanner m, CountMethod countMethod, SpanCondition spanCondition, String target, int expected) { 2502 final String message = "countIn " + countMethod + ", " + spanCondition; 2503 assertEquals(message, callCountIn(m, target, countMethod, spanCondition), expected); 2504 } 2505 2506 public int callCountIn(UnicodeSetSpanner m, final String ab, CountMethod countMethod, SpanCondition spanCondition) { 2507 return spanCondition != SpanCondition.SIMPLE ? m.countIn(ab, countMethod, spanCondition) 2508 : countMethod != CountMethod.MIN_ELEMENTS ? m.countIn(ab, countMethod) 2509 : m.countIn(ab); 2510 } 2511 2512 public void testForSpanGaps() { 2513 String[] items = {"a", "b", "c", "{ab}", "{bc}", "{cd}", "{abc}", "{bcd}"}; 2514 final int limit = 1<<items.length; 2515 // build long string for testing 2516 StringBuilder longBuffer = new StringBuilder(); 2517 for (int i = 1; i < limit; ++i) { 2518 longBuffer.append("x"); 2519 longBuffer.append(getCombinations(items, i)); 2520 } 2521 String longString = longBuffer.toString(); 2522 longString = longString.replace("{","").replace("}",""); 2523 2524 long start = System.nanoTime(); 2525 for (int i = 1; i < limit; ++i) { 2526 UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]"); 2527 int problemFound = checkSpan(longString, us, SpanCondition.SIMPLE); 2528 if (problemFound >= 0) { 2529 assertEquals("Testing " + longString + ", found gap at", -1, problemFound); 2530 break; 2531 } 2532 } 2533 long end = System.nanoTime(); 2534 logln("Time for SIMPLE :\t" + (end-start)); 2535 start = System.nanoTime(); 2536 for (int i = 1; i < limit; ++i) { 2537 UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]"); 2538 int problemFound = checkSpan(longString, us, SpanCondition.CONTAINED); 2539 if (problemFound >= 0) { 2540 assertEquals("Testing " + longString + ", found gap at", -1, problemFound); 2541 break; 2542 } 2543 } 2544 end = System.nanoTime(); 2545 logln("Time for CONTAINED:\t" + (end-start)); 2546 } 2547 2548 /** 2549 * Check that there are no gaps, when we alternate spanning. That is, there 2550 * should only be a zero length span at the very start. 2551 * @param longString 2552 * @param us 2553 * @param simple 2554 */ 2555 private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) { 2556 int start = 0; 2557 while (start < longString.length()) { 2558 int limit = us.span(longString, start, spanCondition); 2559 if (limit == longString.length()) { 2560 break; 2561 } else if (limit == start && start != 0) { 2562 return start; 2563 } 2564 start = limit; 2565 limit = us.span(longString, start, SpanCondition.NOT_CONTAINED); 2566 if (limit == start) { 2567 return start; 2568 } 2569 start = limit; 2570 } 2571 return -1; // all ok 2572 } 2573 2574 private String getCombinations(String[] items, int bitset) { 2575 StringBuilder result = new StringBuilder(); 2576 for (int i = 0; bitset != 0; ++i) { 2577 int other = bitset & (1 << i); 2578 if (other != 0) { 2579 bitset ^= other; 2580 result.append(items[i]); 2581 } 2582 } 2583 return result.toString(); 2584 } 2585 2586 public void TestCharSequenceArgs() { 2587 // statics 2588 assertEquals("CharSequence from", new UnicodeSet("[{abc}]"), UnicodeSet.from(new StringBuilder("abc"))); 2589 assertEquals("CharSequence fromAll", new UnicodeSet("[a-c]"), UnicodeSet.fromAll(new StringBuilder("abc"))); 2590 assertEquals("CharSequence compare", 1.0f, Math.signum(UnicodeSet.compare(new StringBuilder("abc"), 0x61))); 2591 assertEquals("CharSequence compare", -1.0f, Math.signum(UnicodeSet.compare(0x61, new StringBuilder("abc")))); 2592 assertEquals("CharSequence compare", 0.0f, Math.signum(UnicodeSet.compare(new StringBuilder("a"), 0x61))); 2593 assertEquals("CharSequence compare", 0.0f, Math.signum(UnicodeSet.compare(0x61, new StringBuilder("a")))); 2594 assertEquals("CharSequence getSingleCodePoint", 0x1F466, UnicodeSet.getSingleCodePoint(new StringBuilder(""))); 2595 2596 // iterables/arrays 2597 Iterable<StringBuilder> iterable = Arrays.asList(new StringBuilder("A"), new StringBuilder("B")); 2598 assertEquals("CharSequence containsAll", true, new UnicodeSet("[AB]").containsAll(iterable)); 2599 assertEquals("CharSequence containsAll", false, new UnicodeSet("[a-cA]").containsAll(iterable)); 2600 assertEquals("CharSequence containsNone", true, new UnicodeSet("[a-c]").containsNone(iterable) ); 2601 assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]").containsNone(iterable) ); 2602 assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA]").containsSome(iterable) ); 2603 assertEquals("CharSequence containsSome", false, new UnicodeSet("[a-c]").containsSome(iterable) ); 2604 assertEquals("CharSequence addAll", new UnicodeSet("[a-cAB]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("A"), new StringBuilder("B")) ); 2605 assertEquals("CharSequence removeAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").removeAll( iterable) ); 2606 assertEquals("CharSequence retainAll", new UnicodeSet("[A]"), new UnicodeSet("[a-cA]").retainAll( iterable) ); 2607 2608 // UnicodeSet results 2609 assertEquals("CharSequence add", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").add(new StringBuilder("abc")) ); 2610 assertEquals("CharSequence retain", new UnicodeSet("[{abc}]"), new UnicodeSet("[a-cA{abc}{qr}]").retain(new StringBuilder("abc")) ); 2611 assertEquals("CharSequence remove", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").remove(new StringBuilder("abc")) ); 2612 assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").complement(new StringBuilder("abc")) ); 2613 assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").complement(new StringBuilder("abc")) ); 2614 2615 assertEquals("CharSequence addAll", new UnicodeSet("[a-cABC]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("ABC")) ); 2616 assertEquals("CharSequence retainAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").retainAll(new StringBuilder("abcB")) ); 2617 assertEquals("CharSequence removeAll", new UnicodeSet("[Aab]"), new UnicodeSet("[a-cA]").removeAll(new StringBuilder("cC")) ); 2618 assertEquals("CharSequence complementAll", new UnicodeSet("[ABbc]"), new UnicodeSet("[a-cA]").complementAll(new StringBuilder("aB")) ); 2619 2620 // containment 2621 assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) ); 2622 assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) ); 2623 assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) ); 2624 2625 // spanning 2626 assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), SpanCondition.SIMPLE) ); 2627 assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) ); 2628 assertEquals("CharSequence spanBack", 0, new UnicodeSet("[a-cA]"). spanBack(new StringBuilder("abc"), SpanCondition.SIMPLE) ); 2629 assertEquals("CharSequence spanBack", 0, new UnicodeSet("[a-cA]"). spanBack(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) ); 2630 2631 // internal 2632 OutputInt outCount = new OutputInt(); 2633 assertEquals("CharSequence matchesAt", 2, new UnicodeSet("[a-cA]"). matchesAt(new StringBuilder("abc"), 1) ); 2634 assertEquals("CharSequence spanAndCount", 3, new UnicodeSet("[a-cA]"). spanAndCount(new StringBuilder("abc"), 1, SpanCondition.SIMPLE, outCount ) ); 2635 assertEquals("CharSequence findIn", 3, new UnicodeSet("[a-cA]"). findIn(new StringBuilder("abc"), 1, true) ); 2636 assertEquals("CharSequence findLastIn", -1, new UnicodeSet("[a-cA]"). findLastIn(new StringBuilder("abc"), 1, true) ); 2637 assertEquals("CharSequence add", "c", new UnicodeSet("[abA]"). stripFrom(new StringBuilder("abc"), true)); 2638 } 2639 } 2640