1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.lang; 10 11 import java.util.Collection; 12 13 import org.junit.Test; 14 15 import com.ibm.icu.dev.test.TestFmwk; 16 import com.ibm.icu.impl.Utility; 17 import com.ibm.icu.text.UTF16; 18 import com.ibm.icu.text.UnicodeSet; 19 import com.ibm.icu.text.UnicodeSet.SpanCondition; 20 import com.ibm.icu.util.OutputInt; 21 22 /** 23 * @test 24 * @summary General test of UnicodeSet string span. 25 */ 26 public class UnicodeSetStringSpanTest extends TestFmwk { 27 // Simple test first, easier to debug. 28 @Test 29 public void TestSimpleStringSpan() { 30 String pattern = "[a{ab}{bc}]"; 31 String string = "abc"; 32 UnicodeSet set = new UnicodeSet(pattern); 33 set.complement(); 34 int pos = set.spanBack(string, 3, SpanCondition.SIMPLE); 35 if (pos != 1) { 36 errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)", 37 set.toString(), string, pos)); 38 } 39 pos = set.span(string, SpanCondition.SIMPLE); 40 if (pos != 3) { 41 errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)", 42 set.toString(), string, pos)); 43 } 44 pos = set.span(string, 1, SpanCondition.SIMPLE); 45 if (pos != 3) { 46 errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)", 47 set.toString(), string, pos)); 48 } 49 } 50 51 // test our slow implementation 52 @Test 53 public void TestSimpleStringSpanSlow() { 54 String pattern = "[a{ab}{bc}]"; 55 String string = "abc"; 56 UnicodeSet uset = new UnicodeSet(pattern); 57 uset.complement(); 58 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 59 60 int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE); 61 if (length != 1) { 62 errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)", 63 set.toString(), string, length)); 64 } 65 length = containsSpanUTF16(set, string, SpanCondition.SIMPLE); 66 if (length != 3) { 67 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)", 68 set.toString(), string, length)); 69 } 70 length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE); 71 if (length != 2) { 72 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)", 73 set.toString(), string, length)); 74 } 75 } 76 77 // Test select patterns and strings, and test SIMPLE. 78 @Test 79 public void TestSimpleStringSpanAndFreeze() { 80 String pattern = "[x{xy}{xya}{axy}{ax}]"; 81 final String string = "xx" 82 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" 83 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" 84 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa"; 85 86 UnicodeSet set = new UnicodeSet(pattern); 87 if (set.containsAll(string)) { 88 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE"); 89 } 90 91 // Remove trailing "aaaa". 92 String string16 = string.substring(0, string.length() - 4); 93 if (!set.containsAll(string16)) { 94 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE"); 95 } 96 97 String s16 = "byayaxya"; 98 if ( set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4 99 || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4 100 || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4 101 || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5 102 || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4 103 || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) { 104 errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value"); 105 } 106 107 pattern = "[a{ab}{abc}{cd}]"; 108 set.applyPattern(pattern); 109 s16 = "acdabcdabccd"; 110 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 111 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 112 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { 113 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); 114 } 115 set.freeze(); 116 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 117 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 118 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { 119 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); 120 } 121 122 pattern = "[d{cd}{bcd}{ab}]"; 123 set = (UnicodeSet)set.cloneAsThawed(); 124 set.applyPattern(pattern).freeze(); 125 s16 = "abbcdabcdabd"; 126 if ( set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0 127 || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6 128 || set.spanBack(s16, 5, SpanCondition.SIMPLE) != 0) { 129 errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value"); 130 } 131 } 132 133 // more complex test. -------------------------------------------------------- 134 135 // Make the strings in a UnicodeSet easily accessible. 136 private static class UnicodeSetWithStrings { 137 private UnicodeSet set; 138 private Collection<String> setStrings; 139 private int stringsLength; 140 141 public UnicodeSetWithStrings(final UnicodeSet normalSet) { 142 set = normalSet; 143 setStrings = normalSet.strings(); 144 stringsLength = setStrings.size(); 145 } 146 147 public final UnicodeSet getSet() { 148 return set; 149 } 150 151 public boolean hasStrings() { 152 return (stringsLength > 0); 153 } 154 155 public Iterable<String> strings() { 156 return setStrings; 157 } 158 } 159 160 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 161 // at code point boundaries. 162 // That is, each edge of a match must not be in the middle of a surrogate pair. 163 static boolean matches16CPB(final String s, int start, int limit, final String t) { 164 limit -= start; 165 int length = t.length(); 166 return t.equals(s.substring(start, start + length)) 167 && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) && 168 UTF16.isTrailSurrogate(s.charAt(start))) 169 && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) && 170 UTF16.isTrailSurrogate(s.charAt(start + length))); 171 } 172 173 // Implement span() with contains() for comparison. 174 static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s, 175 SpanCondition spanCondition) { 176 final UnicodeSet realSet = set.getSet(); 177 int length = s.length(); 178 if (!set.hasStrings()) { 179 boolean spanContained = false; 180 if (spanCondition != SpanCondition.NOT_CONTAINED) { 181 spanContained = true; // Pin to 0/1 values. 182 } 183 184 int c; 185 int start = 0, prev; 186 while ((prev = start) < length) { 187 c = s.codePointAt(start); 188 start = s.offsetByCodePoints(start, 1); 189 if (realSet.contains(c) != spanContained) { 190 break; 191 } 192 } 193 return prev; 194 } else if (spanCondition == SpanCondition.NOT_CONTAINED) { 195 int c; 196 int start, next; 197 for (start = next = 0; start < length;) { 198 c = s.codePointAt(next); 199 next = s.offsetByCodePoints(next, 1); 200 if (realSet.contains(c)) { 201 break; 202 } 203 for (String str : set.strings()) { 204 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { 205 // spanNeedsStrings=true; 206 return start; 207 } 208 } 209 start = next; 210 } 211 return start; 212 } else /* CONTAINED or SIMPLE */{ 213 int c; 214 int start, next, maxSpanLimit = 0; 215 for (start = next = 0; start < length;) { 216 c = s.codePointAt(next); 217 next = s.offsetByCodePoints(next, 1); 218 if (!realSet.contains(c)) { 219 next = start; // Do not span this single, not-contained code point. 220 } 221 for (String str : set.strings()) { 222 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { 223 // spanNeedsStrings=true; 224 int matchLimit = start + str.length(); 225 if (matchLimit == length) { 226 return length; 227 } 228 if (spanCondition == SpanCondition.CONTAINED) { 229 // Iterate for the shortest match at each position. 230 // Recurse for each but the shortest match. 231 if (next == start) { 232 next = matchLimit; // First match from start. 233 } else { 234 if (matchLimit < next) { 235 // Remember shortest match from start for iteration. 236 int temp = next; 237 next = matchLimit; 238 matchLimit = temp; 239 } 240 // Recurse for non-shortest match from start. 241 int spanLength = containsSpanUTF16(set, s.substring(matchLimit), 242 SpanCondition.CONTAINED); 243 if ((matchLimit + spanLength) > maxSpanLimit) { 244 maxSpanLimit = matchLimit + spanLength; 245 if (maxSpanLimit == length) { 246 return length; 247 } 248 } 249 } 250 } else /* spanCondition==SIMPLE */{ 251 if (matchLimit > next) { 252 // Remember longest match from start. 253 next = matchLimit; 254 } 255 } 256 } 257 } 258 if (next == start) { 259 break; // No match from start. 260 } 261 start = next; 262 } 263 if (start > maxSpanLimit) { 264 return start; 265 } else { 266 return maxSpanLimit; 267 } 268 } 269 } 270 271 static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length, 272 SpanCondition spanCondition) { 273 if (length == 0) { 274 return 0; 275 } 276 final UnicodeSet realSet = set.getSet(); 277 if (!set.hasStrings()) { 278 boolean spanContained = false; 279 if (spanCondition != SpanCondition.NOT_CONTAINED) { 280 spanContained = true; // Pin to 0/1 values. 281 } 282 283 int c; 284 int prev = length; 285 do { 286 c = s.codePointBefore(prev); 287 if (realSet.contains(c) != spanContained) { 288 break; 289 } 290 prev = s.offsetByCodePoints(prev, -1); 291 } while (prev > 0); 292 return prev; 293 } else if (spanCondition == SpanCondition.NOT_CONTAINED) { 294 int c; 295 int prev = length, length0 = length; 296 do { 297 c = s.codePointBefore(prev); 298 if (realSet.contains(c)) { 299 break; 300 } 301 for (String str : set.strings()) { 302 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { 303 // spanNeedsStrings=true; 304 return prev; 305 } 306 } 307 prev = s.offsetByCodePoints(prev, -1); 308 } while (prev > 0); 309 return prev; 310 } else /* SpanCondition.CONTAINED or SIMPLE */{ 311 int c; 312 int prev = length, minSpanStart = length, length0 = length; 313 do { 314 c = s.codePointBefore(length); 315 length = s.offsetByCodePoints(length, -1); 316 if (!realSet.contains(c)) { 317 length = prev; // Do not span this single, not-contained code point. 318 } 319 for (String str : set.strings()) { 320 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { 321 // spanNeedsStrings=true; 322 int matchStart = prev - str.length(); 323 if (matchStart == 0) { 324 return 0; 325 } 326 if (spanCondition == SpanCondition.CONTAINED) { 327 // Iterate for the shortest match at each position. 328 // Recurse for each but the shortest match. 329 if (length == prev) { 330 length = matchStart; // First match from prev. 331 } else { 332 if (matchStart > length) { 333 // Remember shortest match from prev for iteration. 334 int temp = length; 335 length = matchStart; 336 matchStart = temp; 337 } 338 // Recurse for non-shortest match from prev. 339 int spanStart = containsSpanBackUTF16(set, s, matchStart, 340 SpanCondition.CONTAINED); 341 if (spanStart < minSpanStart) { 342 minSpanStart = spanStart; 343 if (minSpanStart == 0) { 344 return 0; 345 } 346 } 347 } 348 } else /* spanCondition==SIMPLE */{ 349 if (matchStart < length) { 350 // Remember longest match from prev. 351 length = matchStart; 352 } 353 } 354 } 355 } 356 if (length == prev) { 357 break; // No match from prev. 358 } 359 } while ((prev = length) > 0); 360 if (prev < minSpanStart) { 361 return prev; 362 } else { 363 return minSpanStart; 364 } 365 } 366 } 367 368 // spans to be performed and compared 369 static final int SPAN_UTF16 = 1; 370 static final int SPAN_UTF8 = 2; 371 static final int SPAN_UTFS = 3; 372 373 static final int SPAN_SET = 4; 374 static final int SPAN_COMPLEMENT = 8; 375 static final int SPAN_POLARITY = 0xc; 376 377 static final int SPAN_FWD = 0x10; 378 static final int SPAN_BACK = 0x20; 379 static final int SPAN_DIRS = 0x30; 380 381 static final int SPAN_CONTAINED = 0x100; 382 static final int SPAN_SIMPLE = 0x200; 383 static final int SPAN_CONDITION = 0x300; 384 385 static final int SPAN_ALL = 0x33f; 386 387 static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) { 388 return spanCondition == SpanCondition.NOT_CONTAINED ? contained 389 : SpanCondition.NOT_CONTAINED; 390 } 391 392 /* 393 * Count spans on a string with the method according to type and set the span limits. The set may be the complement 394 * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack() 395 * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1 396 * if the span option is filtered out. 397 */ 398 static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s, 399 int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity, 400 int expectCount) { 401 final UnicodeSet realSet = set.getSet(); 402 int start, count, i; 403 SpanCondition spanCondition, firstSpanCondition, contained; 404 boolean isForward; 405 406 int length = s.length(); 407 if (type < 0 || 7 < type) { 408 typeName[0] = null; 409 return 0; 410 } 411 412 final String typeNames16[] = { 413 "contains", 414 "contains(LM)", 415 "span", 416 "span(LM)", 417 "containsBack", 418 "containsBack(LM)", 419 "spanBack", 420 "spanBack(LM)" }; 421 422 typeName[0] = typeNames16[type]; 423 424 // filter span options 425 if (type <= 3) { 426 // span forward 427 if ((whichSpans & SPAN_FWD) == 0) { 428 return -1; 429 } 430 isForward = true; 431 } else { 432 // span backward 433 if ((whichSpans & SPAN_BACK) == 0) { 434 return -1; 435 } 436 isForward = false; 437 } 438 if ((type & 1) == 0) { 439 // use SpanCondition.CONTAINED 440 if ((whichSpans & SPAN_CONTAINED) == 0) { 441 return -1; 442 } 443 contained = SpanCondition.CONTAINED; 444 } else { 445 // use SIMPLE 446 if ((whichSpans & SPAN_SIMPLE) == 0) { 447 return -1; 448 } 449 contained = SpanCondition.SIMPLE; 450 } 451 452 // Default first span condition for going forward with an uncomplemented set. 453 spanCondition = SpanCondition.NOT_CONTAINED; 454 if (isComplement) { 455 spanCondition = invertSpanCondition(spanCondition, contained); 456 } 457 458 // First span condition for span(), used to terminate the spanBack() iteration. 459 firstSpanCondition = spanCondition; 460 461 // spanBack(): Its initial span condition is span()'s last span condition, 462 // which is the opposite of span()'s first span condition 463 // if we expect an even number of spans. 464 // (The loop inverts spanCondition (expectCount-1) times 465 // before the expectCount'th span() call.) 466 // If we do not compare forward and backward directions, then we do not have an 467 // expectCount and just start with firstSpanCondition. 468 if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) { 469 spanCondition = invertSpanCondition(spanCondition, contained); 470 } 471 472 count = 0; 473 switch (type) { 474 case 0: 475 case 1: 476 start = 0; 477 for (;;) { 478 start += containsSpanUTF16(set, s.substring(start), spanCondition); 479 if (count < limitsCapacity) { 480 limits[count] = start; 481 } 482 ++count; 483 if (start >= length) { 484 break; 485 } 486 spanCondition = invertSpanCondition(spanCondition, contained); 487 } 488 break; 489 case 2: 490 case 3: 491 start = 0; 492 for (;;) { 493 start = realSet.span(s, start, spanCondition); 494 if (count < limitsCapacity) { 495 limits[count] = start; 496 } 497 ++count; 498 if (start >= length) { 499 break; 500 } 501 spanCondition = invertSpanCondition(spanCondition, contained); 502 } 503 break; 504 case 4: 505 case 5: 506 for (;;) { 507 ++count; 508 if (count <= limitsCapacity) { 509 limits[limitsCapacity - count] = length; 510 } 511 length = containsSpanBackUTF16(set, s, length, spanCondition); 512 if (length == 0 && spanCondition == firstSpanCondition) { 513 break; 514 } 515 spanCondition = invertSpanCondition(spanCondition, contained); 516 } 517 if (count < limitsCapacity) { 518 for (i = count; i-- > 0;) { 519 limits[i] = limits[limitsCapacity - count + i]; 520 } 521 } 522 break; 523 case 6: 524 case 7: 525 for (;;) { 526 ++count; 527 if (count <= limitsCapacity) { 528 limits[limitsCapacity - count] = length >= 0 ? length : s.length(); 529 } 530 length = realSet.spanBack(s, length, spanCondition); 531 if (length == 0 && spanCondition == firstSpanCondition) { 532 break; 533 } 534 spanCondition = invertSpanCondition(spanCondition, contained); 535 } 536 if (count < limitsCapacity) { 537 for (i = count; i-- > 0;) { 538 limits[i] = limits[limitsCapacity - count + i]; 539 } 540 } 541 break; 542 default: 543 typeName = null; 544 return -1; 545 } 546 547 return count; 548 } 549 550 // sets to be tested; odd index=isComplement 551 static final int SLOW = 0; 552 static final int SLOW_NOT = 1; 553 static final int FAST = 2; 554 static final int FAST_NOT = 3; 555 static final int SET_COUNT = 4; 556 557 static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" }; 558 559 /* 560 * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen 561 * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly). 562 * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition). 563 * 564 * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an 565 * input expectCount<0). 566 */ 567 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, 568 int expectLimits[], int expectCount, 569 final String testName, int index) { 570 int[] limits = new int[500]; 571 int limitsCount; 572 int i, j; 573 String[] typeName = new String[1]; 574 int type; 575 576 for (i = 0; i < SET_COUNT; ++i) { 577 if ((i & 1) == 0) { 578 // Even-numbered sets are original, uncomplemented sets. 579 if ((whichSpans & SPAN_SET) == 0) { 580 continue; 581 } 582 } else { 583 // Odd-numbered sets are complemented. 584 if ((whichSpans & SPAN_COMPLEMENT) == 0) { 585 continue; 586 } 587 } 588 for (type = 0;; ++type) { 589 limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits, 590 limits.length, expectCount); 591 if (typeName[0] == null) { 592 break; // All types tried. 593 } 594 if (limitsCount < 0) { 595 continue; // Span option filtered out. 596 } 597 if (expectCount < 0) { 598 expectCount = limitsCount; 599 if (limitsCount > limits.length) { 600 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans", 601 testName, index, setNames[i], typeName[0], limitsCount, limits.length)); 602 return; 603 } 604 for (j = limitsCount; j-- > 0;) { 605 expectLimits[j] = limits[j]; 606 } 607 } else if (limitsCount != expectCount) { 608 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i], 609 typeName[0], limitsCount, expectCount)); 610 } else { 611 for (j = 0; j < limitsCount; ++j) { 612 if (limits[j] != expectLimits[j]) { 613 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName, 614 index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j])); 615 break; 616 } 617 } 618 } 619 } 620 } 621 622 // Compare span() with containsAll()/containsNone(), 623 // but only if we have expectLimits[] from the uncomplemented set. 624 if ((whichSpans & SPAN_SET) != 0) { 625 final String s16 = s; 626 String string; 627 int prev = 0, limit, len; 628 for (i = 0; i < expectCount; ++i) { 629 limit = expectLimits[i]; 630 len = limit - prev; 631 if (len > 0) { 632 string = s16.substring(prev, prev + len); // read-only alias 633 if (0 != (i & 1)) { 634 if (!sets[SLOW].getSet().containsAll(string)) { 635 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", 636 testName, index, setNames[SLOW], prev, limit)); 637 return; 638 } 639 if (!sets[FAST].getSet().containsAll(string)) { 640 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", 641 testName, index, setNames[FAST], prev, limit)); 642 return; 643 } 644 } else { 645 if (!sets[SLOW].getSet().containsNone(string)) { 646 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", 647 testName, index, setNames[SLOW], prev, limit)); 648 return; 649 } 650 if (!sets[FAST].getSet().containsNone(string)) { 651 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", 652 testName, index, setNames[FAST], prev, limit)); 653 return; 654 } 655 } 656 } 657 prev = limit; 658 } 659 } 660 } 661 662 // Specifically test either UTF-16 or UTF-8. 663 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, 664 final String testName, int index) { 665 int[] expectLimits = new int[500]; 666 int expectCount = -1; 667 verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index); 668 } 669 670 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 671 // unless either UTF is turned off in whichSpans. 672 // Testing UTF-16 and UTF-8 together requires that surrogate code points 673 // have the same contains(c) value as U+FFFD. 674 void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans, 675 final String testName, int index) { 676 int[] expectLimits = new int[500]; 677 int expectCount; 678 679 expectCount = -1; // Get expectLimits[] from verifySpan(). 680 681 if ((whichSpans & SPAN_UTF16) != 0) { 682 verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index); 683 } 684 } 685 686 static int nextCodePoint(int c) { 687 // Skip some large and boring ranges. 688 switch (c) { 689 case 0x3441: 690 return 0x4d7f; 691 case 0x5100: 692 return 0x9f00; 693 case 0xb040: 694 return 0xd780; 695 case 0xe041: 696 return 0xf8fe; 697 case 0x10100: 698 return 0x20000; 699 case 0x20041: 700 return 0xe0000; 701 case 0xe0101: 702 return 0x10fffd; 703 default: 704 return c + 1; 705 } 706 } 707 708 // Verify that all implementations represent the same set. 709 void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { 710 StringBuffer s = new StringBuffer(); 711 int localWhichSpans; 712 int c, first; 713 for (first = c = 0;; c = nextCodePoint(c)) { 714 if (c > 0x10ffff || s.length() > 1024) { 715 localWhichSpans = whichSpans; 716 verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first); 717 if (c > 0x10ffff) { 718 break; 719 } 720 s.delete(0, s.length()); 721 first = c; 722 } 723 UTF16.append(s, c); 724 } 725 } 726 727 // Test with a particular, interesting string. 728 // Specify length and try NUL-termination. 729 static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space 730 0x3b1, 0x3b2, 0x3b3, // Greek 731 0xd900, // lead surrogate 732 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 733 0xdc05, // trail surrogate 734 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 735 0xd900, 0xdc05, // unassigned supplementary 736 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 737 0xd7a4, 0xdc05, 0xd900, 0x2028 // unassigned, surrogates in wrong order, LS 738 }; 739 static String interestingString = new String(interestingStringChars); 740 static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]"; 741 742 @Test 743 public void TestInterestingStringSpan() { 744 UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1)); 745 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; 746 int expect = 2; 747 int start = 14; 748 749 int c = 0xd840; 750 boolean contains = uset.contains(c); 751 if (false != contains) { 752 errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)", 753 c)); 754 } 755 756 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 757 int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition); 758 if (expect != len) { 759 errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)", 760 interestingString, start, len, expect)); 761 } 762 763 len = uset.span(interestingString, start, spanCondition) - start; 764 if (expect != len) { 765 errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)", 766 interestingString, start, len, expect)); 767 } 768 } 769 770 void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { 771 if ((whichSpans & SPAN_UTF16) == 0) { 772 return; 773 } 774 verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1); 775 } 776 777 // Take a set of span options and multiply them so that 778 // each portion only has one of the options a, b and c. 779 // If b==0, then the set of options is just modified with mask and a. 780 // If b!=0 and c==0, then the set of options is just modified with mask, a and b. 781 static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) { 782 int s; 783 int i; 784 785 for (i = 0; i < whichSpansCount; ++i) { 786 s = whichSpans[i] & mask; 787 whichSpans[i] = s | a; 788 if (b != 0) { 789 whichSpans[whichSpansCount + i] = s | b; 790 if (c != 0) { 791 whichSpans[2 * whichSpansCount + i] = s | c; 792 } 793 } 794 } 795 return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount; 796 } 797 798 // They are not representable in UTF-8, and a leading trail surrogate 799 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 800 // U+20001 == \\uD840\\uDC01 801 // U+20400 == \\uD841\\uDC00 802 static final String patternWithUnpairedSurrogate = 803 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]"; 804 static final String stringWithUnpairedSurrogate = 805 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"; 806 807 static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 808 static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 809 static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; 810 static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; 811 static final String longPattern = 812 "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]"; 813 814 @Test 815 public void TestStringWithUnpairedSurrogateSpan() { 816 String string = Utility.unescape(stringWithUnpairedSurrogate); 817 UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate)); 818 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; 819 int start = 17; 820 int expect = 5; 821 822 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 823 int len = containsSpanUTF16(set, string.substring(start), spanCondition); 824 if (expect != len) { 825 errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)", 826 string, start, len, expect)); 827 } 828 829 len = uset.span(string, start, spanCondition) - start; 830 if (expect != len) { 831 errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)", 832 string, start, len, expect)); 833 } 834 } 835 836 @Test 837 public void TestSpan() { 838 // "[...]" is a UnicodeSet pattern. 839 // "*" performs tests on all Unicode code points and on a selection of 840 // malformed UTF-8/16 strings. 841 // "-options" limits the scope of testing for the current set. 842 // By default, the test verifies that equivalent boundaries are found 843 // for UTF-16 and UTF-8, going forward and backward, 844 // alternating NOT_CONTAINED with 845 // either CONTAINED or SIMPLE. 846 // Single-character options: 847 // 8 -- UTF-16 and UTF-8 boundaries may differ. 848 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 849 // or the set contains strings with unpaired surrogates 850 // which do not translate to valid UTF-8. 851 // c -- set.span() and set.complement().span() boundaries may differ. 852 // Cause: Set strings are not complemented. 853 // b -- span() and spanBack() boundaries may differ. 854 // Cause: Strings in the set overlap, and spanBack(CONTAINED) 855 // and spanBack(SIMPLE) are defined to 856 // match with non-overlapping substrings. 857 // For example, with a set containing "ab" and "ba", 858 // span() of "aba" yields boundaries { 0, 2, 3 } 859 // because the initial "ab" matches from 0 to 2, 860 // while spanBack() yields boundaries { 0, 1, 3 } 861 // because the final "ba" matches from 1 to 3. 862 // l -- CONTAINED and SIMPLE boundaries may differ. 863 // Cause: Strings in the set overlap, and a longer match may 864 // require a sequence including non-longest substrings. 865 // For example, with a set containing "ab", "abc" and "cd", 866 // span(contained) of "abcd" spans the entire string 867 // but span(longest match) only spans the first 3 characters. 868 // Each "-options" first resets all options and then applies the specified options. 869 // A "-" without options resets the options. 870 // The options are also reset for each new set. 871 // Other strings will be spanned. 872 final String testdata[] = { 873 "[:ID_Continue:]", 874 "*", 875 "[:White_Space:]", 876 "*", 877 "[]", 878 "*", 879 "[\\u0000-\\U0010FFFF]", 880 "*", 881 "[\\u0000\\u0080\\u0800\\U00010000]", 882 "*", 883 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 884 "*", 885 unicodeSet1, 886 "-c", 887 "*", 888 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 889 "-c", 890 "*", 891 892 // Overlapping strings cause overlapping attempts to match. 893 "[x{xy}{xya}{axy}{ax}]", 894 "-cl", 895 896 // More repetitions of "xya" would take too long with the recursive 897 // reference implementation. 898 // containsAll()=false 899 // test_string 0x14 900 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here. 901 "xx" + // set.complement().span(contained) will stop between the two 'x'es. 902 "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here. 903 "aaa", 904 905 // containsAll()=true 906 // test_string 0x15 907 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy", 908 909 "-bc", 910 // test_string 0x17 911 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 912 "-c", 913 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 914 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 915 "-", 916 "byaya", // span() -> { 5 } 917 "byay", // span() -> { 4 } 918 "bya", // span() -> { 3 } 919 920 // span(longest match) will not span the whole string. 921 "[a{ab}{bc}]", 922 "-cl", 923 // test_string 0x21 924 "abc", 925 926 "[a{ab}{abc}{cd}]", 927 "-cl", 928 "acdabcdabccd", 929 930 // spanBack(longest match) will not span the whole string. 931 "[c{ab}{bc}]", 932 "-cl", 933 "abc", 934 935 "[d{cd}{bcd}{ab}]", 936 "-cl", 937 "abbcdabcdabd", 938 939 // Test with non-ASCII set strings - test proper handling of surrogate pairs 940 // and UTF-8 trail bytes. 941 // Copies of above test sets and strings, but transliterated to have 942 // different code points with similar trail units. 943 // Previous: a b c d 944 // Unicode: 042B 30AB 200AB 204AB 945 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 946 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 947 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 948 "-cl", 949 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 950 951 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 952 "-cl", 953 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 954 955 // Stress bookkeeping and recursion. 956 // The following strings are barely doable with the recursive 957 // reference implementation. 958 // The not-contained character at the end prevents an early exit from the span(). 959 "[b{bb}]", 960 "-c", 961 // test_string 0x33 962 "bbbbbbbbbbbbbbbbbbbbbbbb-", 963 // On complement sets, span() and spanBack() get different results 964 // because b is not in the complement set and there is an odd number of b's 965 // in the test string. 966 "-bc", 967 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 968 969 // Test with set strings with an initial or final code point span 970 // longer than 254. 971 longPattern, 972 "-c", 973 _64_a + _64_a + _64_a + _63_a + "b", 974 _64_a + _64_a + _64_a + _64_a + "b", 975 _64_a + _64_a + _64_a + _64_a + "aaaabbbb", 976 "a" + _64_b + _64_b + _64_b + _63_b, 977 "a" + _64_b + _64_b + _64_b + _64_b, 978 "aaaabbbb" + _64_b + _64_b + _64_b + _64_b, 979 980 // Test with strings containing unpaired surrogates. 981 patternWithUnpairedSurrogate, "-8cl", 982 stringWithUnpairedSurrogate }; 983 int i, j; 984 int whichSpansCount = 1; 985 int[] whichSpans = new int[96]; 986 for (i = whichSpans.length; i-- > 0;) { 987 whichSpans[i] = SPAN_ALL; 988 } 989 990 UnicodeSet[] sets = new UnicodeSet[SET_COUNT]; 991 UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT]; 992 993 String testName = null; 994 @SuppressWarnings("unused") 995 String testNameLimit; 996 997 for (i = 0; i < testdata.length; ++i) { 998 final String s = testdata[i]; 999 if (s.charAt(0) == '[') { 1000 // Create new test sets from this pattern. 1001 for (j = 0; j < SET_COUNT; ++j) { 1002 sets_with_str[j] = null; 1003 sets[j] = null; 1004 } 1005 sets[SLOW] = new UnicodeSet(Utility.unescape(s)); 1006 sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]); 1007 sets[SLOW_NOT].complement(); 1008 // Intermediate set: Test cloning of a frozen set. 1009 UnicodeSet fast = new UnicodeSet(sets[SLOW]); 1010 fast.freeze(); 1011 sets[FAST] = (UnicodeSet) fast.clone(); 1012 fast = null; 1013 UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]); 1014 fastNot.freeze(); 1015 sets[FAST_NOT] = (UnicodeSet) fastNot.clone(); 1016 fastNot = null; 1017 1018 for (j = 0; j < SET_COUNT; ++j) { 1019 sets_with_str[j] = new UnicodeSetWithStrings(sets[j]); 1020 } 1021 1022 testName = s + ':'; 1023 whichSpans[0] = SPAN_ALL; 1024 whichSpansCount = 1; 1025 } else if (s.charAt(0) == '-') { 1026 whichSpans[0] = SPAN_ALL; 1027 whichSpansCount = 1; 1028 1029 for (j = 1; j < s.length(); j++) { 1030 switch (s.charAt(j)) { 1031 case 'c': 1032 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET, 1033 SPAN_COMPLEMENT, 0); 1034 break; 1035 case 'b': 1036 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK, 1037 0); 1038 break; 1039 case 'l': 1040 // test CONTAINED FWD & BACK, and separately 1041 // SIMPLE only FWD, and separately 1042 // SIMPLE only BACK 1043 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION), 1044 SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE); 1045 break; 1046 case '8': 1047 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16, 1048 SPAN_UTF8, 0); 1049 break; 1050 default: 1051 errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i])); 1052 break; 1053 } 1054 } 1055 } else if (s.equals("*")) { 1056 testNameLimit = "bad_string"; 1057 for (j = 0; j < whichSpansCount; ++j) { 1058 if (whichSpansCount > 1) { 1059 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1060 } 1061 verifySpanUTF16String(sets_with_str, whichSpans[j], testName); 1062 } 1063 1064 testNameLimit = "contents"; 1065 for (j = 0; j < whichSpansCount; ++j) { 1066 if (whichSpansCount > 1) { 1067 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1068 } 1069 verifySpanContents(sets_with_str, whichSpans[j], testName); 1070 } 1071 } else { 1072 String string = Utility.unescape(s); 1073 testNameLimit = "test_string"; 1074 for (j = 0; j < whichSpansCount; ++j) { 1075 if (whichSpansCount > 1) { 1076 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1077 } 1078 verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i); 1079 } 1080 } 1081 } 1082 } 1083 1084 @Test 1085 public void TestSpanAndCount() { 1086 // a set with no strings 1087 UnicodeSet abc = new UnicodeSet('a', 'c'); 1088 // a set with an "irrelevant" string (fully contained in the code point set) 1089 UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n"); 1090 // a set with no "irrelevant" string but some interesting overlaps 1091 UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd"); 1092 String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde"; 1093 OutputInt count = new OutputInt(); 1094 assertEquals("abc span[8, 11[", 11, 1095 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1096 assertEquals("abc count=3", 3, count.value); 1097 assertEquals("no abc span[2, 8[", 8, 1098 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1099 assertEquals("no abc count=5", 5, count.value); 1100 assertEquals("line endings span[2, 6[", 6, 1101 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); 1102 assertEquals("line endings count=3", 3, count.value); 1103 assertEquals("no ab+cd span[2, 8[", 8, 1104 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1105 assertEquals("no ab+cd count=5", 5, count.value); 1106 assertEquals("ab+cd span[8, 12[", 12, 1107 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); 1108 assertEquals("ab+cd count=2", 2, count.value); 1109 assertEquals("1x abc span[8, 11[", 11, 1110 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1111 assertEquals("1x abc count=1", 1, count.value); 1112 1113 abc.freeze(); 1114 crlf.freeze(); 1115 ab_cd.freeze(); 1116 assertEquals("abc span[8, 11[ (frozen)", 11, 1117 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1118 assertEquals("abc count=3 (frozen)", 3, count.value); 1119 assertEquals("no abc span[2, 8[ (frozen)", 8, 1120 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1121 assertEquals("no abc count=5 (frozen)", 5, count.value); 1122 assertEquals("line endings span[2, 6[ (frozen)", 6, 1123 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); 1124 assertEquals("line endings count=3 (frozen)", 3, count.value); 1125 assertEquals("no ab+cd span[2, 8[ (frozen)", 8, 1126 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1127 assertEquals("no ab+cd count=5 (frozen)", 5, count.value); 1128 assertEquals("ab+cd span[8, 12[ (frozen)", 12, 1129 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); 1130 assertEquals("ab+cd count=2 (frozen)", 2, count.value); 1131 assertEquals("1x abc span[8, 11[ (frozen)", 11, 1132 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1133 assertEquals("1x abc count=1 (frozen)", 1, count.value); 1134 } 1135 } 1136