1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationRuleParser.java, ported from collationruleparser.h/.cpp 10 * 11 * C++ version created on: 2013apr10 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import java.text.ParseException; 18 import java.util.ArrayList; 19 20 import android.icu.impl.IllegalIcuArgumentException; 21 import android.icu.impl.PatternProps; 22 import android.icu.lang.UCharacter; 23 import android.icu.lang.UProperty; 24 import android.icu.text.Collator; 25 import android.icu.text.Normalizer2; 26 import android.icu.text.UTF16; 27 import android.icu.text.UnicodeSet; 28 import android.icu.util.ULocale; 29 30 /** 31 * @hide Only a subset of ICU is exposed in Android 32 */ 33 public final class CollationRuleParser { 34 /** Special reset positions. */ 35 enum Position { 36 FIRST_TERTIARY_IGNORABLE, 37 LAST_TERTIARY_IGNORABLE, 38 FIRST_SECONDARY_IGNORABLE, 39 LAST_SECONDARY_IGNORABLE, 40 FIRST_PRIMARY_IGNORABLE, 41 LAST_PRIMARY_IGNORABLE, 42 FIRST_VARIABLE, 43 LAST_VARIABLE, 44 FIRST_REGULAR, 45 LAST_REGULAR, 46 FIRST_IMPLICIT, 47 LAST_IMPLICIT, 48 FIRST_TRAILING, 49 LAST_TRAILING 50 } 51 static final Position[] POSITION_VALUES = Position.values(); 52 53 /** 54 * First character of contractions that encode special reset positions. 55 * U+FFFE cannot be tailored via rule syntax. 56 * 57 * The second contraction character is POS_BASE + Position. 58 */ 59 static final char POS_LEAD = 0xfffe; 60 /** 61 * Base for the second character of contractions that encode special reset positions. 62 * Braille characters U+28xx are printable and normalization-inert. 63 * @see POS_LEAD 64 */ 65 static final char POS_BASE = 0x2800; 66 67 static abstract class Sink { 68 /** 69 * Adds a reset. 70 * strength=UCOL_IDENTICAL for &str. 71 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. 72 */ 73 abstract void addReset(int strength, CharSequence str); 74 /** 75 * Adds a relation with strength and prefix | str / extension. 76 */ 77 abstract void addRelation(int strength, CharSequence prefix, 78 CharSequence str, CharSequence extension); 79 80 void suppressContractions(UnicodeSet set) {} 81 82 void optimize(UnicodeSet set) {} 83 } 84 85 interface Importer { 86 String getRules(String localeID, String collationType); 87 } 88 89 /** 90 * Constructor. 91 * The Sink must be set before parsing. 92 * The Importer can be set, otherwise [import locale] syntax is not supported. 93 */ 94 CollationRuleParser(CollationData base) { 95 baseData = base; 96 } 97 98 /** 99 * Sets the pointer to a Sink object. 100 * The pointer is aliased: Pointer copy without cloning or taking ownership. 101 */ 102 void setSink(Sink sinkAlias) { 103 sink = sinkAlias; 104 } 105 106 /** 107 * Sets the pointer to an Importer object. 108 * The pointer is aliased: Pointer copy without cloning or taking ownership. 109 */ 110 void setImporter(Importer importerAlias) { 111 importer = importerAlias; 112 } 113 114 void parse(String ruleString, CollationSettings outSettings) throws ParseException { 115 settings = outSettings; 116 parse(ruleString); 117 } 118 119 private static final int UCOL_DEFAULT = -1; 120 private static final int UCOL_OFF = 0; 121 private static final int UCOL_ON = 1; 122 123 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ 124 private static final int STRENGTH_MASK = 0xf; 125 private static final int STARRED_FLAG = 0x10; 126 private static final int OFFSET_SHIFT = 8; 127 128 private static final String BEFORE = "[before"; 129 130 // In C++, we parse into temporary UnicodeString objects named "raw" or "str". 131 // In Java, we reuse this StringBuilder. 132 private final StringBuilder rawBuilder = new StringBuilder(); 133 134 private void parse(String ruleString) throws ParseException { 135 rules = ruleString; 136 ruleIndex = 0; 137 138 while(ruleIndex < rules.length()) { 139 char c = rules.charAt(ruleIndex); 140 if(PatternProps.isWhiteSpace(c)) { 141 ++ruleIndex; 142 continue; 143 } 144 switch(c) { 145 case 0x26: // '&' 146 parseRuleChain(); 147 break; 148 case 0x5b: // '[' 149 parseSetting(); 150 break; 151 case 0x23: // '#' starts a comment, until the end of the line 152 ruleIndex = skipComment(ruleIndex + 1); 153 break; 154 case 0x40: // '@' is equivalent to [backwards 2] 155 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true); 156 ++ruleIndex; 157 break; 158 case 0x21: // '!' used to turn on Thai/Lao character reversal 159 // Accept but ignore. The root collator has contractions 160 // that are equivalent to the character reversal, where appropriate. 161 ++ruleIndex; 162 break; 163 default: 164 setParseError("expected a reset or setting or comment"); 165 break; 166 } 167 } 168 } 169 170 private void parseRuleChain() throws ParseException { 171 int resetStrength = parseResetAndPosition(); 172 boolean isFirstRelation = true; 173 for(;;) { 174 int result = parseRelationOperator(); 175 if(result < 0) { 176 if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) { 177 // '#' starts a comment, until the end of the line 178 ruleIndex = skipComment(ruleIndex + 1); 179 continue; 180 } 181 if(isFirstRelation) { 182 setParseError("reset not followed by a relation"); 183 } 184 return; 185 } 186 int strength = result & STRENGTH_MASK; 187 if(resetStrength < Collator.IDENTICAL) { 188 // reset-before rule chain 189 if(isFirstRelation) { 190 if(strength != resetStrength) { 191 setParseError("reset-before strength differs from its first relation"); 192 return; 193 } 194 } else { 195 if(strength < resetStrength) { 196 setParseError("reset-before strength followed by a stronger relation"); 197 return; 198 } 199 } 200 } 201 int i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator 202 if((result & STARRED_FLAG) == 0) { 203 parseRelationStrings(strength, i); 204 } else { 205 parseStarredCharacters(strength, i); 206 } 207 isFirstRelation = false; 208 } 209 } 210 211 private int parseResetAndPosition() throws ParseException { 212 int i = skipWhiteSpace(ruleIndex + 1); 213 int j; 214 char c; 215 int resetStrength; 216 if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) && 217 (j = i + BEFORE.length()) < rules.length() && 218 PatternProps.isWhiteSpace(rules.charAt(j)) && 219 ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() && 220 0x31 <= (c = rules.charAt(j)) && c <= 0x33 && 221 rules.charAt(j + 1) == 0x5d) { 222 // &[before n] with n=1 or 2 or 3 223 resetStrength = Collator.PRIMARY + (c - 0x31); 224 i = skipWhiteSpace(j + 2); 225 } else { 226 resetStrength = Collator.IDENTICAL; 227 } 228 if(i >= rules.length()) { 229 setParseError("reset without position"); 230 return UCOL_DEFAULT; 231 } 232 if(rules.charAt(i) == 0x5b) { // '[' 233 i = parseSpecialPosition(i, rawBuilder); 234 } else { 235 i = parseTailoringString(i, rawBuilder); 236 } 237 try { 238 sink.addReset(resetStrength, rawBuilder); 239 } catch(Exception e) { 240 setParseError("adding reset failed", e); 241 return UCOL_DEFAULT; 242 } 243 ruleIndex = i; 244 return resetStrength; 245 } 246 247 private int parseRelationOperator() { 248 ruleIndex = skipWhiteSpace(ruleIndex); 249 if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; } 250 int strength; 251 int i = ruleIndex; 252 char c = rules.charAt(i++); 253 switch(c) { 254 case 0x3c: // '<' 255 if(i < rules.length() && rules.charAt(i) == 0x3c) { // << 256 ++i; 257 if(i < rules.length() && rules.charAt(i) == 0x3c) { // <<< 258 ++i; 259 if(i < rules.length() && rules.charAt(i) == 0x3c) { // <<<< 260 ++i; 261 strength = Collator.QUATERNARY; 262 } else { 263 strength = Collator.TERTIARY; 264 } 265 } else { 266 strength = Collator.SECONDARY; 267 } 268 } else { 269 strength = Collator.PRIMARY; 270 } 271 if(i < rules.length() && rules.charAt(i) == 0x2a) { // '*' 272 ++i; 273 strength |= STARRED_FLAG; 274 } 275 break; 276 case 0x3b: // ';' same as << 277 strength = Collator.SECONDARY; 278 break; 279 case 0x2c: // ',' same as <<< 280 strength = Collator.TERTIARY; 281 break; 282 case 0x3d: // '=' 283 strength = Collator.IDENTICAL; 284 if(i < rules.length() && rules.charAt(i) == 0x2a) { // '*' 285 ++i; 286 strength |= STARRED_FLAG; 287 } 288 break; 289 default: 290 return UCOL_DEFAULT; 291 } 292 return ((i - ruleIndex) << OFFSET_SHIFT) | strength; 293 } 294 295 private void parseRelationStrings(int strength, int i) throws ParseException { 296 // Parse 297 // prefix | str / extension 298 // where prefix and extension are optional. 299 String prefix = ""; 300 CharSequence extension = ""; 301 i = parseTailoringString(i, rawBuilder); 302 char next = (i < rules.length()) ? rules.charAt(i) : 0; 303 if(next == 0x7c) { // '|' separates the context prefix from the string. 304 prefix = rawBuilder.toString(); 305 i = parseTailoringString(i + 1, rawBuilder); 306 next = (i < rules.length()) ? rules.charAt(i) : 0; 307 } 308 // str = rawBuilder (do not modify rawBuilder any more in this function) 309 if(next == 0x2f) { // '/' separates the string from the extension. 310 StringBuilder extBuilder = new StringBuilder(); 311 i = parseTailoringString(i + 1, extBuilder); 312 extension = extBuilder; 313 } 314 if(prefix.length() != 0) { 315 int prefix0 = prefix.codePointAt(0); 316 int c = rawBuilder.codePointAt(0); 317 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) { 318 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary"); 319 return; 320 } 321 } 322 try { 323 sink.addRelation(strength, prefix, rawBuilder, extension); 324 } catch(Exception e) { 325 setParseError("adding relation failed", e); 326 return; 327 } 328 ruleIndex = i; 329 } 330 331 private void parseStarredCharacters(int strength, int i) throws ParseException { 332 String empty = ""; 333 i = parseString(skipWhiteSpace(i), rawBuilder); 334 if(rawBuilder.length() == 0) { 335 setParseError("missing starred-relation string"); 336 return; 337 } 338 int prev = -1; 339 int j = 0; 340 for(;;) { 341 while(j < rawBuilder.length()) { 342 int c = rawBuilder.codePointAt(j); 343 if(!nfd.isInert(c)) { 344 setParseError("starred-relation string is not all NFD-inert"); 345 return; 346 } 347 try { 348 sink.addRelation(strength, empty, UTF16.valueOf(c), empty); 349 } catch(Exception e) { 350 setParseError("adding relation failed", e); 351 return; 352 } 353 j += Character.charCount(c); 354 prev = c; 355 } 356 if(i >= rules.length() || rules.charAt(i) != 0x2d) { // '-' 357 break; 358 } 359 if(prev < 0) { 360 setParseError("range without start in starred-relation string"); 361 return; 362 } 363 i = parseString(i + 1, rawBuilder); 364 if(rawBuilder.length() == 0) { 365 setParseError("range without end in starred-relation string"); 366 return; 367 } 368 int c = rawBuilder.codePointAt(0); 369 if(c < prev) { 370 setParseError("range start greater than end in starred-relation string"); 371 return; 372 } 373 // range prev-c 374 while(++prev <= c) { 375 if(!nfd.isInert(prev)) { 376 setParseError("starred-relation string range is not all NFD-inert"); 377 return; 378 } 379 if(isSurrogate(prev)) { 380 setParseError("starred-relation string range contains a surrogate"); 381 return; 382 } 383 if(0xfffd <= prev && prev <= 0xffff) { 384 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF"); 385 return; 386 } 387 try { 388 sink.addRelation(strength, empty, UTF16.valueOf(prev), empty); 389 } catch(Exception e) { 390 setParseError("adding relation failed", e); 391 return; 392 } 393 } 394 prev = -1; 395 j = Character.charCount(c); 396 } 397 ruleIndex = skipWhiteSpace(i); 398 } 399 400 private int parseTailoringString(int i, StringBuilder raw) throws ParseException { 401 i = parseString(skipWhiteSpace(i), raw); 402 if(raw.length() == 0) { 403 setParseError("missing relation string"); 404 } 405 return skipWhiteSpace(i); 406 } 407 408 private int parseString(int i, StringBuilder raw) throws ParseException { 409 raw.setLength(0); 410 while(i < rules.length()) { 411 char c = rules.charAt(i++); 412 if(isSyntaxChar(c)) { 413 if(c == 0x27) { // apostrophe 414 if(i < rules.length() && rules.charAt(i) == 0x27) { 415 // Double apostrophe, encodes a single one. 416 raw.append((char)0x27); 417 ++i; 418 continue; 419 } 420 // Quote literal text until the next single apostrophe. 421 for(;;) { 422 if(i == rules.length()) { 423 setParseError("quoted literal text missing terminating apostrophe"); 424 return i; 425 } 426 c = rules.charAt(i++); 427 if(c == 0x27) { 428 if(i < rules.length() && rules.charAt(i) == 0x27) { 429 // Double apostrophe inside quoted literal text, 430 // still encodes a single apostrophe. 431 ++i; 432 } else { 433 break; 434 } 435 } 436 raw.append(c); 437 } 438 } else if(c == 0x5c) { // backslash 439 if(i == rules.length()) { 440 setParseError("backslash escape at the end of the rule string"); 441 return i; 442 } 443 int cp = rules.codePointAt(i); 444 raw.appendCodePoint(cp); 445 i += Character.charCount(cp); 446 } else { 447 // Any other syntax character terminates a string. 448 --i; 449 break; 450 } 451 } else if(PatternProps.isWhiteSpace(c)) { 452 // Unquoted white space terminates a string. 453 --i; 454 break; 455 } else { 456 raw.append(c); 457 } 458 } 459 for(int j = 0; j < raw.length();) { 460 int c = raw.codePointAt(j); 461 if(isSurrogate(c)) { 462 setParseError("string contains an unpaired surrogate"); 463 return i; 464 } 465 if(0xfffd <= c && c <= 0xffff) { 466 setParseError("string contains U+FFFD, U+FFFE or U+FFFF"); 467 return i; 468 } 469 j += Character.charCount(c); 470 } 471 return i; 472 } 473 474 // TODO: Widen UTF16.isSurrogate(char16) to take an int. 475 private static final boolean isSurrogate(int c) { 476 return (c & 0xfffff800) == 0xd800; 477 } 478 479 private static final String[] positions = { 480 "first tertiary ignorable", 481 "last tertiary ignorable", 482 "first secondary ignorable", 483 "last secondary ignorable", 484 "first primary ignorable", 485 "last primary ignorable", 486 "first variable", 487 "last variable", 488 "first regular", 489 "last regular", 490 "first implicit", 491 "last implicit", 492 "first trailing", 493 "last trailing" 494 }; 495 496 /** 497 * Sets str to a contraction of U+FFFE and (U+2800 + Position). 498 * @return rule index after the special reset position 499 * @throws ParseException 500 */ 501 private int parseSpecialPosition(int i, StringBuilder str) throws ParseException { 502 int j = readWords(i + 1, rawBuilder); 503 if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) { // words end with ] 504 ++j; 505 String raw = rawBuilder.toString(); 506 str.setLength(0); 507 for(int pos = 0; pos < positions.length; ++pos) { 508 if(raw.equals(positions[pos])) { 509 str.append(POS_LEAD).append((char)(POS_BASE + pos)); 510 return j; 511 } 512 } 513 if(raw.equals("top")) { 514 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal())); 515 return j; 516 } 517 if(raw.equals("variable top")) { 518 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal())); 519 return j; 520 } 521 } 522 setParseError("not a valid special reset position"); 523 return i; 524 } 525 526 private void parseSetting() throws ParseException { 527 int i = ruleIndex + 1; 528 int j = readWords(i, rawBuilder); 529 if(j <= i || rawBuilder.length() == 0) { 530 setParseError("expected a setting/option at '['"); 531 } 532 // startsWith() etc. are available for String but not CharSequence/StringBuilder. 533 String raw = rawBuilder.toString(); 534 if(rules.charAt(j) == 0x5d) { // words end with ] 535 ++j; 536 if(raw.startsWith("reorder") && 537 (raw.length() == 7 || raw.charAt(7) == 0x20)) { 538 parseReordering(raw); 539 ruleIndex = j; 540 return; 541 } 542 if(raw.equals("backwards 2")) { 543 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true); 544 ruleIndex = j; 545 return; 546 } 547 String v; 548 int valueIndex = raw.lastIndexOf(0x20); 549 if(valueIndex >= 0) { 550 v = raw.substring(valueIndex + 1); 551 raw = raw.substring(0, valueIndex); 552 } else { 553 v = ""; 554 } 555 if(raw.equals("strength") && v.length() == 1) { 556 int value = UCOL_DEFAULT; 557 char c = v.charAt(0); 558 if(0x31 <= c && c <= 0x34) { // 1..4 559 value = Collator.PRIMARY + (c - 0x31); 560 } else if(c == 0x49) { // 'I' 561 value = Collator.IDENTICAL; 562 } 563 if(value != UCOL_DEFAULT) { 564 settings.setStrength(value); 565 ruleIndex = j; 566 return; 567 } 568 } else if(raw.equals("alternate")) { 569 int value = UCOL_DEFAULT; 570 if(v.equals("non-ignorable")) { 571 value = 0; // UCOL_NON_IGNORABLE 572 } else if(v.equals("shifted")) { 573 value = 1; // UCOL_SHIFTED 574 } 575 if(value != UCOL_DEFAULT) { 576 settings.setAlternateHandlingShifted(value > 0); 577 ruleIndex = j; 578 return; 579 } 580 } else if(raw.equals("maxVariable")) { 581 int value = UCOL_DEFAULT; 582 if(v.equals("space")) { 583 value = CollationSettings.MAX_VAR_SPACE; 584 } else if(v.equals("punct")) { 585 value = CollationSettings.MAX_VAR_PUNCT; 586 } else if(v.equals("symbol")) { 587 value = CollationSettings.MAX_VAR_SYMBOL; 588 } else if(v.equals("currency")) { 589 value = CollationSettings.MAX_VAR_CURRENCY; 590 } 591 if(value != UCOL_DEFAULT) { 592 settings.setMaxVariable(value, 0); 593 settings.variableTop = baseData.getLastPrimaryForGroup( 594 Collator.ReorderCodes.FIRST + value); 595 assert(settings.variableTop != 0); 596 ruleIndex = j; 597 return; 598 } 599 } else if(raw.equals("caseFirst")) { 600 int value = UCOL_DEFAULT; 601 if(v.equals("off")) { 602 value = UCOL_OFF; 603 } else if(v.equals("lower")) { 604 value = CollationSettings.CASE_FIRST; // UCOL_LOWER_FIRST 605 } else if(v.equals("upper")) { 606 value = CollationSettings.CASE_FIRST_AND_UPPER_MASK; // UCOL_UPPER_FIRST 607 } 608 if(value != UCOL_DEFAULT) { 609 settings.setCaseFirst(value); 610 ruleIndex = j; 611 return; 612 } 613 } else if(raw.equals("caseLevel")) { 614 int value = getOnOffValue(v); 615 if(value != UCOL_DEFAULT) { 616 settings.setFlag(CollationSettings.CASE_LEVEL, value > 0); 617 ruleIndex = j; 618 return; 619 } 620 } else if(raw.equals("normalization")) { 621 int value = getOnOffValue(v); 622 if(value != UCOL_DEFAULT) { 623 settings.setFlag(CollationSettings.CHECK_FCD, value > 0); 624 ruleIndex = j; 625 return; 626 } 627 } else if(raw.equals("numericOrdering")) { 628 int value = getOnOffValue(v); 629 if(value != UCOL_DEFAULT) { 630 settings.setFlag(CollationSettings.NUMERIC, value > 0); 631 ruleIndex = j; 632 return; 633 } 634 } else if(raw.equals("hiraganaQ")) { 635 int value = getOnOffValue(v); 636 if(value != UCOL_DEFAULT) { 637 if(value == UCOL_ON) { 638 setParseError("[hiraganaQ on] is not supported"); 639 } 640 ruleIndex = j; 641 return; 642 } 643 } else if(raw.equals("import")) { 644 // BCP 47 language tag -> ICU locale ID 645 ULocale localeID; 646 try { 647 localeID = new ULocale.Builder().setLanguageTag(v).build(); 648 } catch(Exception e) { 649 setParseError("expected language tag in [import langTag]", e); 650 return; 651 } 652 // localeID minus all keywords 653 String baseID = localeID.getBaseName(); 654 // @collation=type, or length=0 if not specified 655 String collationType = localeID.getKeywordValue("collation"); 656 if(importer == null) { 657 setParseError("[import langTag] is not supported"); 658 } else { 659 String importedRules; 660 try { 661 importedRules = 662 importer.getRules(baseID, 663 collationType != null ? collationType : "standard"); 664 } catch(Exception e) { 665 setParseError("[import langTag] failed", e); 666 return; 667 } 668 String outerRules = rules; 669 int outerRuleIndex = ruleIndex; 670 try { 671 parse(importedRules); 672 } catch(Exception e) { 673 ruleIndex = outerRuleIndex; // Restore the original index for error reporting. 674 setParseError("parsing imported rules failed", e); 675 } 676 rules = outerRules; 677 ruleIndex = j; 678 } 679 return; 680 } 681 } else if(rules.charAt(j) == 0x5b) { // words end with [ 682 UnicodeSet set = new UnicodeSet(); 683 j = parseUnicodeSet(j, set); 684 if(raw.equals("optimize")) { 685 try { 686 sink.optimize(set); 687 } catch(Exception e) { 688 setParseError("[optimize set] failed", e); 689 } 690 ruleIndex = j; 691 return; 692 } else if(raw.equals("suppressContractions")) { 693 try { 694 sink.suppressContractions(set); 695 } catch(Exception e) { 696 setParseError("[suppressContractions set] failed", e); 697 } 698 ruleIndex = j; 699 return; 700 } 701 } 702 setParseError("not a valid setting/option"); 703 } 704 705 private void parseReordering(CharSequence raw) throws ParseException { 706 int i = 7; // after "reorder" 707 if(i == raw.length()) { 708 // empty [reorder] with no codes 709 settings.resetReordering(); 710 return; 711 } 712 // Parse the codes in [reorder aa bb cc]. 713 ArrayList<Integer> reorderCodes = new ArrayList<Integer>(); 714 while(i < raw.length()) { 715 ++i; // skip the word-separating space 716 int limit = i; 717 while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; } 718 String word = raw.subSequence(i, limit).toString(); 719 int code = getReorderCode(word); 720 if(code < 0) { 721 setParseError("unknown script or reorder code"); 722 return; 723 } 724 reorderCodes.add(code); 725 i = limit; 726 } 727 if(reorderCodes.isEmpty()) { 728 settings.resetReordering(); 729 } else { 730 int[] codes = new int[reorderCodes.size()]; 731 int j = 0; 732 for(Integer code : reorderCodes) { codes[j++] = code; } 733 settings.setReordering(baseData, codes); 734 } 735 } 736 737 private static final String[] gSpecialReorderCodes = { 738 "space", "punct", "symbol", "currency", "digit" 739 }; 740 741 /** 742 * Gets a script or reorder code from its string representation. 743 * @return the script/reorder code, or 744 * -1 if not recognized 745 */ 746 public static int getReorderCode(String word) { 747 for(int i = 0; i < gSpecialReorderCodes.length; ++i) { 748 if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) { 749 return Collator.ReorderCodes.FIRST + i; 750 } 751 } 752 try { 753 int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word); 754 if(script >= 0) { 755 return script; 756 } 757 } catch (IllegalIcuArgumentException e) { 758 // fall through 759 } 760 if(word.equalsIgnoreCase("others")) { 761 return Collator.ReorderCodes.OTHERS; // same as Zzzz = USCRIPT_UNKNOWN 762 } 763 return -1; 764 } 765 766 private static int getOnOffValue(String s) { 767 if(s.equals("on")) { 768 return UCOL_ON; 769 } else if(s.equals("off")) { 770 return UCOL_OFF; 771 } else { 772 return UCOL_DEFAULT; 773 } 774 } 775 776 private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException { 777 // Collect a UnicodeSet pattern between a balanced pair of [brackets]. 778 int level = 0; 779 int j = i; 780 for(;;) { 781 if(j == rules.length()) { 782 setParseError("unbalanced UnicodeSet pattern brackets"); 783 return j; 784 } 785 char c = rules.charAt(j++); 786 if(c == 0x5b) { // '[' 787 ++level; 788 } else if(c == 0x5d) { // ']' 789 if(--level == 0) { break; } 790 } 791 } 792 try { 793 set.applyPattern(rules.substring(i, j)); 794 } catch(Exception e) { 795 setParseError("not a valid UnicodeSet pattern: " + e.getMessage()); 796 } 797 j = skipWhiteSpace(j); 798 if(j == rules.length() || rules.charAt(j) != 0x5d) { 799 setParseError("missing option-terminating ']' after UnicodeSet pattern"); 800 return j; 801 } 802 return ++j; 803 } 804 805 private int readWords(int i, StringBuilder raw) { 806 raw.setLength(0); 807 i = skipWhiteSpace(i); 808 for(;;) { 809 if(i >= rules.length()) { return 0; } 810 char c = rules.charAt(i); 811 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_ 812 if(raw.length() == 0) { return i; } 813 int lastIndex = raw.length() - 1; 814 if(raw.charAt(lastIndex) == ' ') { // remove trailing space 815 raw.setLength(lastIndex); 816 } 817 return i; 818 } 819 if(PatternProps.isWhiteSpace(c)) { 820 raw.append(' '); 821 i = skipWhiteSpace(i + 1); 822 } else { 823 raw.append(c); 824 ++i; 825 } 826 } 827 } 828 829 private int skipComment(int i) { 830 // skip to past the newline 831 while(i < rules.length()) { 832 char c = rules.charAt(i++); 833 // LF or FF or CR or NEL or LS or PS 834 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) { 835 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS." 836 // NLF (new line function) = CR or LF or CR+LF or NEL. 837 // No need to collect all of CR+LF because a following LF will be ignored anyway. 838 break; 839 } 840 } 841 return i; 842 } 843 844 private void setParseError(String reason) throws ParseException { 845 throw makeParseException(reason); 846 } 847 848 private void setParseError(String reason, Exception e) throws ParseException { 849 ParseException newExc = makeParseException(reason + ": " + e.getMessage()); 850 newExc.initCause(e); 851 throw newExc; 852 } 853 854 private ParseException makeParseException(String reason) { 855 return new ParseException(appendErrorContext(reason), ruleIndex); 856 } 857 858 private static final int U_PARSE_CONTEXT_LEN = 16; 859 860 // C++ setErrorContext() 861 private String appendErrorContext(String reason) { 862 // Note: This relies on the calling code maintaining the ruleIndex 863 // at a position that is useful for debugging. 864 // For example, at the beginning of a reset or relation etc. 865 StringBuilder msg = new StringBuilder(reason); 866 msg.append(" at index ").append(ruleIndex); 867 // We are not counting line numbers. 868 869 msg.append(" near \""); 870 // before ruleIndex 871 int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1); 872 if(start < 0) { 873 start = 0; 874 } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) { 875 ++start; 876 } 877 msg.append(rules, start, ruleIndex); 878 879 msg.append('!'); 880 // starting from ruleIndex 881 int length = rules.length() - ruleIndex; 882 if(length >= U_PARSE_CONTEXT_LEN) { 883 length = U_PARSE_CONTEXT_LEN - 1; 884 if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) { 885 --length; 886 } 887 } 888 msg.append(rules, ruleIndex, ruleIndex + length); 889 return msg.append('\"').toString(); 890 } 891 892 /** 893 * ASCII [:P:] and [:S:]: 894 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] 895 */ 896 private static boolean isSyntaxChar(int c) { 897 return 0x21 <= c && c <= 0x7e && 898 (c <= 0x2f || (0x3a <= c && c <= 0x40) || 899 (0x5b <= c && c <= 0x60) || (0x7b <= c)); 900 } 901 902 private int skipWhiteSpace(int i) { 903 while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) { 904 ++i; 905 } 906 return i; 907 } 908 909 private Normalizer2 nfd = Normalizer2.getNFDInstance(); 910 private Normalizer2 nfc = Normalizer2.getNFCInstance(); 911 912 private String rules; 913 private final CollationData baseData; 914 private CollationSettings settings; 915 916 private Sink sink; 917 private Importer importer; 918 919 private int ruleIndex; 920 } 921