1 // 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 package com.ibm.icu.impl.number; 4 5 import java.math.BigDecimal; 6 import java.math.MathContext; 7 import java.text.ParseException; 8 import java.text.ParsePosition; 9 import java.util.HashSet; 10 import java.util.Iterator; 11 import java.util.Set; 12 import java.util.concurrent.ConcurrentHashMap; 13 14 import com.ibm.icu.impl.StandardPlural; 15 import com.ibm.icu.impl.TextTrieMap; 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.CurrencyPluralInfo; 18 import com.ibm.icu.text.DecimalFormatSymbols; 19 import com.ibm.icu.text.NumberFormat; 20 import com.ibm.icu.text.UnicodeSet; 21 import com.ibm.icu.util.Currency; 22 import com.ibm.icu.util.Currency.CurrencyStringInfo; 23 import com.ibm.icu.util.CurrencyAmount; 24 import com.ibm.icu.util.ULocale; 25 26 /** 27 * A parser designed to convert an arbitrary human-generated string to its best representation as a 28 * number: a long, a BigInteger, or a BigDecimal. 29 * 30 * <p>The parser may traverse multiple parse paths in the same strings if there is ambiguity. For 31 * example, the string "12,345.67" has two main interpretations: it could be "12.345" in a locale 32 * that uses '.' as the grouping separator, or it could be "12345.67" in a locale that uses ',' as 33 * the grouping separator. Since the second option has a longer parse path (consumes more of the 34 * input string), the parser will accept the second option. 35 */ 36 public class Parse { 37 38 /** Controls the set of rules for parsing a string. */ 39 public static enum ParseMode { 40 /** 41 * Lenient mode should be used if you want to accept malformed user input. It will use 42 * heuristics to attempt to parse through typographical errors in the string. 43 */ 44 LENIENT, 45 46 /** 47 * Strict mode should be used if you want to require that the input is well-formed. More 48 * specifically, it differs from lenient mode in the following ways: 49 * 50 * <ul> 51 * <li>Grouping widths must match the grouping settings. For example, "12,3,45" will fail if 52 * the grouping width is 3, as in the pattern "#,##0". 53 * <li>The string must contain a complete prefix and suffix. For example, if the pattern is 54 * "{#};(#)", then "{123}" or "(123)" would match, but "{123", "123}", and "123" would all 55 * fail. (The latter strings would be accepted in lenient mode.) 56 * <li>Whitespace may not appear at arbitrary places in the string. In lenient mode, 57 * whitespace is allowed to occur arbitrarily before and after prefixes and exponent 58 * separators. 59 * <li>Leading grouping separators are not allowed, as in ",123". 60 * <li>Minus and plus signs can only appear if specified in the pattern. In lenient mode, a 61 * plus or minus sign can always precede a number. 62 * <li>The set of characters that can be interpreted as a decimal or grouping separator is 63 * smaller. 64 * <li><strong>If currency parsing is enabled,</strong> currencies must only appear where 65 * specified in either the current pattern string or in a valid pattern string for the 66 * current locale. For example, if the pattern is "0.00", then "$1.23" would match, but 67 * "1.23$" would fail to match. 68 * </ul> 69 */ 70 STRICT, 71 72 /** 73 * Fast mode should be used in applications that don't require prefixes and suffixes to match. 74 * 75 * <p>In addition to ignoring prefixes and suffixes, fast mode performs the following 76 * optimizations: 77 * 78 * <ul> 79 * <li>Ignores digit strings from {@link DecimalFormatSymbols} and only uses the code point's 80 * Unicode digit property. If you are not using custom digit strings, this should not 81 * cause a change in behavior. 82 * <li>Instead of traversing multiple possible parse paths, a "greedy" parsing strategy is 83 * used, which might mean that fast mode won't accept strings that lenient or strict mode 84 * would accept. Since prefix and suffix strings are ignored, this is not an issue unless 85 * you are using custom symbols. 86 * </ul> 87 */ 88 FAST, 89 } 90 91 /** 92 * An enum containing the choices for strategy in parsing when choosing between grouping and 93 * decimal separators. 94 */ 95 public static enum GroupingMode { 96 /** 97 * Accept decimal equivalents as decimals, and if that fails, accept all equivalence classes 98 * (periods, commas, and whitespace-like) as grouping. This is a more lenient strategy. 99 * 100 * <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will parse 101 * as 1234, even though <em>fr-FR</em> does not use a period as the grouping separator. 102 */ 103 DEFAULT, 104 105 /** 106 * Accept decimal equivalents as decimals and grouping equivalents as grouping. This strategy is 107 * more strict. 108 * 109 * <p>For example, if the formatter's current locale is <em>fr-FR</em>, then "1.234" will fail 110 * to parse since <em>fr-FR</em> does not use a period as the grouping separator. 111 */ 112 RESTRICTED 113 } 114 115 /** 116 * @see Parse#parse(String, ParsePosition, ParseMode, boolean, boolean, DecimalFormatProperties, 117 * DecimalFormatSymbols) 118 */ 119 private static enum StateName { 120 BEFORE_PREFIX, 121 AFTER_PREFIX, 122 AFTER_INTEGER_DIGIT, 123 AFTER_FRACTION_DIGIT, 124 AFTER_EXPONENT_SEPARATOR, 125 AFTER_EXPONENT_DIGIT, 126 BEFORE_SUFFIX, 127 BEFORE_SUFFIX_SEEN_EXPONENT, 128 AFTER_SUFFIX, 129 INSIDE_CURRENCY, 130 INSIDE_DIGIT, 131 INSIDE_STRING, 132 INSIDE_AFFIX_PATTERN; 133 } 134 135 // This set was decided after discussion with icu-design@. See ticket #13309. 136 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). 137 private static final UnicodeSet UNISET_WHITESPACE = 138 new UnicodeSet("[[:Zs:][\\u0009]]").freeze(); 139 140 // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. 141 private static final UnicodeSet UNISET_BIDI = 142 new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze(); 143 144 // TODO: Re-generate these sets from the database. They probably haven't been updated in a while. 145 private static final UnicodeSet UNISET_PERIOD_LIKE = 146 new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze(); 147 private static final UnicodeSet UNISET_STRICT_PERIOD_LIKE = 148 new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze(); 149 private static final UnicodeSet UNISET_COMMA_LIKE = 150 new UnicodeSet("[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze(); 151 private static final UnicodeSet UNISET_STRICT_COMMA_LIKE = 152 new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze(); 153 private static final UnicodeSet UNISET_OTHER_GROUPING_SEPARATORS = 154 new UnicodeSet( 155 "[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]") 156 .freeze(); 157 158 // For parse return value calculation. 159 private static final BigDecimal MIN_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MIN_VALUE); 160 private static final BigDecimal MAX_LONG_AS_BIG_DECIMAL = new BigDecimal(Long.MAX_VALUE); 161 162 private enum SeparatorType { 163 COMMA_LIKE, 164 PERIOD_LIKE, 165 OTHER_GROUPING, 166 UNKNOWN; 167 168 static SeparatorType fromCp(int cp, ParseMode mode) { 169 if (mode == ParseMode.FAST) { 170 return SeparatorType.UNKNOWN; 171 } else if (mode == ParseMode.STRICT) { 172 if (UNISET_STRICT_COMMA_LIKE.contains(cp)) return COMMA_LIKE; 173 if (UNISET_STRICT_PERIOD_LIKE.contains(cp)) return PERIOD_LIKE; 174 if (UNISET_OTHER_GROUPING_SEPARATORS.contains(cp)) return OTHER_GROUPING; 175 return UNKNOWN; 176 } else { 177 if (UNISET_COMMA_LIKE.contains(cp)) return COMMA_LIKE; 178 if (UNISET_PERIOD_LIKE.contains(cp)) return PERIOD_LIKE; 179 if (UNISET_OTHER_GROUPING_SEPARATORS.contains(cp)) return OTHER_GROUPING; 180 return UNKNOWN; 181 } 182 } 183 } 184 185 private static enum DigitType { 186 INTEGER, 187 FRACTION, 188 EXPONENT 189 } 190 191 /** 192 * Holds a snapshot in time of a single parse path. This includes the digits seen so far, the 193 * current state name, and other properties like the grouping separator used on this parse path, 194 * details about the exponent and negative signs, etc. 195 */ 196 private static class StateItem { 197 // Parser state: 198 // The "trailingChars" is used to keep track of how many characters from the end of the string 199 // are ignorable and should be removed from the parse position should this item be accepted. 200 // The "score" is used to help rank two otherwise equivalent parse paths. Currently, the only 201 // function giving points to the score is prefix/suffix. 202 StateName name; 203 int trailingCount; 204 int score; 205 206 // Numerical value: 207 DecimalQuantity_DualStorageBCD fq = new DecimalQuantity_DualStorageBCD(); 208 int numDigits; 209 int trailingZeros; 210 int exponent; 211 212 // Other items that we've seen: 213 int groupingCp; 214 long groupingWidths; 215 String isoCode; 216 boolean sawNegative; 217 boolean sawNegativeExponent; 218 boolean sawCurrency; 219 boolean sawNaN; 220 boolean sawInfinity; 221 AffixHolder affix; 222 boolean sawPrefix; 223 boolean sawSuffix; 224 boolean sawDecimalPoint; 225 boolean sawExponentDigit; 226 227 // Data for intermediate parsing steps: 228 StateName returnTo1; 229 StateName returnTo2; 230 // For string literals: 231 CharSequence currentString; 232 int currentOffset; 233 boolean currentTrailing; 234 // For affix patterns: 235 CharSequence currentAffixPattern; 236 long currentStepwiseParserTag; 237 // For currency: 238 TextTrieMap<CurrencyStringInfo>.ParseState currentCurrencyTrieState; 239 // For multi-code-point digits: 240 TextTrieMap<Byte>.ParseState currentDigitTrieState; 241 DigitType currentDigitType; 242 243 // Identification for path tracing: 244 final char id; 245 String path; 246 247 StateItem(char _id) { 248 id = _id; 249 } 250 251 /** 252 * Clears the instance so that it can be re-used. 253 * 254 * @return Myself, for chaining. 255 */ 256 StateItem clear() { 257 // Parser state: 258 name = StateName.BEFORE_PREFIX; 259 trailingCount = 0; 260 score = 0; 261 262 // Numerical value: 263 fq.clear(); 264 numDigits = 0; 265 trailingZeros = 0; 266 exponent = 0; 267 268 // Other items we've seen: 269 groupingCp = -1; 270 groupingWidths = 0L; 271 isoCode = null; 272 sawNegative = false; 273 sawNegativeExponent = false; 274 sawCurrency = false; 275 sawNaN = false; 276 sawInfinity = false; 277 affix = null; 278 sawPrefix = false; 279 sawSuffix = false; 280 sawDecimalPoint = false; 281 sawExponentDigit = false; 282 283 // Data for intermediate parsing steps: 284 returnTo1 = null; 285 returnTo2 = null; 286 currentString = null; 287 currentOffset = 0; 288 currentTrailing = false; 289 currentAffixPattern = null; 290 currentStepwiseParserTag = 0L; 291 currentCurrencyTrieState = null; 292 currentDigitTrieState = null; 293 currentDigitType = null; 294 295 // Identification for path tracing: 296 // id is constant and is not cleared 297 path = ""; 298 299 return this; 300 } 301 302 /** 303 * Sets the internal value of this instance equal to another instance. 304 * 305 * <p>newName and cpOrN1 are required as parameters to this function because every time a code 306 * point is consumed and a state item is copied, both of the corresponding fields should be 307 * updated; it would be an error if they weren't updated. 308 * 309 * @param other The instance to copy from. 310 * @param newName The state name that the new copy should take on. 311 * @param trailing If positive, record this code point as trailing; if negative, reset the 312 * trailing count to zero. 313 * @return Myself, for chaining. 314 */ 315 StateItem copyFrom(StateItem other, StateName newName, int trailing) { 316 // Parser state: 317 name = newName; 318 score = other.score; 319 320 // Either reset trailingCount or add the width of the current code point. 321 trailingCount = (trailing < 0) ? 0 : other.trailingCount + Character.charCount(trailing); 322 323 // Numerical value: 324 fq.copyFrom(other.fq); 325 numDigits = other.numDigits; 326 trailingZeros = other.trailingZeros; 327 exponent = other.exponent; 328 329 // Other items we've seen: 330 groupingCp = other.groupingCp; 331 groupingWidths = other.groupingWidths; 332 isoCode = other.isoCode; 333 sawNegative = other.sawNegative; 334 sawNegativeExponent = other.sawNegativeExponent; 335 sawCurrency = other.sawCurrency; 336 sawNaN = other.sawNaN; 337 sawInfinity = other.sawInfinity; 338 affix = other.affix; 339 sawPrefix = other.sawPrefix; 340 sawSuffix = other.sawSuffix; 341 sawDecimalPoint = other.sawDecimalPoint; 342 sawExponentDigit = other.sawExponentDigit; 343 344 // Data for intermediate parsing steps: 345 returnTo1 = other.returnTo1; 346 returnTo2 = other.returnTo2; 347 currentString = other.currentString; 348 currentOffset = other.currentOffset; 349 currentTrailing = other.currentTrailing; 350 currentAffixPattern = other.currentAffixPattern; 351 currentStepwiseParserTag = other.currentStepwiseParserTag; 352 currentCurrencyTrieState = other.currentCurrencyTrieState; 353 currentDigitTrieState = other.currentDigitTrieState; 354 currentDigitType = other.currentDigitType; 355 356 // Record source node if debugging 357 if (DEBUGGING) { 358 path = other.path + other.id; 359 } 360 361 return this; 362 } 363 364 /** 365 * Adds a digit to the internal representation of this instance. 366 * 367 * @param digit The digit that was read from the string. 368 * @param type Whether the digit occured after the decimal point. 369 */ 370 void appendDigit(byte digit, DigitType type) { 371 if (type == DigitType.EXPONENT) { 372 sawExponentDigit = true; 373 int newExponent = exponent * 10 + digit; 374 if (newExponent < exponent) { 375 // overflow 376 exponent = Integer.MAX_VALUE; 377 } else { 378 exponent = newExponent; 379 } 380 } else { 381 numDigits++; 382 if (type == DigitType.FRACTION && digit == 0) { 383 trailingZeros++; 384 } else if (type == DigitType.FRACTION) { 385 fq.appendDigit(digit, trailingZeros, false); 386 trailingZeros = 0; 387 } else { 388 fq.appendDigit(digit, 0, true); 389 } 390 } 391 } 392 393 /** @return Whether or not this item contains a valid number. */ 394 public boolean hasNumber() { 395 return numDigits > 0 || sawNaN || sawInfinity; 396 } 397 398 /** 399 * Converts the internal digits from this instance into a Number, preferring a Long, then a 400 * BigInteger, then a BigDecimal. A Double is used for NaN, infinity, and -0.0. 401 * 402 * @return The Number. Never null. 403 */ 404 Number toNumber(DecimalFormatProperties properties) { 405 // Check for NaN, infinity, and -0.0 406 if (sawNaN) { 407 return Double.NaN; 408 } 409 if (sawInfinity) { 410 if (sawNegative) { 411 return Double.NEGATIVE_INFINITY; 412 } else { 413 return Double.POSITIVE_INFINITY; 414 } 415 } 416 if (fq.isZero() && sawNegative) { 417 return -0.0; 418 } 419 420 // Check for exponent overflow 421 boolean forceBigDecimal = properties.getParseToBigDecimal(); 422 if (exponent == Integer.MAX_VALUE) { 423 if (sawNegativeExponent && sawNegative) { 424 return -0.0; 425 } else if (sawNegativeExponent) { 426 return 0.0; 427 } else if (sawNegative) { 428 return Double.NEGATIVE_INFINITY; 429 } else { 430 return Double.POSITIVE_INFINITY; 431 } 432 } else if (exponent > 1000) { 433 // BigDecimals can handle huge values better than BigIntegers. 434 forceBigDecimal = true; 435 } 436 437 // Multipliers must be applied in reverse. 438 BigDecimal multiplier = properties.getMultiplier(); 439 if (properties.getMagnitudeMultiplier() != 0) { 440 if (multiplier == null) multiplier = BigDecimal.ONE; 441 multiplier = multiplier.scaleByPowerOfTen(properties.getMagnitudeMultiplier()); 442 } 443 int delta = (sawNegativeExponent ? -1 : 1) * exponent; 444 445 // We need to use a math context in order to prevent non-terminating decimal expansions. 446 // This is only used when dividing by the multiplier. 447 MathContext mc = RoundingUtils.getMathContextOr34Digits(properties); 448 449 // Construct the output number. 450 // This is the only step during fast-mode parsing that incurs object creations. 451 BigDecimal result = fq.toBigDecimal(); 452 if (sawNegative) result = result.negate(); 453 result = result.scaleByPowerOfTen(delta); 454 if (multiplier != null) { 455 result = result.divide(multiplier, mc); 456 } 457 result = result.stripTrailingZeros(); 458 if (forceBigDecimal || result.scale() > 0) { 459 return result; 460 } else if (result.compareTo(MIN_LONG_AS_BIG_DECIMAL) >= 0 461 && result.compareTo(MAX_LONG_AS_BIG_DECIMAL) <= 0) { 462 return result.longValueExact(); 463 } else { 464 return result.toBigIntegerExact(); 465 } 466 } 467 468 /** 469 * Converts the internal digits to a number, and also associates the number with the parsed 470 * currency. 471 * 472 * @return The CurrencyAmount. Never null. 473 */ 474 public CurrencyAmount toCurrencyAmount(DecimalFormatProperties properties) { 475 assert isoCode != null; 476 Number number = toNumber(properties); 477 Currency currency = Currency.getInstance(isoCode); 478 return new CurrencyAmount(number, currency); 479 } 480 481 @Override 482 public String toString() { 483 StringBuilder sb = new StringBuilder(); 484 sb.append("["); 485 sb.append(path); 486 sb.append("] "); 487 sb.append(name.name()); 488 if (name == StateName.INSIDE_STRING) { 489 sb.append("{"); 490 sb.append(currentString); 491 sb.append(":"); 492 sb.append(currentOffset); 493 sb.append("}"); 494 } 495 if (name == StateName.INSIDE_AFFIX_PATTERN) { 496 sb.append("{"); 497 sb.append(currentAffixPattern); 498 sb.append(":"); 499 sb.append(AffixUtils.getOffset(currentStepwiseParserTag) - 1); 500 sb.append("}"); 501 } 502 sb.append(" "); 503 sb.append(fq.toBigDecimal()); 504 sb.append(" grouping:"); 505 sb.append(groupingCp == -1 ? new char[] {'?'} : Character.toChars(groupingCp)); 506 sb.append(" widths:"); 507 sb.append(Long.toHexString(groupingWidths)); 508 sb.append(" seen:"); 509 sb.append(sawNegative ? 1 : 0); 510 sb.append(sawNegativeExponent ? 1 : 0); 511 sb.append(sawNaN ? 1 : 0); 512 sb.append(sawInfinity ? 1 : 0); 513 sb.append(sawPrefix ? 1 : 0); 514 sb.append(sawSuffix ? 1 : 0); 515 sb.append(sawDecimalPoint ? 1 : 0); 516 sb.append(" trailing:"); 517 sb.append(trailingCount); 518 sb.append(" score:"); 519 sb.append(score); 520 sb.append(" affix:"); 521 sb.append(affix); 522 sb.append(" currency:"); 523 sb.append(isoCode); 524 return sb.toString(); 525 } 526 } 527 528 /** 529 * Holds an ordered list of {@link StateItem} and other metadata about the string to be parsed. 530 * There are two internal arrays of {@link StateItem}, which are swapped back and forth in order 531 * to avoid object creations. The items in one array can be populated at the same time that items 532 * in the other array are being read from. 533 */ 534 private static class ParserState { 535 536 // Basic ParserStateItem lists: 537 StateItem[] items = new StateItem[16]; 538 StateItem[] prevItems = new StateItem[16]; 539 int length; 540 int prevLength; 541 542 // Properties and Symbols memory: 543 DecimalFormatProperties properties; 544 DecimalFormatSymbols symbols; 545 ParseMode mode; 546 boolean caseSensitive; 547 boolean parseCurrency; 548 GroupingMode groupingMode; 549 550 // Other pre-computed fields: 551 int decimalCp1; 552 int decimalCp2; 553 int groupingCp1; 554 int groupingCp2; 555 SeparatorType decimalType1; 556 SeparatorType decimalType2; 557 SeparatorType groupingType1; 558 SeparatorType groupingType2; 559 560 TextTrieMap<Byte> digitTrie; 561 Set<AffixHolder> affixHolders = new HashSet<AffixHolder>(); 562 563 ParserState() { 564 for (int i = 0; i < items.length; i++) { 565 items[i] = new StateItem((char) ('A' + i)); 566 prevItems[i] = new StateItem((char) ('A' + i)); 567 } 568 } 569 570 /** 571 * Clears the internal state in order to prepare for parsing a new string. 572 * 573 * @return Myself, for chaining. 574 */ 575 ParserState clear() { 576 length = 0; 577 prevLength = 0; 578 digitTrie = null; 579 affixHolders.clear(); 580 return this; 581 } 582 583 /** 584 * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to zero, 585 * so that it can be appended to. 586 */ 587 void swap() { 588 StateItem[] temp = prevItems; 589 prevItems = items; 590 items = temp; 591 prevLength = length; 592 length = 0; 593 } 594 595 /** 596 * Swaps the internal arrays of {@link StateItem}. Sets the length of the primary list to the 597 * length of the previous list, so that it can be read from. 598 */ 599 void swapBack() { 600 StateItem[] temp = prevItems; 601 prevItems = items; 602 items = temp; 603 length = prevLength; 604 prevLength = 0; 605 } 606 607 /** 608 * Gets the next available {@link StateItem} from the primary list for writing. This method 609 * should be thought of like a list append method, except that there are no object creations 610 * taking place. 611 * 612 * <p>It is the caller's responsibility to call either {@link StateItem#clear} or {@link 613 * StateItem#copyFrom} on the returned object. 614 * 615 * @return A dirty {@link StateItem}. 616 */ 617 StateItem getNext() { 618 if (length >= items.length) { 619 // TODO: What to do here? Expand the array? 620 // This case is rare and would happen only with specially designed input. 621 // For now, just overwrite the last entry. 622 length = items.length - 1; 623 } 624 StateItem item = items[length]; 625 length++; 626 return item; 627 } 628 629 /** @return The index of the last inserted StateItem via a call to {@link #getNext}. */ 630 public int lastInsertedIndex() { 631 assert length > 0; 632 return length - 1; 633 } 634 635 /** 636 * Gets a {@link StateItem} from the primary list. Assumes that the item has already been added 637 * via a call to {@link #getNext}. 638 * 639 * @param i The index of the item to get. 640 * @return The item. 641 */ 642 public StateItem getItem(int i) { 643 assert i >= 0 && i < length; 644 return items[i]; 645 } 646 647 @Override 648 public String toString() { 649 StringBuilder sb = new StringBuilder(); 650 sb.append("<ParseState mode:"); 651 sb.append(mode); 652 sb.append(" caseSensitive:"); 653 sb.append(caseSensitive); 654 sb.append(" parseCurrency:"); 655 sb.append(parseCurrency); 656 sb.append(" groupingMode:"); 657 sb.append(groupingMode); 658 sb.append(" decimalCps:"); 659 sb.append((char) decimalCp1); 660 sb.append((char) decimalCp2); 661 sb.append(" groupingCps:"); 662 sb.append((char) groupingCp1); 663 sb.append((char) groupingCp2); 664 sb.append(" affixes:"); 665 sb.append(affixHolders); 666 sb.append(">"); 667 return sb.toString(); 668 } 669 } 670 671 /** 672 * A wrapper for affixes. Affixes can be string-based or pattern-based, and they can come from 673 * several sources, including the property bag and the locale paterns from CLDR data. 674 */ 675 private static class AffixHolder { 676 final String p; // prefix 677 final String s; // suffix 678 final boolean strings; 679 final boolean negative; 680 681 static final AffixHolder EMPTY_POSITIVE = new AffixHolder("", "", true, false); 682 static final AffixHolder EMPTY_NEGATIVE = new AffixHolder("", "", true, true); 683 684 static void addToState(ParserState state, DecimalFormatProperties properties) { 685 AffixHolder pp = fromPropertiesPositivePattern(properties); 686 AffixHolder np = fromPropertiesNegativePattern(properties); 687 AffixHolder ps = fromPropertiesPositiveString(properties); 688 AffixHolder ns = fromPropertiesNegativeString(properties); 689 if (pp != null) state.affixHolders.add(pp); 690 if (ps != null) state.affixHolders.add(ps); 691 if (np != null) state.affixHolders.add(np); 692 if (ns != null) state.affixHolders.add(ns); 693 } 694 695 static AffixHolder fromPropertiesPositivePattern(DecimalFormatProperties properties) { 696 String ppp = properties.getPositivePrefixPattern(); 697 String psp = properties.getPositiveSuffixPattern(); 698 if (properties.getSignAlwaysShown()) { 699 // TODO: This logic is somewhat duplicated from MurkyModifier. 700 boolean foundSign = false; 701 String npp = properties.getNegativePrefixPattern(); 702 String nsp = properties.getNegativeSuffixPattern(); 703 if (AffixUtils.containsType(npp, AffixUtils.TYPE_MINUS_SIGN)) { 704 foundSign = true; 705 ppp = AffixUtils.replaceType(npp, AffixUtils.TYPE_MINUS_SIGN, '+'); 706 } 707 if (AffixUtils.containsType(nsp, AffixUtils.TYPE_MINUS_SIGN)) { 708 foundSign = true; 709 psp = AffixUtils.replaceType(nsp, AffixUtils.TYPE_MINUS_SIGN, '+'); 710 } 711 if (!foundSign) { 712 ppp = "+" + ppp; 713 } 714 } 715 return getInstance(ppp, psp, false, false); 716 } 717 718 static AffixHolder fromPropertiesNegativePattern(DecimalFormatProperties properties) { 719 String npp = properties.getNegativePrefixPattern(); 720 String nsp = properties.getNegativeSuffixPattern(); 721 if (npp == null && nsp == null) { 722 npp = properties.getPositivePrefixPattern(); 723 nsp = properties.getPositiveSuffixPattern(); 724 if (npp == null) { 725 npp = "-"; 726 } else { 727 npp = "-" + npp; 728 } 729 } 730 return getInstance(npp, nsp, false, true); 731 } 732 733 static AffixHolder fromPropertiesPositiveString(DecimalFormatProperties properties) { 734 String pp = properties.getPositivePrefix(); 735 String ps = properties.getPositiveSuffix(); 736 if (pp == null && ps == null) return null; 737 return getInstance(pp, ps, true, false); 738 } 739 740 static AffixHolder fromPropertiesNegativeString(DecimalFormatProperties properties) { 741 String np = properties.getNegativePrefix(); 742 String ns = properties.getNegativeSuffix(); 743 if (np == null && ns == null) return null; 744 return getInstance(np, ns, true, true); 745 } 746 747 static AffixHolder getInstance(String p, String s, boolean strings, boolean negative) { 748 if (p == null && s == null) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE; 749 if (p == null) p = ""; 750 if (s == null) s = ""; 751 if (p.length() == 0 && s.length() == 0) return negative ? EMPTY_NEGATIVE : EMPTY_POSITIVE; 752 return new AffixHolder(p, s, strings, negative); 753 } 754 755 AffixHolder(String pp, String sp, boolean strings, boolean negative) { 756 this.p = pp; 757 this.s = sp; 758 this.strings = strings; 759 this.negative = negative; 760 } 761 762 @Override 763 public boolean equals(Object other) { 764 if (other == null) return false; 765 if (this == other) return true; 766 if (!(other instanceof AffixHolder)) return false; 767 AffixHolder _other = (AffixHolder) other; 768 if (!p.equals(_other.p)) return false; 769 if (!s.equals(_other.s)) return false; 770 if (strings != _other.strings) return false; 771 if (negative != _other.negative) return false; 772 return true; 773 } 774 775 @Override 776 public int hashCode() { 777 return p.hashCode() ^ s.hashCode(); 778 } 779 780 @Override 781 public String toString() { 782 StringBuilder sb = new StringBuilder(); 783 sb.append("{"); 784 sb.append(p); 785 sb.append("|"); 786 sb.append(s); 787 sb.append("|"); 788 sb.append(strings ? 'S' : 'P'); 789 sb.append("}"); 790 return sb.toString(); 791 } 792 } 793 794 /** 795 * A class that holds information about all currency affix patterns for the locale. This allows 796 * the parser to accept currencies in any format that are valid for the locale. 797 */ 798 private static class CurrencyAffixPatterns { 799 private final Set<AffixHolder> set = new HashSet<AffixHolder>(); 800 801 private static final ConcurrentHashMap<ULocale, CurrencyAffixPatterns> currencyAffixPatterns = 802 new ConcurrentHashMap<ULocale, CurrencyAffixPatterns>(); 803 804 static void addToState(ULocale uloc, ParserState state) { 805 CurrencyAffixPatterns value = currencyAffixPatterns.get(uloc); 806 if (value == null) { 807 // There can be multiple threads computing the same CurrencyAffixPatterns simultaneously, 808 // but that scenario is harmless. 809 CurrencyAffixPatterns newValue = new CurrencyAffixPatterns(uloc); 810 currencyAffixPatterns.putIfAbsent(uloc, newValue); 811 value = currencyAffixPatterns.get(uloc); 812 } 813 state.affixHolders.addAll(value.set); 814 } 815 816 private CurrencyAffixPatterns(ULocale uloc) { 817 // Get the basic currency pattern. 818 String pattern = NumberFormat.getPatternForStyle(uloc, NumberFormat.CURRENCYSTYLE); 819 addPattern(pattern); 820 821 // Get the currency plural patterns. 822 // TODO: Update this after CurrencyPluralInfo is replaced. 823 CurrencyPluralInfo pluralInfo = CurrencyPluralInfo.getInstance(uloc); 824 for (StandardPlural plural : StandardPlural.VALUES) { 825 pattern = pluralInfo.getCurrencyPluralPattern(plural.getKeyword()); 826 addPattern(pattern); 827 } 828 } 829 830 private static final ThreadLocal<DecimalFormatProperties> threadLocalProperties = 831 new ThreadLocal<DecimalFormatProperties>() { 832 @Override 833 protected DecimalFormatProperties initialValue() { 834 return new DecimalFormatProperties(); 835 } 836 }; 837 838 private void addPattern(String pattern) { 839 DecimalFormatProperties properties = threadLocalProperties.get(); 840 try { 841 PatternStringParser.parseToExistingProperties(pattern, properties); 842 } catch (IllegalArgumentException e) { 843 // This should only happen if there is a bug in CLDR data. Fail silently. 844 } 845 set.add(AffixHolder.fromPropertiesPositivePattern(properties)); 846 set.add(AffixHolder.fromPropertiesNegativePattern(properties)); 847 } 848 } 849 850 /** 851 * Makes a {@link TextTrieMap} for parsing digit strings. A trie is required only if the digit 852 * strings are longer than one code point. In order for this to be the case, the user would have 853 * needed to specify custom multi-character digits, like "(0)". 854 * 855 * @param digitStrings The list of digit strings from DecimalFormatSymbols. 856 * @return A trie, or null if a trie is not required. 857 */ 858 static TextTrieMap<Byte> makeDigitTrie(String[] digitStrings) { 859 boolean requiresTrie = false; 860 for (int i = 0; i < 10; i++) { 861 String str = digitStrings[i]; 862 if (Character.charCount(Character.codePointAt(str, 0)) != str.length()) { 863 requiresTrie = true; 864 break; 865 } 866 } 867 if (!requiresTrie) return null; 868 869 // TODO: Consider caching the tries so they don't need to be re-created run to run. 870 // (Low-priority since multi-character digits are rare in practice) 871 TextTrieMap<Byte> trieMap = new TextTrieMap<Byte>(false); 872 for (int i = 0; i < 10; i++) { 873 trieMap.put(digitStrings[i], (byte) i); 874 } 875 return trieMap; 876 } 877 878 protected static final ThreadLocal<ParserState> threadLocalParseState = 879 new ThreadLocal<ParserState>() { 880 @Override 881 protected ParserState initialValue() { 882 return new ParserState(); 883 } 884 }; 885 886 protected static final ThreadLocal<ParsePosition> threadLocalParsePosition = 887 new ThreadLocal<ParsePosition>() { 888 @Override 889 protected ParsePosition initialValue() { 890 return new ParsePosition(0); 891 } 892 }; 893 894 /** 895 * @internal 896 * @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat. 897 */ 898 @Deprecated 899 public static final UnicodeSet UNISET_PLUS = 900 new UnicodeSet( 901 0x002B, 0x002B, 0x207A, 0x207A, 0x208A, 0x208A, 0x2795, 0x2795, 0xFB29, 0xFB29, 902 0xFE62, 0xFE62, 0xFF0B, 0xFF0B) 903 .freeze(); 904 905 /** 906 * @internal 907 * @deprecated This API is ICU internal only. TODO: Remove this set from ScientificNumberFormat. 908 */ 909 @Deprecated 910 public static final UnicodeSet UNISET_MINUS = 911 new UnicodeSet( 912 0x002D, 0x002D, 0x207B, 0x207B, 0x208B, 0x208B, 0x2212, 0x2212, 0x2796, 0x2796, 913 0xFE63, 0xFE63, 0xFF0D, 0xFF0D) 914 .freeze(); 915 916 public static Number parse(String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) { 917 ParsePosition ppos = threadLocalParsePosition.get(); 918 ppos.setIndex(0); 919 return parse(input, ppos, properties, symbols); 920 } 921 922 // TODO: DELETE ME once debugging is finished 923 public static volatile boolean DEBUGGING = false; 924 925 /** 926 * Implements an iterative parser that maintains a lists of possible states at each code point in 927 * the string. At each code point in the string, the list of possible states is updated based on 928 * the states coming from the previous code point. The parser stops when it reaches the end of the 929 * string or when there are no possible parse paths remaining in the string. 930 * 931 * <p>TODO: This API is not fully flushed out. Right now this is internal-only. 932 * 933 * @param input The string to parse. 934 * @param ppos A {@link ParsePosition} to hold the index at which parsing stopped. 935 * @param properties A property bag, used only for determining the prefix/suffix strings and the 936 * padding character. 937 * @param symbols A {@link DecimalFormatSymbols} object, used for determining locale-specific 938 * symbols for grouping/decimal separators, digit strings, and prefix/suffix substitutions. 939 * @return A Number matching the parser's best interpretation of the string. 940 */ 941 public static Number parse( 942 CharSequence input, 943 ParsePosition ppos, 944 DecimalFormatProperties properties, 945 DecimalFormatSymbols symbols) { 946 StateItem best = _parse(input, ppos, false, properties, symbols); 947 return (best == null) ? null : best.toNumber(properties); 948 } 949 950 public static CurrencyAmount parseCurrency( 951 String input, DecimalFormatProperties properties, DecimalFormatSymbols symbols) throws ParseException { 952 return parseCurrency(input, null, properties, symbols); 953 } 954 955 public static CurrencyAmount parseCurrency( 956 CharSequence input, ParsePosition ppos, DecimalFormatProperties properties, DecimalFormatSymbols symbols) 957 throws ParseException { 958 if (ppos == null) { 959 ppos = threadLocalParsePosition.get(); 960 ppos.setIndex(0); 961 ppos.setErrorIndex(-1); 962 } 963 StateItem best = _parse(input, ppos, true, properties, symbols); 964 return (best == null) ? null : best.toCurrencyAmount(properties); 965 } 966 967 private static StateItem _parse( 968 CharSequence input, 969 ParsePosition ppos, 970 boolean parseCurrency, 971 DecimalFormatProperties properties, 972 DecimalFormatSymbols symbols) { 973 974 if (input == null || ppos == null || properties == null || symbols == null) { 975 throw new IllegalArgumentException("All arguments are required for parse."); 976 } 977 978 ParseMode mode = properties.getParseMode(); 979 if (mode == null) mode = ParseMode.LENIENT; 980 boolean integerOnly = properties.getParseIntegerOnly(); 981 boolean ignoreExponent = properties.getParseNoExponent(); 982 boolean ignoreGrouping = properties.getGroupingSize() <= 0; 983 984 // Set up the initial state 985 ParserState state = threadLocalParseState.get().clear(); 986 state.properties = properties; 987 state.symbols = symbols; 988 state.mode = mode; 989 state.parseCurrency = parseCurrency; 990 state.groupingMode = properties.getParseGroupingMode(); 991 if (state.groupingMode == null) state.groupingMode = GroupingMode.DEFAULT; 992 state.caseSensitive = properties.getParseCaseSensitive(); 993 state.decimalCp1 = Character.codePointAt(symbols.getDecimalSeparatorString(), 0); 994 state.decimalCp2 = Character.codePointAt(symbols.getMonetaryDecimalSeparatorString(), 0); 995 state.groupingCp1 = Character.codePointAt(symbols.getGroupingSeparatorString(), 0); 996 state.groupingCp2 = Character.codePointAt(symbols.getMonetaryGroupingSeparatorString(), 0); 997 state.decimalType1 = SeparatorType.fromCp(state.decimalCp1, mode); 998 state.decimalType2 = SeparatorType.fromCp(state.decimalCp2, mode); 999 state.groupingType1 = SeparatorType.fromCp(state.groupingCp1, mode); 1000 state.groupingType2 = SeparatorType.fromCp(state.groupingCp2, mode); 1001 StateItem initialStateItem = state.getNext().clear(); 1002 initialStateItem.name = StateName.BEFORE_PREFIX; 1003 1004 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1005 state.digitTrie = makeDigitTrie(symbols.getDigitStringsLocal()); 1006 AffixHolder.addToState(state, properties); 1007 if (parseCurrency) { 1008 CurrencyAffixPatterns.addToState(symbols.getULocale(), state); 1009 } 1010 } 1011 1012 if (DEBUGGING) { 1013 System.out.println("Parsing: " + input); 1014 System.out.println(properties); 1015 System.out.println(state); 1016 } 1017 1018 // Start walking through the string, one codepoint at a time. Backtracking is not allowed. This 1019 // is to enforce linear runtime and prevent cases that could result in an infinite loop. 1020 int offset = ppos.getIndex(); 1021 for (; offset < input.length(); ) { 1022 int cp = Character.codePointAt(input, offset); 1023 state.swap(); 1024 for (int i = 0; i < state.prevLength; i++) { 1025 StateItem item = state.prevItems[i]; 1026 if (DEBUGGING) { 1027 System.out.println(":" + offset + item.id + " " + item); 1028 } 1029 1030 // In the switch statement below, if you see a line like: 1031 // if (state.length > 0 && mode == ParseMode.FAST) break; 1032 // it is used for accelerating the fast parse mode. The check is performed only in the 1033 // states BEFORE_PREFIX, AFTER_INTEGER_DIGIT, and AFTER_FRACTION_DIGIT, which are the 1034 // most common states. 1035 1036 switch (item.name) { 1037 case BEFORE_PREFIX: 1038 // Beginning of string 1039 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1040 acceptMinusOrPlusSign(cp, StateName.BEFORE_PREFIX, state, item, false); 1041 if (state.length > 0 && mode == ParseMode.FAST) break; 1042 } 1043 acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1044 if (state.length > 0 && mode == ParseMode.FAST) break; 1045 acceptBidi(cp, StateName.BEFORE_PREFIX, state, item); 1046 if (state.length > 0 && mode == ParseMode.FAST) break; 1047 acceptWhitespace(cp, StateName.BEFORE_PREFIX, state, item); 1048 if (state.length > 0 && mode == ParseMode.FAST) break; 1049 acceptPadding(cp, StateName.BEFORE_PREFIX, state, item); 1050 if (state.length > 0 && mode == ParseMode.FAST) break; 1051 acceptNan(cp, StateName.BEFORE_SUFFIX, state, item); 1052 if (state.length > 0 && mode == ParseMode.FAST) break; 1053 acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item); 1054 if (state.length > 0 && mode == ParseMode.FAST) break; 1055 if (!integerOnly) { 1056 acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); 1057 if (state.length > 0 && mode == ParseMode.FAST) break; 1058 } 1059 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1060 acceptPrefix(cp, StateName.AFTER_PREFIX, state, item); 1061 } 1062 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1063 if (!ignoreGrouping) { 1064 acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1065 if (state.length > 0 && mode == ParseMode.FAST) break; 1066 } 1067 if (parseCurrency) { 1068 acceptCurrency(cp, StateName.BEFORE_PREFIX, state, item); 1069 } 1070 } 1071 break; 1072 1073 case AFTER_PREFIX: 1074 // Prefix is consumed 1075 acceptBidi(cp, StateName.AFTER_PREFIX, state, item); 1076 acceptPadding(cp, StateName.AFTER_PREFIX, state, item); 1077 acceptNan(cp, StateName.BEFORE_SUFFIX, state, item); 1078 acceptInfinity(cp, StateName.BEFORE_SUFFIX, state, item); 1079 acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1080 if (!integerOnly) { 1081 acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); 1082 } 1083 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1084 acceptWhitespace(cp, StateName.AFTER_PREFIX, state, item); 1085 if (!ignoreGrouping) { 1086 acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1087 } 1088 if (parseCurrency) { 1089 acceptCurrency(cp, StateName.AFTER_PREFIX, state, item); 1090 } 1091 } 1092 break; 1093 1094 case AFTER_INTEGER_DIGIT: 1095 // Previous character was an integer digit (or grouping/whitespace) 1096 acceptIntegerDigit(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1097 if (state.length > 0 && mode == ParseMode.FAST) break; 1098 if (!integerOnly) { 1099 acceptDecimalPoint(cp, StateName.AFTER_FRACTION_DIGIT, state, item); 1100 if (state.length > 0 && mode == ParseMode.FAST) break; 1101 } 1102 if (!ignoreGrouping) { 1103 acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item); 1104 if (state.length > 0 && mode == ParseMode.FAST) break; 1105 } 1106 acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); 1107 if (state.length > 0 && mode == ParseMode.FAST) break; 1108 acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); 1109 if (state.length > 0 && mode == ParseMode.FAST) break; 1110 if (!ignoreExponent) { 1111 acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); 1112 if (state.length > 0 && mode == ParseMode.FAST) break; 1113 } 1114 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1115 acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); 1116 } 1117 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1118 acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); 1119 if (state.length > 0 && mode == ParseMode.FAST) break; 1120 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); 1121 if (state.length > 0 && mode == ParseMode.FAST) break; 1122 if (parseCurrency) { 1123 acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); 1124 } 1125 } 1126 break; 1127 1128 case AFTER_FRACTION_DIGIT: 1129 // We encountered a decimal point 1130 acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item); 1131 if (state.length > 0 && mode == ParseMode.FAST) break; 1132 acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); 1133 if (state.length > 0 && mode == ParseMode.FAST) break; 1134 acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); 1135 if (state.length > 0 && mode == ParseMode.FAST) break; 1136 if (!ignoreExponent) { 1137 acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); 1138 if (state.length > 0 && mode == ParseMode.FAST) break; 1139 } 1140 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1141 acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); 1142 } 1143 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1144 acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); 1145 if (state.length > 0 && mode == ParseMode.FAST) break; 1146 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); 1147 if (state.length > 0 && mode == ParseMode.FAST) break; 1148 if (parseCurrency) { 1149 acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); 1150 } 1151 } 1152 break; 1153 1154 case AFTER_EXPONENT_SEPARATOR: 1155 acceptBidi(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); 1156 acceptMinusOrPlusSign(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item, true); 1157 acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); 1158 break; 1159 1160 case AFTER_EXPONENT_DIGIT: 1161 acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1162 acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1163 acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item); 1164 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1165 acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); 1166 } 1167 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1168 acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1169 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); 1170 if (parseCurrency) { 1171 acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1172 } 1173 } 1174 break; 1175 1176 case BEFORE_SUFFIX: 1177 // Accept whitespace, suffixes, and exponent separators 1178 acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item); 1179 acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item); 1180 if (!ignoreExponent) { 1181 acceptExponentSeparator(cp, StateName.AFTER_EXPONENT_SEPARATOR, state, item); 1182 } 1183 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1184 acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); 1185 } 1186 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1187 acceptWhitespace(cp, StateName.BEFORE_SUFFIX, state, item); 1188 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX, state, item, false); 1189 if (parseCurrency) { 1190 acceptCurrency(cp, StateName.BEFORE_SUFFIX, state, item); 1191 } 1192 } 1193 break; 1194 1195 case BEFORE_SUFFIX_SEEN_EXPONENT: 1196 // Accept whitespace and suffixes but not exponent separators 1197 acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1198 acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1199 if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) { 1200 acceptSuffix(cp, StateName.AFTER_SUFFIX, state, item); 1201 } 1202 if (mode == ParseMode.LENIENT || mode == ParseMode.FAST) { 1203 acceptWhitespace(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1204 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item, false); 1205 if (parseCurrency) { 1206 acceptCurrency(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item); 1207 } 1208 } 1209 break; 1210 1211 case AFTER_SUFFIX: 1212 if ((mode == ParseMode.LENIENT || mode == ParseMode.FAST) && parseCurrency) { 1213 // Continue traversing in case there is a currency symbol to consume 1214 acceptBidi(cp, StateName.AFTER_SUFFIX, state, item); 1215 acceptPadding(cp, StateName.AFTER_SUFFIX, state, item); 1216 acceptWhitespace(cp, StateName.AFTER_SUFFIX, state, item); 1217 // TODO(sffc): acceptMinusOrPlusSign(cp, StateName.AFTER_SUFFIX, state, item, false); 1218 if (parseCurrency) { 1219 acceptCurrency(cp, StateName.AFTER_SUFFIX, state, item); 1220 } 1221 } 1222 // Otherwise, do not accept any more characters. 1223 break; 1224 1225 case INSIDE_CURRENCY: 1226 acceptCurrencyOffset(cp, state, item); 1227 break; 1228 1229 case INSIDE_DIGIT: 1230 acceptDigitTrieOffset(cp, state, item); 1231 break; 1232 1233 case INSIDE_STRING: 1234 acceptStringOffset(cp, state, item); 1235 break; 1236 1237 case INSIDE_AFFIX_PATTERN: 1238 acceptAffixPatternOffset(cp, state, item); 1239 break; 1240 } 1241 } 1242 1243 if (state.length == 0) { 1244 // No parse paths continue past this point. We have found the longest parsable string 1245 // from the input. Restore previous state without the offset and break. 1246 state.swapBack(); 1247 break; 1248 } 1249 1250 offset += Character.charCount(cp); 1251 } 1252 1253 // Post-processing 1254 if (state.length == 0) { 1255 if (DEBUGGING) { 1256 System.out.println("No matches found"); 1257 System.out.println("- - - - - - - - - -"); 1258 } 1259 return null; 1260 } else { 1261 1262 // Loop through the candidates. "continue" skips a candidate as invalid. 1263 StateItem best = null; 1264 outer: 1265 for (int i = 0; i < state.length; i++) { 1266 StateItem item = state.items[i]; 1267 1268 if (DEBUGGING) { 1269 System.out.println(":end " + item); 1270 } 1271 1272 // Check that at least one digit was read. 1273 if (!item.hasNumber()) { 1274 if (DEBUGGING) System.out.println("-> rejected due to no number value"); 1275 continue; 1276 } 1277 1278 if (mode == ParseMode.STRICT) { 1279 // Perform extra checks for strict mode. 1280 // We require that the affixes match. 1281 boolean sawPrefix = item.sawPrefix || (item.affix != null && item.affix.p.isEmpty()); 1282 boolean sawSuffix = item.sawSuffix || (item.affix != null && item.affix.s.isEmpty()); 1283 boolean hasEmptyAffix = 1284 state.affixHolders.contains(AffixHolder.EMPTY_POSITIVE) 1285 || state.affixHolders.contains(AffixHolder.EMPTY_NEGATIVE); 1286 if (sawPrefix && sawSuffix) { 1287 // OK 1288 } else if (!sawPrefix && !sawSuffix && hasEmptyAffix) { 1289 // OK 1290 } else { 1291 // Has a prefix or suffix that doesn't match 1292 if (DEBUGGING) System.out.println("-> rejected due to mismatched prefix/suffix"); 1293 continue; 1294 } 1295 1296 // Check for scientific notation. 1297 if (properties.getMinimumExponentDigits() > 0 && !item.sawExponentDigit) { 1298 if (DEBUGGING) System.out.println("-> reject due to lack of exponent"); 1299 continue; 1300 } 1301 1302 // Check that grouping sizes are valid. 1303 int grouping1 = properties.getGroupingSize(); 1304 int grouping2 = properties.getSecondaryGroupingSize(); 1305 grouping1 = grouping1 > 0 ? grouping1 : grouping2; 1306 grouping2 = grouping2 > 0 ? grouping2 : grouping1; 1307 long groupingWidths = item.groupingWidths; 1308 int numGroupingRegions = 16 - Long.numberOfLeadingZeros(groupingWidths) / 4; 1309 // If the last grouping is zero, accept strings like "1," but reject string like "1,.23" 1310 // Strip off multiple last-groupings to handle cases like "123,," or "123 " 1311 while (numGroupingRegions > 1 && (groupingWidths & 0xf) == 0) { 1312 if (item.sawDecimalPoint) { 1313 if (DEBUGGING) System.out.println("-> rejected due to decimal point after grouping"); 1314 continue outer; 1315 } else { 1316 groupingWidths >>>= 4; 1317 numGroupingRegions--; 1318 } 1319 } 1320 if (grouping1 <= 0) { 1321 // OK (no grouping data available) 1322 } else if (numGroupingRegions <= 1) { 1323 // OK (no grouping digits) 1324 } else if ((groupingWidths & 0xf) != grouping1) { 1325 // First grouping size is invalid 1326 if (DEBUGGING) System.out.println("-> rejected due to first grouping violation"); 1327 continue; 1328 } else if (((groupingWidths >>> ((numGroupingRegions - 1) * 4)) & 0xf) > grouping2) { 1329 // String like "1234,567" where the highest grouping is too large 1330 if (DEBUGGING) System.out.println("-> rejected due to final grouping violation"); 1331 continue; 1332 } else { 1333 for (int j = 1; j < numGroupingRegions - 1; j++) { 1334 if (((groupingWidths >>> (j * 4)) & 0xf) != grouping2) { 1335 // A grouping size somewhere in the middle is invalid 1336 if (DEBUGGING) System.out.println("-> rejected due to inner grouping violation"); 1337 continue outer; 1338 } 1339 } 1340 } 1341 } 1342 1343 // Optionally require that the presence of a decimal point matches the pattern. 1344 if (properties.getDecimalPatternMatchRequired() 1345 && item.sawDecimalPoint 1346 != (properties.getDecimalSeparatorAlwaysShown() 1347 || properties.getMaximumFractionDigits() != 0)) { 1348 if (DEBUGGING) System.out.println("-> rejected due to decimal point violation"); 1349 continue; 1350 } 1351 1352 // When parsing currencies, require that a currency symbol was found. 1353 if (parseCurrency && !item.sawCurrency) { 1354 if (DEBUGGING) System.out.println("-> rejected due to lack of currency"); 1355 continue; 1356 } 1357 1358 // If we get here, then this candidate is acceptable. 1359 // Use the earliest candidate in the list, or the one with the highest score, or the 1360 // one with the fewest trailing digits. 1361 if (best == null) { 1362 best = item; 1363 } else if (item.score > best.score) { 1364 best = item; 1365 } else if (item.trailingCount < best.trailingCount) { 1366 best = item; 1367 } 1368 } 1369 1370 if (DEBUGGING) { 1371 System.out.println("- - - - - - - - - -"); 1372 } 1373 1374 if (best != null) { 1375 ppos.setIndex(offset - best.trailingCount); 1376 return best; 1377 } else { 1378 ppos.setErrorIndex(offset); 1379 return null; 1380 } 1381 } 1382 } 1383 1384 /** 1385 * If <code>cp</code> is whitespace (as determined by the unicode set {@link #UNISET_WHITESPACE}), 1386 * copies <code>item</code> to the new list in <code>state</code> and sets its state name to 1387 * <code>nextName</code>. 1388 * 1389 * @param cp The code point to check. 1390 * @param nextName The new state name if the check passes. 1391 * @param state The state object to update. 1392 * @param item The old state leading into the code point. 1393 */ 1394 private static void acceptWhitespace( 1395 int cp, StateName nextName, ParserState state, StateItem item) { 1396 if (UNISET_WHITESPACE.contains(cp)) { 1397 state.getNext().copyFrom(item, nextName, cp); 1398 } 1399 } 1400 1401 /** 1402 * If <code>cp</code> is a bidi control character (as determined by the unicode set {@link 1403 * #UNISET_BIDI}), copies <code>item</code> to the new list in <code>state</code> and sets its 1404 * state name to <code>nextName</code>. 1405 * 1406 * @param cp The code point to check. 1407 * @param nextName The new state name if the check passes. 1408 * @param state The state object to update. 1409 * @param item The old state leading into the code point. 1410 */ 1411 private static void acceptBidi(int cp, StateName nextName, ParserState state, StateItem item) { 1412 if (UNISET_BIDI.contains(cp)) { 1413 state.getNext().copyFrom(item, nextName, cp); 1414 } 1415 } 1416 1417 /** 1418 * If <code>cp</code> is a padding character (as determined by {@link ParserState#paddingCp}), 1419 * copies <code>item</code> to the new list in <code>state</code> and sets its state name to 1420 * <code>nextName</code>. 1421 * 1422 * @param cp The code point to check. 1423 * @param nextName The new state name if the check passes. 1424 * @param state The state object to update. 1425 * @param item The old state leading into the code point. 1426 */ 1427 private static void acceptPadding(int cp, StateName nextName, ParserState state, StateItem item) { 1428 CharSequence padding = state.properties.getPadString(); 1429 if (padding == null || padding.length() == 0) return; 1430 int referenceCp = Character.codePointAt(padding, 0); 1431 if (cp == referenceCp) { 1432 state.getNext().copyFrom(item, nextName, cp); 1433 } 1434 } 1435 1436 private static void acceptIntegerDigit( 1437 int cp, StateName nextName, ParserState state, StateItem item) { 1438 acceptDigitHelper(cp, nextName, state, item, DigitType.INTEGER); 1439 } 1440 1441 private static void acceptFractionDigit( 1442 int cp, StateName nextName, ParserState state, StateItem item) { 1443 acceptDigitHelper(cp, nextName, state, item, DigitType.FRACTION); 1444 } 1445 1446 private static void acceptExponentDigit( 1447 int cp, StateName nextName, ParserState state, StateItem item) { 1448 acceptDigitHelper(cp, nextName, state, item, DigitType.EXPONENT); 1449 } 1450 1451 /** 1452 * If <code>cp</code> is a digit character (as determined by either {@link UCharacter#digit} or 1453 * {@link ParserState#digitCps}), copies <code>item</code> to the new list in <code>state</code> 1454 * and sets its state name to one determined by <code>type</code>. Also copies the digit into a 1455 * field in the new item determined by <code>type</code>. 1456 * 1457 * @param cp The code point to check. 1458 * @param nextName The state to set if a digit is accepted. 1459 * @param state The state object to update. 1460 * @param item The old state leading into the code point. 1461 * @param type The digit type, which determines the next state and the field into which to insert 1462 * the digit. 1463 */ 1464 private static void acceptDigitHelper( 1465 int cp, StateName nextName, ParserState state, StateItem item, DigitType type) { 1466 // Check the Unicode digit character property 1467 byte digit = (byte) UCharacter.digit(cp, 10); 1468 StateItem next = null; 1469 1470 // Look for the digit: 1471 if (digit >= 0) { 1472 // Code point is a number 1473 next = state.getNext().copyFrom(item, nextName, -1); 1474 } 1475 1476 // Do not perform the expensive string manipulations in fast mode. 1477 if (digit < 0 && (state.mode == ParseMode.LENIENT || state.mode == ParseMode.STRICT)) { 1478 if (state.digitTrie == null) { 1479 // Check custom digits, all of which are at most one code point 1480 for (byte d = 0; d < 10; d++) { 1481 int referenceCp = Character.codePointAt(state.symbols.getDigitStringsLocal()[d], 0); 1482 if (cp == referenceCp) { 1483 digit = d; 1484 next = state.getNext().copyFrom(item, nextName, -1); 1485 } 1486 } 1487 } else { 1488 // Custom digits have more than one code point 1489 acceptDigitTrie(cp, nextName, state, item, type); 1490 } 1491 } 1492 1493 // Save state 1494 recordDigit(next, digit, type); 1495 } 1496 1497 /** 1498 * Helper function for {@link acceptDigit} and {@link acceptDigitTrie} to save a complete digit in 1499 * a state item and update grouping widths. 1500 * 1501 * @param next The new StateItem 1502 * @param digit The digit to record 1503 * @param type The type of the digit to record (INTEGER, FRACTION, or EXPONENT) 1504 */ 1505 private static void recordDigit(StateItem next, byte digit, DigitType type) { 1506 if (next == null) return; 1507 next.appendDigit(digit, type); 1508 if (type == DigitType.INTEGER && (next.groupingWidths & 0xf) < 15) { 1509 next.groupingWidths++; 1510 } 1511 } 1512 1513 /** 1514 * If <code>cp</code> is a sign (as determined by the unicode sets {@link #UNISET_PLUS} and {@link 1515 * #UNISET_MINUS}), copies <code>item</code> to the new list in <code>state</code>. Loops back to 1516 * the same state name. 1517 * 1518 * @param cp The code point to check. 1519 * @param state The state object to update. 1520 * @param item The old state leading into the code point. 1521 */ 1522 private static void acceptMinusOrPlusSign( 1523 int cp, StateName nextName, ParserState state, StateItem item, boolean exponent) { 1524 acceptMinusSign(cp, nextName, null, state, item, exponent); 1525 acceptPlusSign(cp, nextName, null, state, item, exponent); 1526 } 1527 1528 private static long acceptMinusSign( 1529 int cp, 1530 StateName returnTo1, 1531 StateName returnTo2, 1532 ParserState state, 1533 StateItem item, 1534 boolean exponent) { 1535 if (UNISET_MINUS.contains(cp)) { 1536 StateItem next = state.getNext().copyFrom(item, returnTo1, -1); 1537 next.returnTo1 = returnTo2; 1538 if (exponent) { 1539 next.sawNegativeExponent = true; 1540 } else { 1541 next.sawNegative = true; 1542 } 1543 return 1L << state.lastInsertedIndex(); 1544 } else { 1545 return 0L; 1546 } 1547 } 1548 1549 private static long acceptPlusSign( 1550 int cp, 1551 StateName returnTo1, 1552 StateName returnTo2, 1553 ParserState state, 1554 StateItem item, 1555 boolean exponent) { 1556 if (UNISET_PLUS.contains(cp)) { 1557 StateItem next = state.getNext().copyFrom(item, returnTo1, -1); 1558 next.returnTo1 = returnTo2; 1559 return 1L << state.lastInsertedIndex(); 1560 } else { 1561 return 0L; 1562 } 1563 } 1564 1565 /** 1566 * If <code>cp</code> is a grouping separator (as determined by the unicode set {@link 1567 * #UNISET_GROUPING}), copies <code>item</code> to the new list in <code>state</code> and loops 1568 * back to the same state. Also accepts if <code>cp</code> is the locale-specific grouping 1569 * separator in {@link ParserState#groupingCp}, in which case the {@link 1570 * StateItem#usesLocaleSymbols} flag is also set. 1571 * 1572 * @param cp The code point to check. 1573 * @param state The state object to update. 1574 * @param item The old state leading into the code point. 1575 */ 1576 private static void acceptGrouping( 1577 int cp, StateName nextName, ParserState state, StateItem item) { 1578 // Do not accept mixed grouping separators in the same string. 1579 if (item.groupingCp == -1) { 1580 // First time seeing a grouping separator. 1581 SeparatorType cpType = SeparatorType.fromCp(cp, state.mode); 1582 1583 // Always accept if exactly the same as the locale grouping separator. 1584 if (cp != state.groupingCp1 && cp != state.groupingCp2) { 1585 // Reject if not in one of the three primary equivalence classes. 1586 if (cpType == SeparatorType.UNKNOWN) { 1587 return; 1588 } 1589 if (state.groupingMode == GroupingMode.RESTRICTED) { 1590 // Reject if not in the same class as the locale grouping separator. 1591 if (cpType != state.groupingType1 || cpType != state.groupingType2) { 1592 return; 1593 } 1594 } else { 1595 // Reject if in the same class as the decimal separator. 1596 if (cpType == SeparatorType.COMMA_LIKE 1597 && (state.decimalType1 == SeparatorType.COMMA_LIKE 1598 || state.decimalType2 == SeparatorType.COMMA_LIKE)) { 1599 return; 1600 } 1601 if (cpType == SeparatorType.PERIOD_LIKE 1602 && (state.decimalType1 == SeparatorType.PERIOD_LIKE 1603 || state.decimalType2 == SeparatorType.PERIOD_LIKE)) { 1604 return; 1605 } 1606 } 1607 } 1608 1609 // A match was found. 1610 StateItem next = state.getNext().copyFrom(item, nextName, cp); 1611 next.groupingCp = cp; 1612 next.groupingWidths <<= 4; 1613 } else { 1614 // Have already seen a grouping separator. 1615 if (cp == item.groupingCp) { 1616 StateItem next = state.getNext().copyFrom(item, nextName, cp); 1617 next.groupingWidths <<= 4; 1618 } 1619 } 1620 } 1621 1622 /** 1623 * If <code>cp</code> is a decimal (as determined by the unicode set {@link #UNISET_DECIMAL}), 1624 * copies <code>item</code> to the new list in <code>state</code> and goes to {@link 1625 * StateName#AFTER_FRACTION_DIGIT}. Also accepts if <code>cp</code> is the locale-specific decimal 1626 * point in {@link ParserState#decimalCp}, in which case the {@link StateItem#usesLocaleSymbols} 1627 * flag is also set. 1628 * 1629 * @param cp The code point to check. 1630 * @param state The state object to update. 1631 * @param item The old state leading into the code point. 1632 */ 1633 private static void acceptDecimalPoint( 1634 int cp, StateName nextName, ParserState state, StateItem item) { 1635 if (cp == item.groupingCp) { 1636 // Don't accept a decimal point that is the same as the grouping separator 1637 return; 1638 } 1639 1640 SeparatorType cpType = SeparatorType.fromCp(cp, state.mode); 1641 1642 // We require that the decimal separator be in the same class as the locale. 1643 if (cpType != state.decimalType1 && cpType != state.decimalType2) { 1644 return; 1645 } 1646 1647 // If in UNKNOWN or OTHER, require an exact match. 1648 if (cpType == SeparatorType.OTHER_GROUPING || cpType == SeparatorType.UNKNOWN) { 1649 if (cp != state.decimalCp1 && cp != state.decimalCp2) { 1650 return; 1651 } 1652 } 1653 1654 // A match was found. 1655 StateItem next = state.getNext().copyFrom(item, nextName, -1); 1656 next.sawDecimalPoint = true; 1657 } 1658 1659 private static void acceptNan(int cp, StateName nextName, ParserState state, StateItem item) { 1660 CharSequence nan = state.symbols.getNaN(); 1661 long added = acceptString(cp, nextName, null, state, item, nan, 0, false); 1662 1663 // Set state in the items that were added by the function call 1664 for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { 1665 if (((1L << i) & added) != 0) { 1666 state.getItem(i).sawNaN = true; 1667 } 1668 } 1669 } 1670 1671 private static void acceptInfinity( 1672 int cp, StateName nextName, ParserState state, StateItem item) { 1673 CharSequence inf = state.symbols.getInfinity(); 1674 long added = acceptString(cp, nextName, null, state, item, inf, 0, false); 1675 1676 // Set state in the items that were added by the function call 1677 for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { 1678 if (((1L << i) & added) != 0) { 1679 state.getItem(i).sawInfinity = true; 1680 } 1681 } 1682 } 1683 1684 private static void acceptExponentSeparator( 1685 int cp, StateName nextName, ParserState state, StateItem item) { 1686 CharSequence exp = state.symbols.getExponentSeparator(); 1687 acceptString(cp, nextName, null, state, item, exp, 0, true); 1688 } 1689 1690 private static void acceptPrefix(int cp, StateName nextName, ParserState state, StateItem item) { 1691 for (AffixHolder holder : state.affixHolders) { 1692 acceptAffixHolder(cp, nextName, state, item, holder, true); 1693 } 1694 } 1695 1696 private static void acceptSuffix(int cp, StateName nextName, ParserState state, StateItem item) { 1697 if (item.affix != null) { 1698 acceptAffixHolder(cp, nextName, state, item, item.affix, false); 1699 } else { 1700 for (AffixHolder holder : state.affixHolders) { 1701 acceptAffixHolder(cp, nextName, state, item, holder, false); 1702 } 1703 } 1704 } 1705 1706 private static void acceptAffixHolder( 1707 int cp, 1708 StateName nextName, 1709 ParserState state, 1710 StateItem item, 1711 AffixHolder holder, 1712 boolean prefix) { 1713 if (holder == null) return; 1714 String str = prefix ? holder.p : holder.s; 1715 long added; 1716 if (holder.strings) { 1717 added = acceptString(cp, nextName, null, state, item, str, 0, false); 1718 } else { 1719 added = 1720 acceptAffixPattern(cp, nextName, state, item, str, AffixUtils.nextToken(0, str)); 1721 } 1722 // Record state in the added entries 1723 for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { 1724 if (((1L << i) & added) != 0) { 1725 StateItem next = state.getItem(i); 1726 next.affix = holder; 1727 if (prefix) next.sawPrefix = true; 1728 if (!prefix) next.sawSuffix = true; 1729 if (holder.negative) next.sawNegative = true; 1730 // 10 point reward for consuming a prefix/suffix: 1731 next.score += 10; 1732 // 1 point reward for positive holders (if there is ambiguity, we want to favor positive): 1733 if (!holder.negative) next.score += 1; 1734 // 5 point reward for affix holders that have an empty prefix or suffix (we won't see them again): 1735 if (!next.sawPrefix && holder.p.isEmpty()) next.score += 5; 1736 if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5; 1737 } 1738 } 1739 } 1740 1741 private static long acceptStringOffset(int cp, ParserState state, StateItem item) { 1742 return acceptString( 1743 cp, 1744 item.returnTo1, 1745 item.returnTo2, 1746 state, 1747 item, 1748 item.currentString, 1749 item.currentOffset, 1750 item.currentTrailing); 1751 } 1752 1753 /** 1754 * Accepts a code point if the code point is compatible with the string at the given offset. 1755 * Handles runs of ignorable characters. 1756 * 1757 * <p>This method will add either one or two {@link StateItem} to the {@link ParserState}. 1758 * 1759 * @param cp The current code point, which will be checked for a match to the string. 1760 * @param ret1 The state to return to after reaching the end of the string. 1761 * @param ret2 The state to save in <code>returnTo1</code> after reaching the end of the string. 1762 * Set to null if returning to the main state loop. 1763 * @param trailing true if this string should be ignored for the purposes of recording trailing 1764 * code points; false if it trailing count should be reset after reading the string. 1765 * @param state The current {@link ParserState} 1766 * @param item The current {@link StateItem} 1767 * @param str The string against which to check for a match. 1768 * @param offset The number of chars into the string. Initial value should be 0. 1769 * @param trailing false if this string is strong and should reset trailing count to zero when it 1770 * is fully consumed. 1771 * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items 1772 * were added. 1773 */ 1774 private static long acceptString( 1775 int cp, 1776 StateName ret1, 1777 StateName ret2, 1778 ParserState state, 1779 StateItem item, 1780 CharSequence str, 1781 int offset, 1782 boolean trailing) { 1783 if (str == null || str.length() == 0) return 0L; 1784 return acceptStringOrAffixPatternWithIgnorables( 1785 cp, ret1, ret2, state, item, str, offset, trailing, true); 1786 } 1787 1788 private static long acceptStringNonIgnorable( 1789 int cp, 1790 StateName ret1, 1791 StateName ret2, 1792 ParserState state, 1793 StateItem item, 1794 CharSequence str, 1795 boolean trailing, 1796 int referenceCp, 1797 long firstOffsetOrTag, 1798 long nextOffsetOrTag) { 1799 long added = 0L; 1800 int firstOffset = (int) firstOffsetOrTag; 1801 int nextOffset = (int) nextOffsetOrTag; 1802 if (codePointEquals(referenceCp, cp, state)) { 1803 if (firstOffset < str.length()) { 1804 added |= acceptStringHelper(cp, ret1, ret2, state, item, str, firstOffset, trailing); 1805 } 1806 if (nextOffset >= str.length()) { 1807 added |= acceptStringHelper(cp, ret1, ret2, state, item, str, nextOffset, trailing); 1808 } 1809 return added; 1810 } else { 1811 return 0L; 1812 } 1813 } 1814 1815 /** 1816 * Internal method that is used to step to the next code point of a string or exit the string if 1817 * at the end. 1818 * 1819 * @param cp See {@link #acceptString} 1820 * @param returnTo1 See {@link #acceptString} 1821 * @param returnTo2 See {@link #acceptString} 1822 * @param state See {@link #acceptString} 1823 * @param item See {@link #acceptString} 1824 * @param str See {@link #acceptString} 1825 * @param newOffset The offset at which the next step should start. If past the end of the string, 1826 * exit the string and return to the outer loop. 1827 * @param trailing See {@link #acceptString} 1828 * @return Bitmask containing one entry, the one that was added. 1829 */ 1830 private static long acceptStringHelper( 1831 int cp, 1832 StateName returnTo1, 1833 StateName returnTo2, 1834 ParserState state, 1835 StateItem item, 1836 CharSequence str, 1837 int newOffset, 1838 boolean trailing) { 1839 StateItem next = state.getNext().copyFrom(item, null, cp); 1840 next.score += 1; // reward for consuming a cp from string 1841 if (newOffset < str.length()) { 1842 // String has more code points. 1843 next.name = StateName.INSIDE_STRING; 1844 next.returnTo1 = returnTo1; 1845 next.returnTo2 = returnTo2; 1846 next.currentString = str; 1847 next.currentOffset = newOffset; 1848 next.currentTrailing = trailing; 1849 } else { 1850 // We've reached the end of the string. 1851 next.name = returnTo1; 1852 if (!trailing) next.trailingCount = 0; 1853 next.returnTo1 = returnTo2; 1854 next.returnTo2 = null; 1855 } 1856 return 1L << state.lastInsertedIndex(); 1857 } 1858 1859 private static long acceptAffixPatternOffset(int cp, ParserState state, StateItem item) { 1860 return acceptAffixPattern( 1861 cp, item.returnTo1, state, item, item.currentAffixPattern, item.currentStepwiseParserTag); 1862 } 1863 1864 /** 1865 * Accepts a code point if the code point is compatible with the affix pattern at the offset 1866 * encoded in the tag argument. 1867 * 1868 * @param cp The current code point, which will be checked for a match to the string. 1869 * @param returnTo The state to return to after reaching the end of the string. 1870 * @param state The current {@link ParserState} 1871 * @param item The current {@link StateItem} 1872 * @param str The string containing the affix pattern. 1873 * @param tag The current state of the stepwise parser. Initial value should be 0L. 1874 * @return A bitmask where the bits correspond to the items that were added. Set to 0L if no items 1875 * were added. 1876 */ 1877 private static long acceptAffixPattern( 1878 int cp, StateName ret1, ParserState state, StateItem item, CharSequence str, long tag) { 1879 if (str == null || str.length() == 0) return 0L; 1880 return acceptStringOrAffixPatternWithIgnorables( 1881 cp, ret1, null, state, item, str, tag, false, false); 1882 } 1883 1884 private static long acceptAffixPatternNonIgnorable( 1885 int cp, 1886 StateName returnTo, 1887 ParserState state, 1888 StateItem item, 1889 CharSequence str, 1890 int typeOrCp, 1891 long firstTag, 1892 long nextTag) { 1893 1894 // Convert from the returned tag to a code point, string, or currency to check 1895 int resolvedCp = -1; 1896 CharSequence resolvedStr = null; 1897 boolean resolvedMinusSign = false; 1898 boolean resolvedPlusSign = false; 1899 boolean resolvedCurrency = false; 1900 if (typeOrCp < 0) { 1901 // Symbol 1902 switch (typeOrCp) { 1903 case AffixUtils.TYPE_MINUS_SIGN: 1904 resolvedMinusSign = true; 1905 break; 1906 case AffixUtils.TYPE_PLUS_SIGN: 1907 resolvedPlusSign = true; 1908 break; 1909 case AffixUtils.TYPE_PERCENT: 1910 resolvedStr = state.symbols.getPercentString(); 1911 if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '%') { 1912 resolvedCp = '%'; // accept ASCII percent as well as locale percent 1913 } 1914 break; 1915 case AffixUtils.TYPE_PERMILLE: 1916 resolvedStr = state.symbols.getPerMillString(); 1917 if (resolvedStr.length() != 1 || resolvedStr.charAt(0) != '') { 1918 resolvedCp = ''; // accept ASCII permille as well as locale permille 1919 } 1920 break; 1921 case AffixUtils.TYPE_CURRENCY_SINGLE: 1922 case AffixUtils.TYPE_CURRENCY_DOUBLE: 1923 case AffixUtils.TYPE_CURRENCY_TRIPLE: 1924 case AffixUtils.TYPE_CURRENCY_QUAD: 1925 case AffixUtils.TYPE_CURRENCY_QUINT: 1926 case AffixUtils.TYPE_CURRENCY_OVERFLOW: 1927 resolvedCurrency = true; 1928 break; 1929 default: 1930 throw new AssertionError(); 1931 } 1932 } else { 1933 resolvedCp = typeOrCp; 1934 } 1935 1936 long added = 0L; 1937 if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) { 1938 if (firstTag >= 0) { 1939 added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, firstTag); 1940 } 1941 if (nextTag < 0) { 1942 added |= acceptAffixPatternHelper(cp, returnTo, state, item, str, nextTag); 1943 } 1944 } 1945 if (resolvedMinusSign) { 1946 if (firstTag >= 0) { 1947 added |= acceptMinusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false); 1948 } 1949 if (nextTag < 0) { 1950 added |= acceptMinusSign(cp, returnTo, null, state, item, false); 1951 } 1952 if (added == 0L) { 1953 // Also attempt to accept custom minus sign string 1954 String mss = state.symbols.getMinusSignString(); 1955 int mssCp = Character.codePointAt(mss, 0); 1956 if (mss.length() != Character.charCount(mssCp) || !UNISET_MINUS.contains(mssCp)) { 1957 resolvedStr = mss; 1958 } 1959 } 1960 } 1961 if (resolvedPlusSign) { 1962 if (firstTag >= 0) { 1963 added |= acceptPlusSign(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, false); 1964 } 1965 if (nextTag < 0) { 1966 added |= acceptPlusSign(cp, returnTo, null, state, item, false); 1967 } 1968 if (added == 0L) { 1969 // Also attempt to accept custom plus sign string 1970 String pss = state.symbols.getPlusSignString(); 1971 int pssCp = Character.codePointAt(pss, 0); 1972 if (pss.length() != Character.charCount(pssCp) || !UNISET_MINUS.contains(pssCp)) { 1973 resolvedStr = pss; 1974 } 1975 } 1976 } 1977 if (resolvedStr != null) { 1978 if (firstTag >= 0) { 1979 added |= 1980 acceptString( 1981 cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item, resolvedStr, 0, false); 1982 } 1983 if (nextTag < 0) { 1984 added |= acceptString(cp, returnTo, null, state, item, resolvedStr, 0, false); 1985 } 1986 } 1987 if (resolvedCurrency) { 1988 if (firstTag >= 0) { 1989 added |= acceptCurrency(cp, StateName.INSIDE_AFFIX_PATTERN, returnTo, state, item); 1990 } 1991 if (nextTag < 0) { 1992 added |= acceptCurrency(cp, returnTo, null, state, item); 1993 } 1994 } 1995 1996 // Set state in the items that were added by the function calls 1997 for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { 1998 if (((1L << i) & added) != 0) { 1999 state.getItem(i).currentAffixPattern = str; 2000 state.getItem(i).currentStepwiseParserTag = firstTag; 2001 } 2002 } 2003 return added; 2004 } 2005 2006 /** 2007 * Internal method that is used to step to the next token of a affix pattern or exit the affix 2008 * pattern if at the end. 2009 * 2010 * @param cp See {@link #acceptAffixPattern} 2011 * @param returnTo1 See {@link #acceptAffixPattern} 2012 * @param state See {@link #acceptAffixPattern} 2013 * @param item See {@link #acceptAffixPattern} 2014 * @param str See {@link #acceptAffixPattern} 2015 * @param newOffset The tag corresponding to the next token in the affix pattern that should be 2016 * recorded and consumed in a future call to {@link #acceptAffixPatternOffset}. 2017 * @return Bitmask containing one entry, the one that was added. 2018 */ 2019 private static long acceptAffixPatternHelper( 2020 int cp, 2021 StateName returnTo, 2022 ParserState state, 2023 StateItem item, 2024 CharSequence str, 2025 long newTag) { 2026 StateItem next = state.getNext().copyFrom(item, null, cp); 2027 next.score += 1; // reward for consuming a cp from pattern 2028 if (newTag >= 0) { 2029 // Additional tokens in affix string. 2030 next.name = StateName.INSIDE_AFFIX_PATTERN; 2031 next.returnTo1 = returnTo; 2032 next.currentAffixPattern = str; 2033 next.currentStepwiseParserTag = newTag; 2034 } else { 2035 // Reached last token in affix string. 2036 next.name = returnTo; 2037 next.trailingCount = 0; 2038 next.returnTo1 = null; 2039 } 2040 return 1L << state.lastInsertedIndex(); 2041 } 2042 2043 /** 2044 * Consumes tokens from a string or affix pattern following ICU's rules for handling of whitespace 2045 * and bidi control characters (collectively called "ignorables"). The methods {@link 2046 * #acceptStringHelper}, {@link #acceptAffixPatternHelper}, {@link #acceptStringNonIgnorable}, and 2047 * {@link #acceptAffixPatternNonIgnorable} will be called by this method to actually add parse 2048 * paths. 2049 * 2050 * <p>In the "NonIgnorable" functions, two arguments are passed: firstOffsetOrTag and 2051 * nextOffsetOrTag. These two arguments should add parse paths according to the following rules: 2052 * 2053 * <pre> 2054 * if (firstOffsetOrTag is valid or inside string boundary) { 2055 * // Add parse path going to firstOffsetOrTag 2056 * } 2057 * if (nextOffsetOrTag is invalid or beyond string boundary) { 2058 * // Add parse path leaving the string 2059 * } 2060 * </pre> 2061 * 2062 * <p>Note that there may be multiple parse paths added by these lines. This is important in order 2063 * to properly handle runs of ignorables. 2064 * 2065 * @param cp See {@link #acceptString} and {@link #acceptAffixPattern} 2066 * @param ret1 See {@link #acceptString} and {@link #acceptAffixPattern} 2067 * @param ret2 See {@link #acceptString} (affix pattern can pass null) 2068 * @param state See {@link #acceptString} and {@link #acceptAffixPattern} 2069 * @param item See {@link #acceptString} and {@link #acceptAffixPattern} 2070 * @param str See {@link #acceptString} and {@link #acceptAffixPattern} 2071 * @param offsetOrTag The current int offset for strings, or the current tag for affix patterns. 2072 * @param trailing See {@link #acceptString} (affix patterns can pass false) 2073 * @param isString true if the parameters correspond to a string; false if they correspond to an 2074 * affix pattern. 2075 * @return A bitmask containing the entries that were added. 2076 */ 2077 private static long acceptStringOrAffixPatternWithIgnorables( 2078 int cp, 2079 StateName ret1, 2080 StateName ret2 /* String only */, 2081 ParserState state, 2082 StateItem item, 2083 CharSequence str, 2084 long offsetOrTag /* offset for string; tag for affix pattern */, 2085 boolean trailing /* String only */, 2086 boolean isString) { 2087 2088 // Runs of ignorables (whitespace and bidi control marks) can occur at the beginning, middle, 2089 // or end of the reference string, or a run across the entire string. 2090 // 2091 // - A run at the beginning or in the middle corresponds to a run of length *zero or more* 2092 // in the input. 2093 // - A run at the end need to be matched exactly. 2094 // - A string that contains only ignorable characters also needs to be matched exactly. 2095 // 2096 // Because the behavior differs, we need logic here to determine which case we have. 2097 2098 int typeOrCp = 2099 isString 2100 ? Character.codePointAt(str, (int) offsetOrTag) 2101 : AffixUtils.getTypeOrCp(offsetOrTag); 2102 2103 if (isIgnorable(typeOrCp, state)) { 2104 // Look for the next nonignorable code point 2105 int nextTypeOrCp = typeOrCp; 2106 long prevOffsetOrTag; 2107 long nextOffsetOrTag = offsetOrTag; 2108 long firstOffsetOrTag = 0L; 2109 while (true) { 2110 prevOffsetOrTag = nextOffsetOrTag; 2111 nextOffsetOrTag = 2112 isString 2113 ? nextOffsetOrTag + Character.charCount(nextTypeOrCp) 2114 : AffixUtils.nextToken(nextOffsetOrTag, str); 2115 if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag; 2116 if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) { 2117 // Integer.MIN_VALUE is an invalid value for either a type or a cp; 2118 // use it to indicate the end of the string. 2119 nextTypeOrCp = Integer.MIN_VALUE; 2120 break; 2121 } 2122 nextTypeOrCp = 2123 isString 2124 ? Character.codePointAt(str, (int) nextOffsetOrTag) 2125 : AffixUtils.getTypeOrCp(nextOffsetOrTag); 2126 if (!isIgnorable(nextTypeOrCp, state)) break; 2127 } 2128 2129 if (nextTypeOrCp == Integer.MIN_VALUE) { 2130 // Run at end or string that contains only ignorable characters. 2131 if (codePointEquals(cp, typeOrCp, state)) { 2132 // Step forward and also exit the string if not at very end. 2133 // RETURN 2134 long added = 0L; 2135 added |= 2136 isString 2137 ? acceptStringHelper( 2138 cp, ret1, ret2, state, item, str, (int) firstOffsetOrTag, trailing) 2139 : acceptAffixPatternHelper(cp, ret1, state, item, str, firstOffsetOrTag); 2140 if (firstOffsetOrTag != nextOffsetOrTag) { 2141 added |= 2142 isString 2143 ? acceptStringHelper( 2144 cp, ret1, ret2, state, item, str, (int) nextOffsetOrTag, trailing) 2145 : acceptAffixPatternHelper(cp, ret1, state, item, str, nextOffsetOrTag); 2146 } 2147 return added; 2148 } else { 2149 // Code point does not exactly match the run at end. 2150 // RETURN 2151 return 0L; 2152 } 2153 } else { 2154 // Run at beginning or in middle. 2155 if (isIgnorable(cp, state)) { 2156 // Consume the ignorable. 2157 // RETURN 2158 return isString 2159 ? acceptStringHelper( 2160 cp, ret1, ret2, state, item, str, (int) prevOffsetOrTag, trailing) 2161 : acceptAffixPatternHelper(cp, ret1, state, item, str, prevOffsetOrTag); 2162 } else { 2163 // Go to nonignorable cp. 2164 // FALL THROUGH 2165 } 2166 } 2167 2168 // Fall through to the nonignorable code point found above. 2169 assert nextTypeOrCp != Integer.MIN_VALUE; 2170 typeOrCp = nextTypeOrCp; 2171 offsetOrTag = nextOffsetOrTag; 2172 } 2173 assert !isIgnorable(typeOrCp, state); 2174 2175 // Look for the next nonignorable code point after this nonignorable code point 2176 // to determine if we are at the end of the string. 2177 int nextTypeOrCp = typeOrCp; 2178 long nextOffsetOrTag = offsetOrTag; 2179 long firstOffsetOrTag = 0L; 2180 while (true) { 2181 nextOffsetOrTag = 2182 isString 2183 ? nextOffsetOrTag + Character.charCount(nextTypeOrCp) 2184 : AffixUtils.nextToken(nextOffsetOrTag, str); 2185 if (firstOffsetOrTag == 0L) firstOffsetOrTag = nextOffsetOrTag; 2186 if (isString ? nextOffsetOrTag >= str.length() : nextOffsetOrTag < 0) { 2187 nextTypeOrCp = -1; 2188 break; 2189 } 2190 nextTypeOrCp = 2191 isString 2192 ? Character.codePointAt(str, (int) nextOffsetOrTag) 2193 : AffixUtils.getTypeOrCp(nextOffsetOrTag); 2194 if (!isIgnorable(nextTypeOrCp, state)) break; 2195 } 2196 2197 // Nonignorable logic. 2198 return isString 2199 ? acceptStringNonIgnorable( 2200 cp, ret1, ret2, state, item, str, trailing, typeOrCp, firstOffsetOrTag, nextOffsetOrTag) 2201 : acceptAffixPatternNonIgnorable( 2202 cp, ret1, state, item, str, typeOrCp, firstOffsetOrTag, nextOffsetOrTag); 2203 } 2204 2205 /** 2206 * This method can add up to four items to the new list in <code>state</code>. 2207 * 2208 * <p>If <code>cp</code> is equal to any known ISO code or long name, copies <code>item</code> to 2209 * the new list in <code>state</code> and sets its ISO code to the corresponding currency. 2210 * 2211 * <p>If <code>cp</code> is the first code point of any ISO code or long name having more them one 2212 * code point in length, copies <code>item</code> to the new list in <code>state</code> along with 2213 * an instance of {@link TextTrieMap.ParseState} for tracking the following code points. 2214 * 2215 * @param cp The code point to check. 2216 * @param state The state object to update. 2217 * @param item The old state leading into the code point. 2218 */ 2219 private static void acceptCurrency( 2220 int cp, StateName nextName, ParserState state, StateItem item) { 2221 acceptCurrency(cp, nextName, null, state, item); 2222 } 2223 2224 private static long acceptCurrency( 2225 int cp, StateName returnTo1, StateName returnTo2, ParserState state, StateItem item) { 2226 if (item.sawCurrency) return 0L; 2227 long added = 0L; 2228 2229 // Accept from local currency information 2230 String str1, str2; 2231 Currency currency = state.properties.getCurrency(); 2232 if (currency != null) { 2233 str1 = currency.getName(state.symbols.getULocale(), Currency.SYMBOL_NAME, null); 2234 str2 = currency.getCurrencyCode(); 2235 // TODO: Should we also accept long names? In currency mode, they are in the CLDR data. 2236 } else { 2237 currency = state.symbols.getCurrency(); 2238 str1 = state.symbols.getCurrencySymbol(); 2239 str2 = state.symbols.getInternationalCurrencySymbol(); 2240 } 2241 added |= acceptString(cp, returnTo1, returnTo2, state, item, str1, 0, false); 2242 added |= acceptString(cp, returnTo1, returnTo2, state, item, str2, 0, false); 2243 for (int i = Long.numberOfTrailingZeros(added); (1L << i) <= added; i++) { 2244 if (((1L << i) & added) != 0) { 2245 state.getItem(i).sawCurrency = true; 2246 state.getItem(i).isoCode = str2; 2247 } 2248 } 2249 2250 // Accept from CLDR data 2251 if (state.parseCurrency) { 2252 ULocale uloc = state.symbols.getULocale(); 2253 TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie1 = 2254 Currency.openParseState(uloc, cp, Currency.LONG_NAME); 2255 TextTrieMap<Currency.CurrencyStringInfo>.ParseState trie2 = 2256 Currency.openParseState(uloc, cp, Currency.SYMBOL_NAME); 2257 added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie1); 2258 added |= acceptCurrencyHelper(cp, returnTo1, returnTo2, state, item, trie2); 2259 } 2260 2261 return added; 2262 } 2263 2264 /** 2265 * If <code>cp</code> is the next code point of any currency, copies <code>item</code> to the new 2266 * list in <code>state</code> along with an instance of {@link TextTrieMap.ParseState} for 2267 * tracking the following code points. 2268 * 2269 * <p>This method should only be called in a state following {@link #acceptCurrency}. 2270 * 2271 * @param cp The code point to check. 2272 * @param state The state object to update. 2273 * @param item The old state leading into the code point. 2274 */ 2275 private static void acceptCurrencyOffset(int cp, ParserState state, StateItem item) { 2276 acceptCurrencyHelper( 2277 cp, item.returnTo1, item.returnTo2, state, item, item.currentCurrencyTrieState); 2278 } 2279 2280 private static long acceptCurrencyHelper( 2281 int cp, 2282 StateName returnTo1, 2283 StateName returnTo2, 2284 ParserState state, 2285 StateItem item, 2286 TextTrieMap<Currency.CurrencyStringInfo>.ParseState trieState) { 2287 if (trieState == null) return 0L; 2288 trieState.accept(cp); 2289 long added = 0L; 2290 Iterator<Currency.CurrencyStringInfo> currentMatches = trieState.getCurrentMatches(); 2291 if (currentMatches != null) { 2292 // Match on current code point 2293 // TODO: What should happen with multiple currency matches? 2294 StateItem next = state.getNext().copyFrom(item, returnTo1, -1); 2295 next.returnTo1 = returnTo2; 2296 next.returnTo2 = null; 2297 next.sawCurrency = true; 2298 next.isoCode = currentMatches.next().getISOCode(); 2299 added |= 1L << state.lastInsertedIndex(); 2300 } 2301 if (!trieState.atEnd()) { 2302 // Prepare for matches on future code points 2303 StateItem next = state.getNext().copyFrom(item, StateName.INSIDE_CURRENCY, -1); 2304 next.returnTo1 = returnTo1; 2305 next.returnTo2 = returnTo2; 2306 next.currentCurrencyTrieState = trieState; 2307 added |= 1L << state.lastInsertedIndex(); 2308 } 2309 return added; 2310 } 2311 2312 private static long acceptDigitTrie( 2313 int cp, StateName nextName, ParserState state, StateItem item, DigitType type) { 2314 assert state.digitTrie != null; 2315 TextTrieMap<Byte>.ParseState trieState = state.digitTrie.openParseState(cp); 2316 if (trieState == null) return 0L; 2317 return acceptDigitTrieHelper(cp, nextName, state, item, type, trieState); 2318 } 2319 2320 private static void acceptDigitTrieOffset(int cp, ParserState state, StateItem item) { 2321 acceptDigitTrieHelper( 2322 cp, item.returnTo1, state, item, item.currentDigitType, item.currentDigitTrieState); 2323 } 2324 2325 private static long acceptDigitTrieHelper( 2326 int cp, 2327 StateName returnTo1, 2328 ParserState state, 2329 StateItem item, 2330 DigitType type, 2331 TextTrieMap<Byte>.ParseState trieState) { 2332 if (trieState == null) return 0L; 2333 trieState.accept(cp); 2334 long added = 0L; 2335 Iterator<Byte> currentMatches = trieState.getCurrentMatches(); 2336 if (currentMatches != null) { 2337 // Match on current code point 2338 byte digit = currentMatches.next(); 2339 StateItem next = state.getNext().copyFrom(item, returnTo1, -1); 2340 next.returnTo1 = null; 2341 recordDigit(next, digit, type); 2342 added |= 1L << state.lastInsertedIndex(); 2343 } 2344 if (!trieState.atEnd()) { 2345 // Prepare for matches on future code points 2346 StateItem next = state.getNext().copyFrom(item, StateName.INSIDE_DIGIT, -1); 2347 next.returnTo1 = returnTo1; 2348 next.currentDigitTrieState = trieState; 2349 next.currentDigitType = type; 2350 added |= 1L << state.lastInsertedIndex(); 2351 } 2352 return added; 2353 } 2354 2355 /** 2356 * Checks whether the two given code points are equal after applying case mapping as requested in 2357 * the ParserState. 2358 * 2359 * @see #acceptString 2360 * @see #acceptAffixPattern 2361 */ 2362 private static boolean codePointEquals(int cp1, int cp2, ParserState state) { 2363 if (!state.caseSensitive) { 2364 cp1 = UCharacter.foldCase(cp1, true); 2365 cp2 = UCharacter.foldCase(cp2, true); 2366 } 2367 return cp1 == cp2; 2368 } 2369 2370 /** 2371 * Checks whether the given code point is "ignorable" and should be skipped. BiDi control marks 2372 * are always ignorable, and whitespace is ignorable in lenient mode. 2373 * 2374 * <p>Returns false if cp is negative. 2375 * 2376 * @param cp The code point to test. 2377 * @param state The current {@link ParserState}, used for determining strict mode. 2378 * @return true if cp is ignorable; false otherwise. 2379 */ 2380 private static boolean isIgnorable(int cp, ParserState state) { 2381 if (cp < 0) return false; 2382 if (UNISET_BIDI.contains(cp)) return true; 2383 return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp); 2384 } 2385 } 2386