1 // 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 package com.ibm.icu.impl.number; 4 5 import com.ibm.icu.text.NumberFormat; 6 7 /** 8 * Performs manipulations on affix patterns: the prefix and suffix strings associated with a decimal 9 * format pattern. For example: 10 * 11 * <table> 12 * <tr><th>Affix Pattern</th><th>Example Unescaped (Formatted) String</th></tr> 13 * <tr><td>abc</td><td>abc</td></tr> 14 * <tr><td>ab-</td><td>ab</td></tr> 15 * <tr><td>ab'-'</td><td>ab-</td></tr> 16 * <tr><td>ab''</td><td>ab'</td></tr> 17 * </table> 18 * 19 * To manually iterate over tokens in a literal string, use the following pattern, which is designed 20 * to be efficient. 21 * 22 * <pre> 23 * long tag = 0L; 24 * while (AffixPatternUtils.hasNext(tag, patternString)) { 25 * tag = AffixPatternUtils.nextToken(tag, patternString); 26 * int typeOrCp = AffixPatternUtils.getTypeOrCp(tag); 27 * switch (typeOrCp) { 28 * case AffixPatternUtils.TYPE_MINUS_SIGN: 29 * // Current token is a minus sign. 30 * break; 31 * case AffixPatternUtils.TYPE_PLUS_SIGN: 32 * // Current token is a plus sign. 33 * break; 34 * case AffixPatternUtils.TYPE_PERCENT: 35 * // Current token is a percent sign. 36 * break; 37 * // ... other types ... 38 * default: 39 * // Current token is an arbitrary code point. 40 * // The variable typeOrCp is the code point. 41 * break; 42 * } 43 * } 44 * </pre> 45 */ 46 public class AffixUtils { 47 48 private static final int STATE_BASE = 0; 49 private static final int STATE_FIRST_QUOTE = 1; 50 private static final int STATE_INSIDE_QUOTE = 2; 51 private static final int STATE_AFTER_QUOTE = 3; 52 private static final int STATE_FIRST_CURR = 4; 53 private static final int STATE_SECOND_CURR = 5; 54 private static final int STATE_THIRD_CURR = 6; 55 private static final int STATE_FOURTH_CURR = 7; 56 private static final int STATE_FIFTH_CURR = 8; 57 private static final int STATE_OVERFLOW_CURR = 9; 58 59 /** Represents a literal character; the value is stored in the code point field. */ 60 private static final int TYPE_CODEPOINT = 0; 61 62 /** Represents a minus sign symbol '-'. */ 63 public static final int TYPE_MINUS_SIGN = -1; 64 65 /** Represents a plus sign symbol '+'. */ 66 public static final int TYPE_PLUS_SIGN = -2; 67 68 /** Represents a percent sign symbol '%'. */ 69 public static final int TYPE_PERCENT = -3; 70 71 /** Represents a permille sign symbol ''. */ 72 public static final int TYPE_PERMILLE = -4; 73 74 /** Represents a single currency symbol ''. */ 75 public static final int TYPE_CURRENCY_SINGLE = -5; 76 77 /** Represents a double currency symbol ''. */ 78 public static final int TYPE_CURRENCY_DOUBLE = -6; 79 80 /** Represents a triple currency symbol ''. */ 81 public static final int TYPE_CURRENCY_TRIPLE = -7; 82 83 /** Represents a quadruple currency symbol ''. */ 84 public static final int TYPE_CURRENCY_QUAD = -8; 85 86 /** Represents a quintuple currency symbol ''. */ 87 public static final int TYPE_CURRENCY_QUINT = -9; 88 89 /** Represents a sequence of six or more currency symbols. */ 90 public static final int TYPE_CURRENCY_OVERFLOW = -15; 91 92 public static interface SymbolProvider { 93 public CharSequence getSymbol(int type); 94 } 95 96 /** 97 * Estimates the number of code points present in an unescaped version of the affix pattern string 98 * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols 99 * consume one code point and that currencies consume as many code points as their symbol width. 100 * Used for computing padding width. 101 * 102 * @param patternString The original string whose width will be estimated. 103 * @return The length of the unescaped string. 104 */ 105 public static int estimateLength(CharSequence patternString) { 106 if (patternString == null) return 0; 107 int state = STATE_BASE; 108 int offset = 0; 109 int length = 0; 110 for (; offset < patternString.length(); ) { 111 int cp = Character.codePointAt(patternString, offset); 112 113 switch (state) { 114 case STATE_BASE: 115 if (cp == '\'') { 116 // First quote 117 state = STATE_FIRST_QUOTE; 118 } else { 119 // Unquoted symbol 120 length++; 121 } 122 break; 123 case STATE_FIRST_QUOTE: 124 if (cp == '\'') { 125 // Repeated quote 126 length++; 127 state = STATE_BASE; 128 } else { 129 // Quoted code point 130 length++; 131 state = STATE_INSIDE_QUOTE; 132 } 133 break; 134 case STATE_INSIDE_QUOTE: 135 if (cp == '\'') { 136 // End of quoted sequence 137 state = STATE_AFTER_QUOTE; 138 } else { 139 // Quoted code point 140 length++; 141 } 142 break; 143 case STATE_AFTER_QUOTE: 144 if (cp == '\'') { 145 // Double quote inside of quoted sequence 146 length++; 147 state = STATE_INSIDE_QUOTE; 148 } else { 149 // Unquoted symbol 150 length++; 151 } 152 break; 153 default: 154 throw new AssertionError(); 155 } 156 157 offset += Character.charCount(cp); 158 } 159 160 switch (state) { 161 case STATE_FIRST_QUOTE: 162 case STATE_INSIDE_QUOTE: 163 throw new IllegalArgumentException("Unterminated quote: \"" + patternString + "\""); 164 default: 165 break; 166 } 167 168 return length; 169 } 170 171 /** 172 * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern 173 * syntax. This function does not reverse-lookup symbols. 174 * 175 * <p>Example input: "-$x"; example output: "'-'$x" 176 * 177 * @param input The string to be escaped. 178 * @param output The string builder to which to append the escaped string. 179 * @return The number of chars (UTF-16 code units) appended to the output. 180 */ 181 public static int escape(CharSequence input, StringBuilder output) { 182 if (input == null) return 0; 183 int state = STATE_BASE; 184 int offset = 0; 185 int startLength = output.length(); 186 for (; offset < input.length(); ) { 187 int cp = Character.codePointAt(input, offset); 188 189 switch (cp) { 190 case '\'': 191 output.append("''"); 192 break; 193 194 case '-': 195 case '+': 196 case '%': 197 case '': 198 case '': 199 if (state == STATE_BASE) { 200 output.append('\''); 201 output.appendCodePoint(cp); 202 state = STATE_INSIDE_QUOTE; 203 } else { 204 output.appendCodePoint(cp); 205 } 206 break; 207 208 default: 209 if (state == STATE_INSIDE_QUOTE) { 210 output.append('\''); 211 output.appendCodePoint(cp); 212 state = STATE_BASE; 213 } else { 214 output.appendCodePoint(cp); 215 } 216 break; 217 } 218 offset += Character.charCount(cp); 219 } 220 221 if (state == STATE_INSIDE_QUOTE) { 222 output.append('\''); 223 } 224 225 return output.length() - startLength; 226 } 227 228 /** Version of {@link #escape} that returns a String, or null if input is null. */ 229 public static String escape(CharSequence input) { 230 if (input == null) return null; 231 StringBuilder sb = new StringBuilder(); 232 escape(input, sb); 233 return sb.toString(); 234 } 235 236 public static final NumberFormat.Field getFieldForType(int type) { 237 switch (type) { 238 case TYPE_MINUS_SIGN: 239 return NumberFormat.Field.SIGN; 240 case TYPE_PLUS_SIGN: 241 return NumberFormat.Field.SIGN; 242 case TYPE_PERCENT: 243 return NumberFormat.Field.PERCENT; 244 case TYPE_PERMILLE: 245 return NumberFormat.Field.PERMILLE; 246 case TYPE_CURRENCY_SINGLE: 247 return NumberFormat.Field.CURRENCY; 248 case TYPE_CURRENCY_DOUBLE: 249 return NumberFormat.Field.CURRENCY; 250 case TYPE_CURRENCY_TRIPLE: 251 return NumberFormat.Field.CURRENCY; 252 case TYPE_CURRENCY_QUAD: 253 return NumberFormat.Field.CURRENCY; 254 case TYPE_CURRENCY_QUINT: 255 return NumberFormat.Field.CURRENCY; 256 case TYPE_CURRENCY_OVERFLOW: 257 return NumberFormat.Field.CURRENCY; 258 default: 259 throw new AssertionError(); 260 } 261 } 262 263 /** 264 * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "", and 265 * "" with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the 266 * result into the NumberStringBuilder at the requested location. 267 * 268 * <p>Example input: "'-'x"; example output: "-$x" 269 * 270 * @param affixPattern The original string to be unescaped. 271 * @param output The NumberStringBuilder to mutate with the result. 272 * @param position The index into the NumberStringBuilder to insert the the string. 273 * @param provider An object to generate locale symbols. 274 * @return The length of the string added to affixPattern. 275 */ 276 public static int unescape( 277 CharSequence affixPattern, 278 NumberStringBuilder output, 279 int position, 280 SymbolProvider provider) { 281 assert affixPattern != null; 282 int length = 0; 283 long tag = 0L; 284 while (hasNext(tag, affixPattern)) { 285 tag = nextToken(tag, affixPattern); 286 int typeOrCp = getTypeOrCp(tag); 287 if (typeOrCp == TYPE_CURRENCY_OVERFLOW) { 288 // Don't go to the provider for this special case 289 length += output.insertCodePoint(position + length, 0xFFFD, NumberFormat.Field.CURRENCY); 290 } else if (typeOrCp < 0) { 291 length += output.insert(position + length, provider.getSymbol(typeOrCp), getFieldForType(typeOrCp)); 292 } else { 293 length += output.insertCodePoint(position + length, typeOrCp, null); 294 } 295 } 296 return length; 297 } 298 299 /** 300 * Sames as {@link #unescape}, but only calculates the code point count. More efficient than {@link #unescape} 301 * if you only need the length but not the string itself. 302 * 303 * @param affixPattern The original string to be unescaped. 304 * @param provider An object to generate locale symbols. 305 * @return The number of code points in the unescaped string. 306 */ 307 public static int unescapedCodePointCount(CharSequence affixPattern, SymbolProvider provider) { 308 int length = 0; 309 long tag = 0L; 310 while (hasNext(tag, affixPattern)) { 311 tag = nextToken(tag, affixPattern); 312 int typeOrCp = getTypeOrCp(tag); 313 if (typeOrCp == TYPE_CURRENCY_OVERFLOW) { 314 length += 1; 315 } else if (typeOrCp < 0) { 316 CharSequence symbol = provider.getSymbol(typeOrCp); 317 length += Character.codePointCount(symbol, 0, symbol.length()); 318 } else { 319 length += 1; 320 } 321 } 322 return length; 323 } 324 325 /** 326 * Checks whether the given affix pattern contains at least one token of the given type, which is 327 * one of the constants "TYPE_" in {@link AffixUtils}. 328 * 329 * @param affixPattern The affix pattern to check. 330 * @param type The token type. 331 * @return true if the affix pattern contains the given token type; false otherwise. 332 */ 333 public static boolean containsType(CharSequence affixPattern, int type) { 334 if (affixPattern == null || affixPattern.length() == 0) { 335 return false; 336 } 337 long tag = 0L; 338 while (hasNext(tag, affixPattern)) { 339 tag = nextToken(tag, affixPattern); 340 if (getTypeOrCp(tag) == type) { 341 return true; 342 } 343 } 344 return false; 345 } 346 347 /** 348 * Checks whether the specified affix pattern has any unquoted currency symbols (""). 349 * 350 * @param affixPattern The string to check for currency symbols. 351 * @return true if the literal has at least one unquoted currency symbol; false otherwise. 352 */ 353 public static boolean hasCurrencySymbols(CharSequence affixPattern) { 354 if (affixPattern == null || affixPattern.length() == 0) return false; 355 long tag = 0L; 356 while (hasNext(tag, affixPattern)) { 357 tag = nextToken(tag, affixPattern); 358 int typeOrCp = getTypeOrCp(tag); 359 if (typeOrCp < 0 && getFieldForType(typeOrCp) == NumberFormat.Field.CURRENCY) { 360 return true; 361 } 362 } 363 return false; 364 } 365 366 /** 367 * Replaces all occurrences of tokens with the given type with the given replacement char. 368 * 369 * @param affixPattern The source affix pattern (does not get modified). 370 * @param type The token type. 371 * @param replacementChar The char to substitute in place of chars of the given token type. 372 * @return A string containing the new affix pattern. 373 */ 374 public static String replaceType(CharSequence affixPattern, int type, char replacementChar) { 375 if (affixPattern == null || affixPattern.length() == 0) return ""; 376 char[] chars = affixPattern.toString().toCharArray(); 377 long tag = 0L; 378 while (hasNext(tag, affixPattern)) { 379 tag = nextToken(tag, affixPattern); 380 if (getTypeOrCp(tag) == type) { 381 int offset = getOffset(tag); 382 chars[offset - 1] = replacementChar; 383 } 384 } 385 return new String(chars); 386 } 387 388 /** 389 * Returns the next token from the affix pattern. 390 * 391 * @param tag A bitmask used for keeping track of state from token to token. The initial value 392 * should be 0L. 393 * @param patternString The affix pattern. 394 * @return The bitmask tag to pass to the next call of this method to retrieve the following token 395 * (never negative), or -1 if there were no more tokens in the affix pattern. 396 * @see #hasNext 397 */ 398 public static long nextToken(long tag, CharSequence patternString) { 399 int offset = getOffset(tag); 400 int state = getState(tag); 401 for (; offset < patternString.length(); ) { 402 int cp = Character.codePointAt(patternString, offset); 403 int count = Character.charCount(cp); 404 405 switch (state) { 406 case STATE_BASE: 407 switch (cp) { 408 case '\'': 409 state = STATE_FIRST_QUOTE; 410 offset += count; 411 // continue to the next code point 412 break; 413 case '-': 414 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); 415 case '+': 416 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); 417 case '%': 418 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); 419 case '': 420 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); 421 case '': 422 state = STATE_FIRST_CURR; 423 offset += count; 424 // continue to the next code point 425 break; 426 default: 427 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 428 } 429 break; 430 case STATE_FIRST_QUOTE: 431 if (cp == '\'') { 432 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 433 } else { 434 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 435 } 436 case STATE_INSIDE_QUOTE: 437 if (cp == '\'') { 438 state = STATE_AFTER_QUOTE; 439 offset += count; 440 // continue to the next code point 441 break; 442 } else { 443 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 444 } 445 case STATE_AFTER_QUOTE: 446 if (cp == '\'') { 447 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 448 } else { 449 state = STATE_BASE; 450 // re-evaluate this code point 451 break; 452 } 453 case STATE_FIRST_CURR: 454 if (cp == '') { 455 state = STATE_SECOND_CURR; 456 offset += count; 457 // continue to the next code point 458 break; 459 } else { 460 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 461 } 462 case STATE_SECOND_CURR: 463 if (cp == '') { 464 state = STATE_THIRD_CURR; 465 offset += count; 466 // continue to the next code point 467 break; 468 } else { 469 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 470 } 471 case STATE_THIRD_CURR: 472 if (cp == '') { 473 state = STATE_FOURTH_CURR; 474 offset += count; 475 // continue to the next code point 476 break; 477 } else { 478 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 479 } 480 case STATE_FOURTH_CURR: 481 if (cp == '') { 482 state = STATE_FIFTH_CURR; 483 offset += count; 484 // continue to the next code point 485 break; 486 } else { 487 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 488 } 489 case STATE_FIFTH_CURR: 490 if (cp == '') { 491 state = STATE_OVERFLOW_CURR; 492 offset += count; 493 // continue to the next code point 494 break; 495 } else { 496 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 497 } 498 case STATE_OVERFLOW_CURR: 499 if (cp == '') { 500 offset += count; 501 // continue to the next code point and loop back to this state 502 break; 503 } else { 504 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 505 } 506 default: 507 throw new AssertionError(); 508 } 509 } 510 // End of string 511 switch (state) { 512 case STATE_BASE: 513 // No more tokens in string. 514 return -1L; 515 case STATE_FIRST_QUOTE: 516 case STATE_INSIDE_QUOTE: 517 // For consistent behavior with the JDK and ICU 58, throw an exception here. 518 throw new IllegalArgumentException( 519 "Unterminated quote in pattern affix: \"" + patternString + "\""); 520 case STATE_AFTER_QUOTE: 521 // No more tokens in string. 522 return -1L; 523 case STATE_FIRST_CURR: 524 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 525 case STATE_SECOND_CURR: 526 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 527 case STATE_THIRD_CURR: 528 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 529 case STATE_FOURTH_CURR: 530 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 531 case STATE_FIFTH_CURR: 532 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 533 case STATE_OVERFLOW_CURR: 534 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 535 default: 536 throw new AssertionError(); 537 } 538 } 539 540 /** 541 * Returns whether the affix pattern string has any more tokens to be retrieved from a call to 542 * {@link #nextToken}. 543 * 544 * @param tag The bitmask tag of the previous token, as returned by {@link #nextToken}. 545 * @param string The affix pattern. 546 * @return true if there are more tokens to consume; false otherwise. 547 */ 548 public static boolean hasNext(long tag, CharSequence string) { 549 assert tag >= 0; 550 int state = getState(tag); 551 int offset = getOffset(tag); 552 // Special case: the last character in string is an end quote. 553 if (state == STATE_INSIDE_QUOTE 554 && offset == string.length() - 1 555 && string.charAt(offset) == '\'') { 556 return false; 557 } else if (state != STATE_BASE) { 558 return true; 559 } else { 560 return offset < string.length(); 561 } 562 } 563 564 /** 565 * This function helps determine the identity of the token consumed by {@link #nextToken}. 566 * Converts from a bitmask tag, based on a call to {@link #nextToken}, to its corresponding symbol 567 * type or code point. 568 * 569 * @param tag The bitmask tag of the current token, as returned by {@link #nextToken}. 570 * @return If less than zero, a symbol type corresponding to one of the <code>TYPE_</code> 571 * constants, such as {@link #TYPE_MINUS_SIGN}. If greater than or equal to zero, a literal 572 * code point. 573 */ 574 public static int getTypeOrCp(long tag) { 575 assert tag >= 0; 576 int type = getType(tag); 577 return (type == TYPE_CODEPOINT) ? getCodePoint(tag) : -type; 578 } 579 580 /** 581 * Encodes the given values into a 64-bit tag. 582 * 583 * <ul> 584 * <li>Bits 0-31 => offset (int32) 585 * <li>Bits 32-35 => type (uint4) 586 * <li>Bits 36-39 => state (uint4) 587 * <li>Bits 40-60 => code point (uint21) 588 * <li>Bits 61-63 => unused 589 * </ul> 590 */ 591 private static long makeTag(int offset, int type, int state, int cp) { 592 long tag = 0L; 593 tag |= offset; 594 tag |= (-(long) type) << 32; 595 tag |= ((long) state) << 36; 596 tag |= ((long) cp) << 40; 597 assert tag >= 0; 598 return tag; 599 } 600 601 static int getOffset(long tag) { 602 return (int) (tag & 0xffffffff); 603 } 604 605 static int getType(long tag) { 606 return (int) ((tag >>> 32) & 0xf); 607 } 608 609 static int getState(long tag) { 610 return (int) ((tag >>> 36) & 0xf); 611 } 612 613 static int getCodePoint(long tag) { 614 return (int) (tag >>> 40); 615 } 616 } 617