Home | History | Annotate | Download | only in number
      1 //  2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 package com.ibm.icu.impl.number;
      4 
      5 import com.ibm.icu.text.NumberFormat;
      6 
      7 /**
      8  * Performs manipulations on affix patterns: the prefix and suffix strings associated with a decimal
      9  * format pattern. For example:
     10  *
     11  * <table>
     12  * <tr><th>Affix Pattern</th><th>Example Unescaped (Formatted) String</th></tr>
     13  * <tr><td>abc</td><td>abc</td></tr>
 * <tr><td>ab-</td><td>ab&minus;</td></tr>
     15  * <tr><td>ab'-'</td><td>ab-</td></tr>
     16  * <tr><td>ab''</td><td>ab'</td></tr>
     17  * </table>
     18  *
     19  * To manually iterate over tokens in a literal string, use the following pattern, which is designed
     20  * to be efficient.
     21  *
 * <pre>
 * long tag = 0L;
 * while (AffixUtils.hasNext(tag, patternString)) {
 *   tag = AffixUtils.nextToken(tag, patternString);
 *   int typeOrCp = AffixUtils.getTypeOrCp(tag);
 *   switch (typeOrCp) {
 *     case AffixUtils.TYPE_MINUS_SIGN:
 *       // Current token is a minus sign.
 *       break;
 *     case AffixUtils.TYPE_PLUS_SIGN:
 *       // Current token is a plus sign.
 *       break;
 *     case AffixUtils.TYPE_PERCENT:
 *       // Current token is a percent sign.
 *       break;
 *     // ... other types ...
 *     default:
 *       // Current token is an arbitrary code point.
 *       // The variable typeOrCp is the code point.
 *       break;
 *   }
 * }
 * </pre>
     45  */
     46 public class AffixUtils {
     47 
  // Scanner states. nextToken() packs the current state into 4 bits of the tag it returns, so
  // every value here must stay within 0..15. estimateLength() and escape() use the quote-related
  // states as plain local variables.

  /** Outside any quoted sequence and not in the middle of a run of currency signs. */
  private static final int STATE_BASE = 0;
  /** An opening quote was just read; the next code point decides between '' and quoted text. */
  private static final int STATE_FIRST_QUOTE = 1;
  /** Inside a quoted sequence; code points are literals until the next quote. */
  private static final int STATE_INSIDE_QUOTE = 2;
  /** A quote was just read inside a quoted sequence; a second quote is a literal apostrophe. */
  private static final int STATE_AFTER_QUOTE = 3;
  /** One currency sign has been read so far. */
  private static final int STATE_FIRST_CURR = 4;
  /** Two consecutive currency signs have been read so far. */
  private static final int STATE_SECOND_CURR = 5;
  /** Three consecutive currency signs have been read so far. */
  private static final int STATE_THIRD_CURR = 6;
  /** Four consecutive currency signs have been read so far. */
  private static final int STATE_FOURTH_CURR = 7;
  /** Five consecutive currency signs have been read so far. */
  private static final int STATE_FIFTH_CURR = 8;
  /** Six or more consecutive currency signs have been read so far. */
  private static final int STATE_OVERFLOW_CURR = 9;

  // Token types. Symbol types are negative so that getTypeOrCp() can return either a type (< 0)
  // or a literal code point (>= 0) in one int. makeTag() stores the negated type in 4 bits, so
  // the magnitude of every type must stay within 0..15.

  /** Represents a literal character; the value is stored in the code point field. */
  private static final int TYPE_CODEPOINT = 0;

  /** Represents a minus sign symbol '-'. */
  public static final int TYPE_MINUS_SIGN = -1;

  /** Represents a plus sign symbol '+'. */
  public static final int TYPE_PLUS_SIGN = -2;

  /** Represents a percent sign symbol '%'. */
  public static final int TYPE_PERCENT = -3;

  /** Represents a permille sign symbol (U+2030). */
  public static final int TYPE_PERMILLE = -4;

  /** Represents a single currency symbol: one currency sign (U+00A4). */
  public static final int TYPE_CURRENCY_SINGLE = -5;

  /** Represents a double currency symbol: two consecutive currency signs (U+00A4). */
  public static final int TYPE_CURRENCY_DOUBLE = -6;

  /** Represents a triple currency symbol: three consecutive currency signs (U+00A4). */
  public static final int TYPE_CURRENCY_TRIPLE = -7;

  /** Represents a quadruple currency symbol: four consecutive currency signs (U+00A4). */
  public static final int TYPE_CURRENCY_QUAD = -8;

  /** Represents a quintuple currency symbol: five consecutive currency signs (U+00A4). */
  public static final int TYPE_CURRENCY_QUINT = -9;

  /** Represents a sequence of six or more currency symbols. */
  public static final int TYPE_CURRENCY_OVERFLOW = -15;
     91 
     92   public static interface SymbolProvider {
     93     public CharSequence getSymbol(int type);
     94   }
     95 
     96   /**
     97    * Estimates the number of code points present in an unescaped version of the affix pattern string
     98    * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols
     99    * consume one code point and that currencies consume as many code points as their symbol width.
    100    * Used for computing padding width.
    101    *
    102    * @param patternString The original string whose width will be estimated.
    103    * @return The length of the unescaped string.
    104    */
    105   public static int estimateLength(CharSequence patternString) {
    106     if (patternString == null) return 0;
    107     int state = STATE_BASE;
    108     int offset = 0;
    109     int length = 0;
    110     for (; offset < patternString.length(); ) {
    111       int cp = Character.codePointAt(patternString, offset);
    112 
    113       switch (state) {
    114         case STATE_BASE:
    115           if (cp == '\'') {
    116             // First quote
    117             state = STATE_FIRST_QUOTE;
    118           } else {
    119             // Unquoted symbol
    120             length++;
    121           }
    122           break;
    123         case STATE_FIRST_QUOTE:
    124           if (cp == '\'') {
    125             // Repeated quote
    126             length++;
    127             state = STATE_BASE;
    128           } else {
    129             // Quoted code point
    130             length++;
    131             state = STATE_INSIDE_QUOTE;
    132           }
    133           break;
    134         case STATE_INSIDE_QUOTE:
    135           if (cp == '\'') {
    136             // End of quoted sequence
    137             state = STATE_AFTER_QUOTE;
    138           } else {
    139             // Quoted code point
    140             length++;
    141           }
    142           break;
    143         case STATE_AFTER_QUOTE:
    144           if (cp == '\'') {
    145             // Double quote inside of quoted sequence
    146             length++;
    147             state = STATE_INSIDE_QUOTE;
    148           } else {
    149             // Unquoted symbol
    150             length++;
    151           }
    152           break;
    153         default:
    154           throw new AssertionError();
    155       }
    156 
    157       offset += Character.charCount(cp);
    158     }
    159 
    160     switch (state) {
    161       case STATE_FIRST_QUOTE:
    162       case STATE_INSIDE_QUOTE:
    163         throw new IllegalArgumentException("Unterminated quote: \"" + patternString + "\"");
    164       default:
    165         break;
    166     }
    167 
    168     return length;
    169   }
    170 
    171   /**
    172    * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern
    173    * syntax. This function does not reverse-lookup symbols.
    174    *
    175    * <p>Example input: "-$x"; example output: "'-'$x"
    176    *
    177    * @param input The string to be escaped.
    178    * @param output The string builder to which to append the escaped string.
    179    * @return The number of chars (UTF-16 code units) appended to the output.
    180    */
    181   public static int escape(CharSequence input, StringBuilder output) {
    182     if (input == null) return 0;
    183     int state = STATE_BASE;
    184     int offset = 0;
    185     int startLength = output.length();
    186     for (; offset < input.length(); ) {
    187       int cp = Character.codePointAt(input, offset);
    188 
    189       switch (cp) {
    190         case '\'':
    191           output.append("''");
    192           break;
    193 
    194         case '-':
    195         case '+':
    196         case '%':
    197         case '':
    198         case '':
    199           if (state == STATE_BASE) {
    200             output.append('\'');
    201             output.appendCodePoint(cp);
    202             state = STATE_INSIDE_QUOTE;
    203           } else {
    204             output.appendCodePoint(cp);
    205           }
    206           break;
    207 
    208         default:
    209           if (state == STATE_INSIDE_QUOTE) {
    210             output.append('\'');
    211             output.appendCodePoint(cp);
    212             state = STATE_BASE;
    213           } else {
    214             output.appendCodePoint(cp);
    215           }
    216           break;
    217       }
    218       offset += Character.charCount(cp);
    219     }
    220 
    221     if (state == STATE_INSIDE_QUOTE) {
    222       output.append('\'');
    223     }
    224 
    225     return output.length() - startLength;
    226   }
    227 
    228   /** Version of {@link #escape} that returns a String, or null if input is null. */
    229   public static String escape(CharSequence input) {
    230     if (input == null) return null;
    231     StringBuilder sb = new StringBuilder();
    232     escape(input, sb);
    233     return sb.toString();
    234   }
    235 
    236   public static final NumberFormat.Field getFieldForType(int type) {
    237     switch (type) {
    238       case TYPE_MINUS_SIGN:
    239         return NumberFormat.Field.SIGN;
    240       case TYPE_PLUS_SIGN:
    241         return NumberFormat.Field.SIGN;
    242       case TYPE_PERCENT:
    243         return NumberFormat.Field.PERCENT;
    244       case TYPE_PERMILLE:
    245         return NumberFormat.Field.PERMILLE;
    246       case TYPE_CURRENCY_SINGLE:
    247         return NumberFormat.Field.CURRENCY;
    248       case TYPE_CURRENCY_DOUBLE:
    249         return NumberFormat.Field.CURRENCY;
    250       case TYPE_CURRENCY_TRIPLE:
    251         return NumberFormat.Field.CURRENCY;
    252       case TYPE_CURRENCY_QUAD:
    253         return NumberFormat.Field.CURRENCY;
    254       case TYPE_CURRENCY_QUINT:
    255         return NumberFormat.Field.CURRENCY;
    256       case TYPE_CURRENCY_OVERFLOW:
    257         return NumberFormat.Field.CURRENCY;
    258       default:
    259         throw new AssertionError();
    260     }
    261   }
    262 
    263   /**
    264    * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "", and
    265    * "" with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the
    266    * result into the NumberStringBuilder at the requested location.
    267    *
    268    * <p>Example input: "'-'x"; example output: "-$x"
    269    *
    270    * @param affixPattern The original string to be unescaped.
    271    * @param output The NumberStringBuilder to mutate with the result.
    272    * @param position The index into the NumberStringBuilder to insert the the string.
    273    * @param provider An object to generate locale symbols.
    274    * @return The length of the string added to affixPattern.
    275    */
    276   public static int unescape(
    277       CharSequence affixPattern,
    278       NumberStringBuilder output,
    279       int position,
    280       SymbolProvider provider) {
    281     assert affixPattern != null;
    282     int length = 0;
    283     long tag = 0L;
    284     while (hasNext(tag, affixPattern)) {
    285       tag = nextToken(tag, affixPattern);
    286       int typeOrCp = getTypeOrCp(tag);
    287       if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
    288         // Don't go to the provider for this special case
    289         length += output.insertCodePoint(position + length, 0xFFFD, NumberFormat.Field.CURRENCY);
    290       } else if (typeOrCp < 0) {
    291         length += output.insert(position + length, provider.getSymbol(typeOrCp), getFieldForType(typeOrCp));
    292       } else {
    293         length += output.insertCodePoint(position + length, typeOrCp, null);
    294       }
    295     }
    296     return length;
    297   }
    298 
    299   /**
    300    * Sames as {@link #unescape}, but only calculates the code point count.  More efficient than {@link #unescape}
    301    * if you only need the length but not the string itself.
    302    *
    303    * @param affixPattern The original string to be unescaped.
    304    * @param provider An object to generate locale symbols.
    305    * @return The number of code points in the unescaped string.
    306    */
    307   public static int unescapedCodePointCount(CharSequence affixPattern, SymbolProvider provider) {
    308     int length = 0;
    309     long tag = 0L;
    310     while (hasNext(tag, affixPattern)) {
    311       tag = nextToken(tag, affixPattern);
    312       int typeOrCp = getTypeOrCp(tag);
    313       if (typeOrCp == TYPE_CURRENCY_OVERFLOW) {
    314         length += 1;
    315       } else if (typeOrCp < 0) {
    316         CharSequence symbol = provider.getSymbol(typeOrCp);
    317         length += Character.codePointCount(symbol, 0, symbol.length());
    318       } else {
    319         length += 1;
    320       }
    321     }
    322     return length;
    323   }
    324 
    325   /**
    326    * Checks whether the given affix pattern contains at least one token of the given type, which is
    327    * one of the constants "TYPE_" in {@link AffixUtils}.
    328    *
    329    * @param affixPattern The affix pattern to check.
    330    * @param type The token type.
    331    * @return true if the affix pattern contains the given token type; false otherwise.
    332    */
    333   public static boolean containsType(CharSequence affixPattern, int type) {
    334     if (affixPattern == null || affixPattern.length() == 0) {
    335         return false;
    336     }
    337     long tag = 0L;
    338     while (hasNext(tag, affixPattern)) {
    339       tag = nextToken(tag, affixPattern);
    340       if (getTypeOrCp(tag) == type) {
    341         return true;
    342       }
    343     }
    344     return false;
    345   }
    346 
    347   /**
    348    * Checks whether the specified affix pattern has any unquoted currency symbols ("").
    349    *
    350    * @param affixPattern The string to check for currency symbols.
    351    * @return true if the literal has at least one unquoted currency symbol; false otherwise.
    352    */
    353   public static boolean hasCurrencySymbols(CharSequence affixPattern) {
    354     if (affixPattern == null || affixPattern.length() == 0) return false;
    355     long tag = 0L;
    356     while (hasNext(tag, affixPattern)) {
    357       tag = nextToken(tag, affixPattern);
    358       int typeOrCp = getTypeOrCp(tag);
    359       if (typeOrCp < 0 && getFieldForType(typeOrCp) == NumberFormat.Field.CURRENCY) {
    360         return true;
    361       }
    362     }
    363     return false;
    364   }
    365 
    366   /**
    367    * Replaces all occurrences of tokens with the given type with the given replacement char.
    368    *
    369    * @param affixPattern The source affix pattern (does not get modified).
    370    * @param type The token type.
    371    * @param replacementChar The char to substitute in place of chars of the given token type.
    372    * @return A string containing the new affix pattern.
    373    */
    374   public static String replaceType(CharSequence affixPattern, int type, char replacementChar) {
    375     if (affixPattern == null || affixPattern.length() == 0) return "";
    376     char[] chars = affixPattern.toString().toCharArray();
    377     long tag = 0L;
    378     while (hasNext(tag, affixPattern)) {
    379       tag = nextToken(tag, affixPattern);
    380       if (getTypeOrCp(tag) == type) {
    381         int offset = getOffset(tag);
    382         chars[offset - 1] = replacementChar;
    383       }
    384     }
    385     return new String(chars);
    386   }
    387 
    388   /**
    389    * Returns the next token from the affix pattern.
    390    *
    391    * @param tag A bitmask used for keeping track of state from token to token. The initial value
    392    *     should be 0L.
    393    * @param patternString The affix pattern.
    394    * @return The bitmask tag to pass to the next call of this method to retrieve the following token
    395    *     (never negative), or -1 if there were no more tokens in the affix pattern.
    396    * @see #hasNext
    397    */
    398   public static long nextToken(long tag, CharSequence patternString) {
    399     int offset = getOffset(tag);
    400     int state = getState(tag);
    401     for (; offset < patternString.length(); ) {
    402       int cp = Character.codePointAt(patternString, offset);
    403       int count = Character.charCount(cp);
    404 
    405       switch (state) {
    406         case STATE_BASE:
    407           switch (cp) {
    408             case '\'':
    409               state = STATE_FIRST_QUOTE;
    410               offset += count;
    411               // continue to the next code point
    412               break;
    413             case '-':
    414               return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
    415             case '+':
    416               return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
    417             case '%':
    418               return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
    419             case '':
    420               return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
    421             case '':
    422               state = STATE_FIRST_CURR;
    423               offset += count;
    424               // continue to the next code point
    425               break;
    426             default:
    427               return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    428           }
    429           break;
    430         case STATE_FIRST_QUOTE:
    431           if (cp == '\'') {
    432             return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    433           } else {
    434             return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    435           }
    436         case STATE_INSIDE_QUOTE:
    437           if (cp == '\'') {
    438             state = STATE_AFTER_QUOTE;
    439             offset += count;
    440             // continue to the next code point
    441             break;
    442           } else {
    443             return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    444           }
    445         case STATE_AFTER_QUOTE:
    446           if (cp == '\'') {
    447             return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    448           } else {
    449             state = STATE_BASE;
    450             // re-evaluate this code point
    451             break;
    452           }
    453         case STATE_FIRST_CURR:
    454           if (cp == '') {
    455             state = STATE_SECOND_CURR;
    456             offset += count;
    457             // continue to the next code point
    458             break;
    459           } else {
    460             return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    461           }
    462         case STATE_SECOND_CURR:
    463           if (cp == '') {
    464             state = STATE_THIRD_CURR;
    465             offset += count;
    466             // continue to the next code point
    467             break;
    468           } else {
    469             return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    470           }
    471         case STATE_THIRD_CURR:
    472           if (cp == '') {
    473             state = STATE_FOURTH_CURR;
    474             offset += count;
    475             // continue to the next code point
    476             break;
    477           } else {
    478             return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    479           }
    480         case STATE_FOURTH_CURR:
    481           if (cp == '') {
    482             state = STATE_FIFTH_CURR;
    483             offset += count;
    484             // continue to the next code point
    485             break;
    486           } else {
    487             return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    488           }
    489         case STATE_FIFTH_CURR:
    490           if (cp == '') {
    491             state = STATE_OVERFLOW_CURR;
    492             offset += count;
    493             // continue to the next code point
    494             break;
    495           } else {
    496             return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    497           }
    498         case STATE_OVERFLOW_CURR:
    499           if (cp == '') {
    500             offset += count;
    501             // continue to the next code point and loop back to this state
    502             break;
    503           } else {
    504             return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    505           }
    506         default:
    507           throw new AssertionError();
    508       }
    509     }
    510     // End of string
    511     switch (state) {
    512       case STATE_BASE:
    513         // No more tokens in string.
    514         return -1L;
    515       case STATE_FIRST_QUOTE:
    516       case STATE_INSIDE_QUOTE:
    517         // For consistent behavior with the JDK and ICU 58, throw an exception here.
    518         throw new IllegalArgumentException(
    519             "Unterminated quote in pattern affix: \"" + patternString + "\"");
    520       case STATE_AFTER_QUOTE:
    521         // No more tokens in string.
    522         return -1L;
    523       case STATE_FIRST_CURR:
    524         return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    525       case STATE_SECOND_CURR:
    526         return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    527       case STATE_THIRD_CURR:
    528         return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    529       case STATE_FOURTH_CURR:
    530         return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    531       case STATE_FIFTH_CURR:
    532         return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    533       case STATE_OVERFLOW_CURR:
    534         return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    535       default:
    536         throw new AssertionError();
    537     }
    538   }
    539 
    540   /**
    541    * Returns whether the affix pattern string has any more tokens to be retrieved from a call to
    542    * {@link #nextToken}.
    543    *
    544    * @param tag The bitmask tag of the previous token, as returned by {@link #nextToken}.
    545    * @param string The affix pattern.
    546    * @return true if there are more tokens to consume; false otherwise.
    547    */
    548   public static boolean hasNext(long tag, CharSequence string) {
    549     assert tag >= 0;
    550     int state = getState(tag);
    551     int offset = getOffset(tag);
    552     // Special case: the last character in string is an end quote.
    553     if (state == STATE_INSIDE_QUOTE
    554         && offset == string.length() - 1
    555         && string.charAt(offset) == '\'') {
    556       return false;
    557     } else if (state != STATE_BASE) {
    558       return true;
    559     } else {
    560       return offset < string.length();
    561     }
    562   }
    563 
    564   /**
    565    * This function helps determine the identity of the token consumed by {@link #nextToken}.
    566    * Converts from a bitmask tag, based on a call to {@link #nextToken}, to its corresponding symbol
    567    * type or code point.
    568    *
    569    * @param tag The bitmask tag of the current token, as returned by {@link #nextToken}.
    570    * @return If less than zero, a symbol type corresponding to one of the <code>TYPE_</code>
    571    *     constants, such as {@link #TYPE_MINUS_SIGN}. If greater than or equal to zero, a literal
    572    *     code point.
    573    */
    574   public static int getTypeOrCp(long tag) {
    575     assert tag >= 0;
    576     int type = getType(tag);
    577     return (type == TYPE_CODEPOINT) ? getCodePoint(tag) : -type;
    578   }
    579 
    580   /**
    581    * Encodes the given values into a 64-bit tag.
    582    *
    583    * <ul>
    584    *   <li>Bits 0-31 => offset (int32)
    585    *   <li>Bits 32-35 => type (uint4)
    586    *   <li>Bits 36-39 => state (uint4)
    587    *   <li>Bits 40-60 => code point (uint21)
    588    *   <li>Bits 61-63 => unused
    589    * </ul>
    590    */
    591   private static long makeTag(int offset, int type, int state, int cp) {
    592     long tag = 0L;
    593     tag |= offset;
    594     tag |= (-(long) type) << 32;
    595     tag |= ((long) state) << 36;
    596     tag |= ((long) cp) << 40;
    597     assert tag >= 0;
    598     return tag;
    599   }
    600 
    601   static int getOffset(long tag) {
    602     return (int) (tag & 0xffffffff);
    603   }
    604 
    605   static int getType(long tag) {
    606     return (int) ((tag >>> 32) & 0xf);
    607   }
    608 
    609   static int getState(long tag) {
    610     return (int) ((tag >>> 36) & 0xf);
    611   }
    612 
    613   static int getCodePoint(long tag) {
    614     return (int) (tag >>> 40);
    615   }
    616 }
    617