Home | History | Annotate | Download | only in i18n
// © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_FORMATTING
      7 
      8 #include "number_affixutils.h"
      9 #include "unicode/utf16.h"
     10 #include "unicode/uniset.h"
     11 
     12 using namespace icu;
     13 using namespace icu::number;
     14 using namespace icu::number::impl;
     15 
// Out-of-line definitions of the destructors for the two interface types used
// in this file; both are explicitly defaulted.
// NOTE(review): presumably declared virtual in number_affixutils.h, with the
// definitions placed here to anchor them in this translation unit -- confirm.
TokenConsumer::~TokenConsumer() = default;
SymbolProvider::~SymbolProvider() = default;
     18 
     19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
     20     AffixPatternState state = STATE_BASE;
     21     int32_t offset = 0;
     22     int32_t length = 0;
     23     for (; offset < patternString.length();) {
     24         UChar32 cp = patternString.char32At(offset);
     25 
     26         switch (state) {
     27             case STATE_BASE:
     28                 if (cp == u'\'') {
     29                     // First quote
     30                     state = STATE_FIRST_QUOTE;
     31                 } else {
     32                     // Unquoted symbol
     33                     length++;
     34                 }
     35                 break;
     36             case STATE_FIRST_QUOTE:
     37                 if (cp == u'\'') {
     38                     // Repeated quote
     39                     length++;
     40                     state = STATE_BASE;
     41                 } else {
     42                     // Quoted code point
     43                     length++;
     44                     state = STATE_INSIDE_QUOTE;
     45                 }
     46                 break;
     47             case STATE_INSIDE_QUOTE:
     48                 if (cp == u'\'') {
     49                     // End of quoted sequence
     50                     state = STATE_AFTER_QUOTE;
     51                 } else {
     52                     // Quoted code point
     53                     length++;
     54                 }
     55                 break;
     56             case STATE_AFTER_QUOTE:
     57                 if (cp == u'\'') {
     58                     // Double quote inside of quoted sequence
     59                     length++;
     60                     state = STATE_INSIDE_QUOTE;
     61                 } else {
     62                     // Unquoted symbol
     63                     length++;
     64                 }
     65                 break;
     66             default:
     67                 U_ASSERT(false);
     68         }
     69 
     70         offset += U16_LENGTH(cp);
     71     }
     72 
     73     switch (state) {
     74         case STATE_FIRST_QUOTE:
     75         case STATE_INSIDE_QUOTE:
     76             status = U_ILLEGAL_ARGUMENT_ERROR;
     77             break;
     78         default:
     79             break;
     80     }
     81 
     82     return length;
     83 }
     84 
     85 UnicodeString AffixUtils::escape(const UnicodeString &input) {
     86     AffixPatternState state = STATE_BASE;
     87     int32_t offset = 0;
     88     UnicodeString output;
     89     for (; offset < input.length();) {
     90         UChar32 cp = input.char32At(offset);
     91 
     92         switch (cp) {
     93             case u'\'':
     94                 output.append(u"''", -1);
     95                 break;
     96 
     97             case u'-':
     98             case u'+':
     99             case u'%':
    100             case u'':
    101             case u'':
    102                 if (state == STATE_BASE) {
    103                     output.append(u'\'');
    104                     output.append(cp);
    105                     state = STATE_INSIDE_QUOTE;
    106                 } else {
    107                     output.append(cp);
    108                 }
    109                 break;
    110 
    111             default:
    112                 if (state == STATE_INSIDE_QUOTE) {
    113                     output.append(u'\'');
    114                     output.append(cp);
    115                     state = STATE_BASE;
    116                 } else {
    117                     output.append(cp);
    118                 }
    119                 break;
    120         }
    121         offset += U16_LENGTH(cp);
    122     }
    123 
    124     if (state == STATE_INSIDE_QUOTE) {
    125         output.append(u'\'');
    126     }
    127 
    128     return output;
    129 }
    130 
    131 Field AffixUtils::getFieldForType(AffixPatternType type) {
    132     switch (type) {
    133         case TYPE_MINUS_SIGN:
    134             return Field::UNUM_SIGN_FIELD;
    135         case TYPE_PLUS_SIGN:
    136             return Field::UNUM_SIGN_FIELD;
    137         case TYPE_PERCENT:
    138             return Field::UNUM_PERCENT_FIELD;
    139         case TYPE_PERMILLE:
    140             return Field::UNUM_PERMILL_FIELD;
    141         case TYPE_CURRENCY_SINGLE:
    142             return Field::UNUM_CURRENCY_FIELD;
    143         case TYPE_CURRENCY_DOUBLE:
    144             return Field::UNUM_CURRENCY_FIELD;
    145         case TYPE_CURRENCY_TRIPLE:
    146             return Field::UNUM_CURRENCY_FIELD;
    147         case TYPE_CURRENCY_QUAD:
    148             return Field::UNUM_CURRENCY_FIELD;
    149         case TYPE_CURRENCY_QUINT:
    150             return Field::UNUM_CURRENCY_FIELD;
    151         case TYPE_CURRENCY_OVERFLOW:
    152             return Field::UNUM_CURRENCY_FIELD;
    153         default:
    154             U_ASSERT(false);
    155             return Field::UNUM_FIELD_COUNT; // suppress "control reaches end of non-void function"
    156     }
    157 }
    158 
    159 int32_t
    160 AffixUtils::unescape(const UnicodeString &affixPattern, NumberStringBuilder &output, int32_t position,
    161                      const SymbolProvider &provider, UErrorCode &status) {
    162     int32_t length = 0;
    163     AffixTag tag;
    164     while (hasNext(tag, affixPattern)) {
    165         tag = nextToken(tag, affixPattern, status);
    166         if (U_FAILURE(status)) { return length; }
    167         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
    168             // Don't go to the provider for this special case
    169             length += output.insertCodePoint(position + length, 0xFFFD, UNUM_CURRENCY_FIELD, status);
    170         } else if (tag.type < 0) {
    171             length += output.insert(
    172                     position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
    173         } else {
    174             length += output.insertCodePoint(position + length, tag.codePoint, UNUM_FIELD_COUNT, status);
    175         }
    176     }
    177     return length;
    178 }
    179 
    180 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
    181                                             const SymbolProvider &provider, UErrorCode &status) {
    182     int32_t length = 0;
    183     AffixTag tag;
    184     while (hasNext(tag, affixPattern)) {
    185         tag = nextToken(tag, affixPattern, status);
    186         if (U_FAILURE(status)) { return length; }
    187         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
    188             length += 1;
    189         } else if (tag.type < 0) {
    190             length += provider.getSymbol(tag.type).length();
    191         } else {
    192             length += U16_LENGTH(tag.codePoint);
    193         }
    194     }
    195     return length;
    196 }
    197 
    198 bool
    199 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
    200     if (affixPattern.length() == 0) {
    201         return false;
    202     }
    203     AffixTag tag;
    204     while (hasNext(tag, affixPattern)) {
    205         tag = nextToken(tag, affixPattern, status);
    206         if (U_FAILURE(status)) { return false; }
    207         if (tag.type == type) {
    208             return true;
    209         }
    210     }
    211     return false;
    212 }
    213 
    214 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
    215     if (affixPattern.length() == 0) {
    216         return false;
    217     }
    218     AffixTag tag;
    219     while (hasNext(tag, affixPattern)) {
    220         tag = nextToken(tag, affixPattern, status);
    221         if (U_FAILURE(status)) { return false; }
    222         if (tag.type < 0 && getFieldForType(tag.type) == UNUM_CURRENCY_FIELD) {
    223             return true;
    224         }
    225     }
    226     return false;
    227 }
    228 
    229 UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
    230                                       char16_t replacementChar, UErrorCode &status) {
    231     UnicodeString output(affixPattern); // copy
    232     if (affixPattern.length() == 0) {
    233         return output;
    234     };
    235     AffixTag tag;
    236     while (hasNext(tag, affixPattern)) {
    237         tag = nextToken(tag, affixPattern, status);
    238         if (U_FAILURE(status)) { return output; }
    239         if (tag.type == type) {
    240             output.replace(tag.offset - 1, 1, replacementChar);
    241         }
    242     }
    243     return output;
    244 }
    245 
    246 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
    247                                                   const UnicodeSet& ignorables, UErrorCode& status) {
    248     if (affixPattern.length() == 0) {
    249         return true;
    250     };
    251     AffixTag tag;
    252     while (hasNext(tag, affixPattern)) {
    253         tag = nextToken(tag, affixPattern, status);
    254         if (U_FAILURE(status)) { return false; }
    255         if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
    256             return false;
    257         }
    258     }
    259     return true;
    260 }
    261 
    262 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
    263                                      UErrorCode& status) {
    264     if (affixPattern.length() == 0) {
    265         return;
    266     };
    267     AffixTag tag;
    268     while (hasNext(tag, affixPattern)) {
    269         tag = nextToken(tag, affixPattern, status);
    270         if (U_FAILURE(status)) { return; }
    271         consumer.consumeToken(tag.type, tag.codePoint, status);
    272         if (U_FAILURE(status)) { return; }
    273     }
    274 }
    275 
    276 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    277     int32_t offset = tag.offset;
    278     int32_t state = tag.state;
    279     for (; offset < patternString.length();) {
    280         UChar32 cp = patternString.char32At(offset);
    281         int32_t count = U16_LENGTH(cp);
    282 
    283         switch (state) {
    284             case STATE_BASE:
    285                 switch (cp) {
    286                     case u'\'':
    287                         state = STATE_FIRST_QUOTE;
    288                         offset += count;
    289                         // continue to the next code point
    290                         break;
    291                     case u'-':
    292                         return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
    293                     case u'+':
    294                         return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
    295                     case u'%':
    296                         return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
    297                     case u'':
    298                         return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
    299                     case u'':
    300                         state = STATE_FIRST_CURR;
    301                         offset += count;
    302                         // continue to the next code point
    303                         break;
    304                     default:
    305                         return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    306                 }
    307                 break;
    308             case STATE_FIRST_QUOTE:
    309                 if (cp == u'\'') {
    310                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    311                 } else {
    312                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    313                 }
    314             case STATE_INSIDE_QUOTE:
    315                 if (cp == u'\'') {
    316                     state = STATE_AFTER_QUOTE;
    317                     offset += count;
    318                     // continue to the next code point
    319                     break;
    320                 } else {
    321                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    322                 }
    323             case STATE_AFTER_QUOTE:
    324                 if (cp == u'\'') {
    325                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    326                 } else {
    327                     state = STATE_BASE;
    328                     // re-evaluate this code point
    329                     break;
    330                 }
    331             case STATE_FIRST_CURR:
    332                 if (cp == u'') {
    333                     state = STATE_SECOND_CURR;
    334                     offset += count;
    335                     // continue to the next code point
    336                     break;
    337                 } else {
    338                     return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    339                 }
    340             case STATE_SECOND_CURR:
    341                 if (cp == u'') {
    342                     state = STATE_THIRD_CURR;
    343                     offset += count;
    344                     // continue to the next code point
    345                     break;
    346                 } else {
    347                     return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    348                 }
    349             case STATE_THIRD_CURR:
    350                 if (cp == u'') {
    351                     state = STATE_FOURTH_CURR;
    352                     offset += count;
    353                     // continue to the next code point
    354                     break;
    355                 } else {
    356                     return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    357                 }
    358             case STATE_FOURTH_CURR:
    359                 if (cp == u'') {
    360                     state = STATE_FIFTH_CURR;
    361                     offset += count;
    362                     // continue to the next code point
    363                     break;
    364                 } else {
    365                     return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    366                 }
    367             case STATE_FIFTH_CURR:
    368                 if (cp == u'') {
    369                     state = STATE_OVERFLOW_CURR;
    370                     offset += count;
    371                     // continue to the next code point
    372                     break;
    373                 } else {
    374                     return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    375                 }
    376             case STATE_OVERFLOW_CURR:
    377                 if (cp == u'') {
    378                     offset += count;
    379                     // continue to the next code point and loop back to this state
    380                     break;
    381                 } else {
    382                     return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    383                 }
    384             default:
    385                 U_ASSERT(false);
    386         }
    387     }
    388     // End of string
    389     switch (state) {
    390         case STATE_BASE:
    391             // No more tokens in string.
    392             return {-1};
    393         case STATE_FIRST_QUOTE:
    394         case STATE_INSIDE_QUOTE:
    395             // For consistent behavior with the JDK and ICU 58, set an error here.
    396             status = U_ILLEGAL_ARGUMENT_ERROR;
    397             return {-1};
    398         case STATE_AFTER_QUOTE:
    399             // No more tokens in string.
    400             return {-1};
    401         case STATE_FIRST_CURR:
    402             return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    403         case STATE_SECOND_CURR:
    404             return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    405         case STATE_THIRD_CURR:
    406             return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    407         case STATE_FOURTH_CURR:
    408             return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    409         case STATE_FIFTH_CURR:
    410             return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    411         case STATE_OVERFLOW_CURR:
    412             return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    413         default:
    414             U_ASSERT(false);
    415             return {-1}; // suppress "control reaches end of non-void function"
    416     }
    417 }
    418 
    419 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
    420     // First check for the {-1} and default initializer syntax.
    421     if (tag.offset < 0) {
    422         return false;
    423     } else if (tag.offset == 0) {
    424         return string.length() > 0;
    425     }
    426     // The rest of the fields are safe to use now.
    427     // Special case: the last character in string is an end quote.
    428     if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
    429         string.charAt(tag.offset) == u'\'') {
    430         return false;
    431     } else if (tag.state != STATE_BASE) {
    432         return true;
    433     } else {
    434         return tag.offset < string.length();
    435     }
    436 }
    437 
    438 #endif /* #if !UCONFIG_NO_FORMATTING */
    439