Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  * Copyright (C) 2015, International Business Machines
      5  * Corporation and others.  All Rights Reserved.
      6  *
      7  * file name: affixpatternparser.cpp
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_FORMATTING
     13 
     14 #include "unicode/dcfmtsym.h"
     15 #include "unicode/plurrule.h"
     16 #include "unicode/ucurr.h"
     17 #include "affixpatternparser.h"
     18 #include "charstr.h"
     19 #include "precision.h"
     20 #include "uassert.h"
     21 #include "unistrappender.h"
     22 
     23         static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
     24 
     25 static UChar gPercent = 0x25;
     26 static UChar gPerMill = 0x2030;
     27 static UChar gNegative = 0x2D;
     28 static UChar gPositive = 0x2B;
     29 
     30 #define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | (l & 0xFF)))
     31 
     32 #define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))
     33 
     34 #define UNPACK_LONG(c) (((c) >> 8) & 0x80)
     35 
     36 #define UNPACK_LENGTH(c) ((c) & 0xFF)
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 static int32_t
     41 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
     42     if (buffer[idx] != 0x27 || idx + 1 == len) {
     43         *token = buffer[idx];
     44         return 1;
     45     }
     46     *token = buffer[idx + 1];
     47     if (buffer[idx + 1] == 0xA4) {
     48         int32_t i = 2;
     49         for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
     50           ;
     51         return i;
     52     }
     53     return 2;
     54 }
     55 
     56 static int32_t
     57 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
     58     *token = buffer[idx];
     59     int32_t max;
     60     switch (buffer[idx]) {
     61     case 0x27:
     62         max = 2;
     63         break;
     64     case 0xA4:
     65         max = 3;
     66         break;
     67     default:
     68         max = 1;
     69         break;
     70     }
     71     int32_t i = 1;
     72     for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
     73       ;
     74     return i;
     75 }
     76 
     77 CurrencyAffixInfo::CurrencyAffixInfo()
     78         : fSymbol(gDefaultSymbols, 1),
     79           fISO(gDefaultSymbols, 2),
     80           fLong(DigitAffix(gDefaultSymbols, 3)),
     81           fIsDefault(TRUE) {
     82 }
     83 
     84 void
     85 CurrencyAffixInfo::set(
     86         const char *locale,
     87         const PluralRules *rules,
     88         const UChar *currency,
     89         UErrorCode &status) {
     90     if (U_FAILURE(status)) {
     91         return;
     92     }
     93     fIsDefault = FALSE;
     94     if (currency == NULL) {
     95         fSymbol.setTo(gDefaultSymbols, 1);
     96         fISO.setTo(gDefaultSymbols, 2);
     97         fLong.remove();
     98         fLong.append(gDefaultSymbols, 3);
     99         fIsDefault = TRUE;
    100         return;
    101     }
    102     int32_t len;
    103     UBool unusedIsChoice;
    104     const UChar *symbol = ucurr_getName(
    105             currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
    106             &len, &status);
    107     if (U_FAILURE(status)) {
    108         return;
    109     }
    110     fSymbol.setTo(symbol, len);
    111     fISO.setTo(currency, u_strlen(currency));
    112     fLong.remove();
    113     StringEnumeration* keywords = rules->getKeywords(status);
    114     if (U_FAILURE(status)) {
    115         return;
    116     }
    117     const UnicodeString* pluralCount;
    118     while ((pluralCount = keywords->snext(status)) != NULL) {
    119         CharString pCount;
    120         pCount.appendInvariantChars(*pluralCount, status);
    121         const UChar *pluralName = ucurr_getPluralName(
    122             currency, locale, &unusedIsChoice, pCount.data(),
    123             &len, &status);
    124         fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
    125     }
    126     delete keywords;
    127 }
    128 
    129 void
    130 CurrencyAffixInfo::adjustPrecision(
    131         const UChar *currency, const UCurrencyUsage usage,
    132         FixedPrecision &precision, UErrorCode &status) {
    133     if (U_FAILURE(status)) {
    134         return;
    135     }
    136 
    137     int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
    138             currency, usage, &status);
    139     precision.fMin.setFracDigitCount(digitCount);
    140     precision.fMax.setFracDigitCount(digitCount);
    141     double increment = ucurr_getRoundingIncrementForUsage(
    142             currency, usage, &status);
    143     if (increment == 0.0) {
    144         precision.fRoundingIncrement.clear();
    145     } else {
    146         precision.fRoundingIncrement.set(increment);
    147         // guard against round-off error
    148         precision.fRoundingIncrement.round(6);
    149     }
    150 }
    151 
    152 void
    153 AffixPattern::addLiteral(
    154         const UChar *literal, int32_t start, int32_t len) {
    155     char32Count += u_countChar32(literal + start, len);
    156     literals.append(literal, start, len);
    157     int32_t tlen = tokens.length();
    158     // Takes 4 UChars to encode maximum literal length.
    159     UChar *tokenChars = tokens.getBuffer(tlen + 4);
    160 
    161     // find start of literal size. May be tlen if there is no literal.
    162     // While finding start of literal size, compute literal length
    163     int32_t literalLength = 0;
    164     int32_t tLiteralStart = tlen;
    165     while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
    166         tLiteralStart--;
    167         literalLength <<= 8;
    168         literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
    169     }
    170     // Add number of chars we just added to literal
    171     literalLength += len;
    172 
    173     // Now encode the new length starting at tLiteralStart
    174     tlen = tLiteralStart;
    175     tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
    176     literalLength >>= 8;
    177     while (literalLength) {
    178         tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
    179         literalLength >>= 8;
    180     }
    181     tokens.releaseBuffer(tlen);
    182 }
    183 
    184 void
    185 AffixPattern::add(ETokenType t) {
    186     add(t, 1);
    187 }
    188 
    189 void
    190 AffixPattern::addCurrency(uint8_t count) {
    191     add(kCurrency, count);
    192 }
    193 
    194 void
    195 AffixPattern::add(ETokenType t, uint8_t count) {
    196     U_ASSERT(t != kLiteral);
    197     char32Count += count;
    198     switch (t) {
    199     case kCurrency:
    200         hasCurrencyToken = TRUE;
    201         break;
    202     case kPercent:
    203         hasPercentToken = TRUE;
    204         break;
    205     case kPerMill:
    206         hasPermillToken = TRUE;
    207         break;
    208     default:
    209         // Do nothing
    210         break;
    211     }
    212     tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
    213 }
    214 
    215 AffixPattern &
    216 AffixPattern::append(const AffixPattern &other) {
    217     AffixPatternIterator iter;
    218     other.iterator(iter);
    219     UnicodeString literal;
    220     while (iter.nextToken()) {
    221         switch (iter.getTokenType()) {
    222         case kLiteral:
    223             iter.getLiteral(literal);
    224             addLiteral(literal.getBuffer(), 0, literal.length());
    225             break;
    226         case kCurrency:
    227             addCurrency(iter.getTokenLength());
    228             break;
    229         default:
    230             add(iter.getTokenType());
    231             break;
    232         }
    233     }
    234     return *this;
    235 }
    236 
    237 void
    238 AffixPattern::remove() {
    239     tokens.remove();
    240     literals.remove();
    241     hasCurrencyToken = FALSE;
    242     hasPercentToken = FALSE;
    243     hasPermillToken = FALSE;
    244     char32Count = 0;
    245 }
    246 
    247 // escapes literals for strings where special characters are NOT escaped
    248 // except for apostrophe.
    249 static void escapeApostropheInLiteral(
    250         const UnicodeString &literal, UnicodeStringAppender &appender) {
    251     int32_t len = literal.length();
    252     const UChar *buffer = literal.getBuffer();
    253     for (int32_t i = 0; i < len; ++i) {
    254         UChar ch = buffer[i];
    255         switch (ch) {
    256             case 0x27:
    257                 appender.append((UChar) 0x27);
    258                 appender.append((UChar) 0x27);
    259                 break;
    260             default:
    261                 appender.append(ch);
    262                 break;
    263         }
    264     }
    265 }
    266 
    267 
    268 // escapes literals for user strings where special characters in literals
    269 // are escaped with apostrophe.
    270 static void escapeLiteral(
    271         const UnicodeString &literal, UnicodeStringAppender &appender) {
    272     int32_t len = literal.length();
    273     const UChar *buffer = literal.getBuffer();
    274     for (int32_t i = 0; i < len; ++i) {
    275         UChar ch = buffer[i];
    276         switch (ch) {
    277             case 0x27:
    278                 appender.append((UChar) 0x27);
    279                 appender.append((UChar) 0x27);
    280                 break;
    281             case 0x25:
    282                 appender.append((UChar) 0x27);
    283                 appender.append((UChar) 0x25);
    284                 appender.append((UChar) 0x27);
    285                 break;
    286             case 0x2030:
    287                 appender.append((UChar) 0x27);
    288                 appender.append((UChar) 0x2030);
    289                 appender.append((UChar) 0x27);
    290                 break;
    291             case 0xA4:
    292                 appender.append((UChar) 0x27);
    293                 appender.append((UChar) 0xA4);
    294                 appender.append((UChar) 0x27);
    295                 break;
    296             case 0x2D:
    297                 appender.append((UChar) 0x27);
    298                 appender.append((UChar) 0x2D);
    299                 appender.append((UChar) 0x27);
    300                 break;
    301             case 0x2B:
    302                 appender.append((UChar) 0x27);
    303                 appender.append((UChar) 0x2B);
    304                 appender.append((UChar) 0x27);
    305                 break;
    306             default:
    307                 appender.append(ch);
    308                 break;
    309         }
    310     }
    311 }
    312 
    313 UnicodeString &
    314 AffixPattern::toString(UnicodeString &appendTo) const {
    315     AffixPatternIterator iter;
    316     iterator(iter);
    317     UnicodeStringAppender appender(appendTo);
    318     UnicodeString literal;
    319     while (iter.nextToken()) {
    320         switch (iter.getTokenType()) {
    321         case kLiteral:
    322             escapeApostropheInLiteral(iter.getLiteral(literal), appender);
    323             break;
    324         case kPercent:
    325             appender.append((UChar) 0x27);
    326             appender.append((UChar) 0x25);
    327             break;
    328         case kPerMill:
    329             appender.append((UChar) 0x27);
    330             appender.append((UChar) 0x2030);
    331             break;
    332         case kCurrency:
    333             {
    334                 appender.append((UChar) 0x27);
    335                 int32_t cl = iter.getTokenLength();
    336                 for (int32_t i = 0; i < cl; ++i) {
    337                     appender.append((UChar) 0xA4);
    338                 }
    339             }
    340             break;
    341         case kNegative:
    342             appender.append((UChar) 0x27);
    343             appender.append((UChar) 0x2D);
    344             break;
    345         case kPositive:
    346             appender.append((UChar) 0x27);
    347             appender.append((UChar) 0x2B);
    348             break;
    349         default:
    350             U_ASSERT(FALSE);
    351             break;
    352         }
    353     }
    354     return appendTo;
    355 }
    356 
    357 UnicodeString &
    358 AffixPattern::toUserString(UnicodeString &appendTo) const {
    359     AffixPatternIterator iter;
    360     iterator(iter);
    361     UnicodeStringAppender appender(appendTo);
    362     UnicodeString literal;
    363     while (iter.nextToken()) {
    364         switch (iter.getTokenType()) {
    365         case kLiteral:
    366             escapeLiteral(iter.getLiteral(literal), appender);
    367             break;
    368         case kPercent:
    369             appender.append((UChar) 0x25);
    370             break;
    371         case kPerMill:
    372             appender.append((UChar) 0x2030);
    373             break;
    374         case kCurrency:
    375             {
    376                 int32_t cl = iter.getTokenLength();
    377                 for (int32_t i = 0; i < cl; ++i) {
    378                     appender.append((UChar) 0xA4);
    379                 }
    380             }
    381             break;
    382         case kNegative:
    383             appender.append((UChar) 0x2D);
    384             break;
    385         case kPositive:
    386             appender.append((UChar) 0x2B);
    387             break;
    388         default:
    389             U_ASSERT(FALSE);
    390             break;
    391         }
    392     }
    393     return appendTo;
    394 }
    395 
    396 class AffixPatternAppender : public UMemory {
    397 public:
    398     AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
    399 
    400     inline void append(UChar x) {
    401         if (fIdx == UPRV_LENGTHOF(fBuffer)) {
    402             fDest->addLiteral(fBuffer, 0, fIdx);
    403             fIdx = 0;
    404         }
    405         fBuffer[fIdx++] = x;
    406     }
    407 
    408     inline void append(UChar32 x) {
    409         if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
    410             fDest->addLiteral(fBuffer, 0, fIdx);
    411             fIdx = 0;
    412         }
    413         U16_APPEND_UNSAFE(fBuffer, fIdx, x);
    414     }
    415 
    416     inline void flush() {
    417         if (fIdx) {
    418             fDest->addLiteral(fBuffer, 0, fIdx);
    419         }
    420         fIdx = 0;
    421     }
    422 
    423     /**
    424      * flush the buffer when we go out of scope.
    425      */
    426     ~AffixPatternAppender() {
    427         flush();
    428     }
    429 private:
    430     AffixPattern *fDest;
    431     int32_t fIdx;
    432     UChar fBuffer[32];
    433     AffixPatternAppender(const AffixPatternAppender &other);
    434     AffixPatternAppender &operator=(const AffixPatternAppender &other);
    435 };
    436 
    437 
    438 AffixPattern &
    439 AffixPattern::parseUserAffixString(
    440         const UnicodeString &affixStr,
    441         AffixPattern &appendTo,
    442         UErrorCode &status) {
    443     if (U_FAILURE(status)) {
    444         return appendTo;
    445     }
    446     int32_t len = affixStr.length();
    447     const UChar *buffer = affixStr.getBuffer();
    448     // 0 = not quoted; 1 = quoted.
    449     int32_t state = 0;
    450     AffixPatternAppender appender(appendTo);
    451     for (int32_t i = 0; i < len; ) {
    452         UChar token;
    453         int32_t tokenSize = nextUserToken(buffer, i, len, &token);
    454         i += tokenSize;
    455         if (token == 0x27 && tokenSize == 1) { // quote
    456             state = 1 - state;
    457             continue;
    458         }
    459         if (state == 0) {
    460             switch (token) {
    461             case 0x25:
    462                 appender.flush();
    463                 appendTo.add(kPercent, 1);
    464                 break;
    465             case 0x27:  // double quote
    466                 appender.append((UChar) 0x27);
    467                 break;
    468             case 0x2030:
    469                 appender.flush();
    470                 appendTo.add(kPerMill, 1);
    471                 break;
    472             case 0x2D:
    473                 appender.flush();
    474                 appendTo.add(kNegative, 1);
    475                 break;
    476             case 0x2B:
    477                 appender.flush();
    478                 appendTo.add(kPositive, 1);
    479                 break;
    480             case 0xA4:
    481                 appender.flush();
    482                 appendTo.add(kCurrency, tokenSize);
    483                 break;
    484             default:
    485                 appender.append(token);
    486                 break;
    487             }
    488         } else {
    489             switch (token) {
    490             case 0x27:  // double quote
    491                 appender.append((UChar) 0x27);
    492                 break;
    493             case 0xA4: // included b/c tokenSize can be > 1
    494                 for (int32_t j = 0; j < tokenSize; ++j) {
    495                     appender.append((UChar) 0xA4);
    496                 }
    497                 break;
    498             default:
    499                 appender.append(token);
    500                 break;
    501             }
    502         }
    503     }
    504     return appendTo;
    505 }
    506 
    507 AffixPattern &
    508 AffixPattern::parseAffixString(
    509         const UnicodeString &affixStr,
    510         AffixPattern &appendTo,
    511         UErrorCode &status) {
    512     if (U_FAILURE(status)) {
    513         return appendTo;
    514     }
    515     int32_t len = affixStr.length();
    516     const UChar *buffer = affixStr.getBuffer();
    517     for (int32_t i = 0; i < len; ) {
    518         UChar token;
    519         int32_t tokenSize = nextToken(buffer, i, len, &token);
    520         if (tokenSize == 1) {
    521             int32_t literalStart = i;
    522             ++i;
    523             while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
    524                 ++i;
    525             }
    526             appendTo.addLiteral(buffer, literalStart, i - literalStart);
    527 
    528             // If we reached end of string, we are done
    529             if (i == len) {
    530                 return appendTo;
    531             }
    532         }
    533         i += tokenSize;
    534         switch (token) {
    535         case 0x25:
    536             appendTo.add(kPercent, 1);
    537             break;
    538         case 0x2030:
    539             appendTo.add(kPerMill, 1);
    540             break;
    541         case 0x2D:
    542             appendTo.add(kNegative, 1);
    543             break;
    544         case 0x2B:
    545             appendTo.add(kPositive, 1);
    546             break;
    547         case 0xA4:
    548             {
    549                 if (tokenSize - 1 > 3) {
    550                     status = U_PARSE_ERROR;
    551                     return appendTo;
    552                 }
    553                 appendTo.add(kCurrency, tokenSize - 1);
    554             }
    555             break;
    556         default:
    557             appendTo.addLiteral(&token, 0, 1);
    558             break;
    559         }
    560     }
    561     return appendTo;
    562 }
    563 
    564 AffixPatternIterator &
    565 AffixPattern::iterator(AffixPatternIterator &result) const {
    566     result.nextLiteralIndex = 0;
    567     result.lastLiteralLength = 0;
    568     result.nextTokenIndex = 0;
    569     result.tokens = &tokens;
    570     result.literals = &literals;
    571     return result;
    572 }
    573 
    574 UBool
    575 AffixPatternIterator::nextToken() {
    576     int32_t tlen = tokens->length();
    577     if (nextTokenIndex == tlen) {
    578         return FALSE;
    579     }
    580     ++nextTokenIndex;
    581     const UChar *tokenBuffer = tokens->getBuffer();
    582     if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
    583             AffixPattern::kLiteral) {
    584         while (nextTokenIndex < tlen &&
    585                 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
    586             ++nextTokenIndex;
    587         }
    588         lastLiteralLength = 0;
    589         int32_t i = nextTokenIndex - 1;
    590         for (; UNPACK_LONG(tokenBuffer[i]); --i) {
    591             lastLiteralLength <<= 8;
    592             lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
    593         }
    594         lastLiteralLength <<= 8;
    595         lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
    596         nextLiteralIndex += lastLiteralLength;
    597     }
    598     return TRUE;
    599 }
    600 
    601 AffixPattern::ETokenType
    602 AffixPatternIterator::getTokenType() const {
    603     return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
    604 }
    605 
    606 UnicodeString &
    607 AffixPatternIterator::getLiteral(UnicodeString &result) const {
    608     const UChar *buffer = literals->getBuffer();
    609     result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
    610     return result;
    611 }
    612 
    613 int32_t
    614 AffixPatternIterator::getTokenLength() const {
    615     const UChar *tokenBuffer = tokens->getBuffer();
    616     AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
    617     return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
    618 }
    619 
    620 AffixPatternParser::AffixPatternParser()
    621         : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
    622 }
    623 
    624 AffixPatternParser::AffixPatternParser(
    625         const DecimalFormatSymbols &symbols) {
    626     setDecimalFormatSymbols(symbols);
    627 }
    628 
    629 void
    630 AffixPatternParser::setDecimalFormatSymbols(
    631         const DecimalFormatSymbols &symbols) {
    632     fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
    633     fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
    634     fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
    635     fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
    636 }
    637 
    638 PluralAffix &
    639 AffixPatternParser::parse(
    640         const AffixPattern &affixPattern,
    641         const CurrencyAffixInfo &currencyAffixInfo,
    642         PluralAffix &appendTo,
    643         UErrorCode &status) const {
    644     if (U_FAILURE(status)) {
    645         return appendTo;
    646     }
    647     AffixPatternIterator iter;
    648     affixPattern.iterator(iter);
    649     UnicodeString literal;
    650     while (iter.nextToken()) {
    651         switch (iter.getTokenType()) {
    652         case AffixPattern::kPercent:
    653             appendTo.append(fPercent, UNUM_PERCENT_FIELD);
    654             break;
    655         case AffixPattern::kPerMill:
    656             appendTo.append(fPermill, UNUM_PERMILL_FIELD);
    657             break;
    658         case AffixPattern::kNegative:
    659             appendTo.append(fNegative, UNUM_SIGN_FIELD);
    660             break;
    661         case AffixPattern::kPositive:
    662             appendTo.append(fPositive, UNUM_SIGN_FIELD);
    663             break;
    664         case AffixPattern::kCurrency:
    665             switch (iter.getTokenLength()) {
    666                 case 1:
    667                     appendTo.append(
    668                             currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
    669                     break;
    670                 case 2:
    671                     appendTo.append(
    672                             currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
    673                     break;
    674                 case 3:
    675                     appendTo.append(
    676                             currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
    677                     break;
    678                 default:
    679                     U_ASSERT(FALSE);
    680                     break;
    681             }
    682             break;
    683         case AffixPattern::kLiteral:
    684             appendTo.append(iter.getLiteral(literal));
    685             break;
    686         default:
    687             U_ASSERT(FALSE);
    688             break;
    689         }
    690     }
    691     return appendTo;
    692 }
    693 
    694 
    695 U_NAMESPACE_END
    696 #endif /* #if !UCONFIG_NO_FORMATTING */
    697