Home | History | Annotate | Download | only in i18n
      1 /*
      2  * Copyright (C) 2015, International Business Machines
      3  * Corporation and others.  All Rights Reserved.
      4  *
      5  * file name: affixpatternparser.cpp
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_FORMATTING
     11 
     12 #include "unicode/dcfmtsym.h"
     13 #include "unicode/plurrule.h"
     14 #include "unicode/ucurr.h"
     15 #include "affixpatternparser.h"
     16 #include "charstr.h"
     17 #include "precision.h"
     18 #include "uassert.h"
     19 #include "unistrappender.h"
     20 
     21         static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
     22 
     23 static UChar gPercent = 0x25;
     24 static UChar gPerMill = 0x2030;
     25 static UChar gNegative = 0x2D;
     26 static UChar gPositive = 0x2B;
     27 
     28 #define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | (l & 0xFF)))
     29 
     30 #define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))
     31 
     32 #define UNPACK_LONG(c) (((c) >> 8) & 0x80)
     33 
     34 #define UNPACK_LENGTH(c) ((c) & 0xFF)
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 static int32_t
     39 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
     40     if (buffer[idx] != 0x27 || idx + 1 == len) {
     41         *token = buffer[idx];
     42         return 1;
     43     }
     44     *token = buffer[idx + 1];
     45     if (buffer[idx + 1] == 0xA4) {
     46         int32_t i = 2;
     47         for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i);
     48         return i;
     49     }
     50     return 2;
     51 }
     52 
     53 static int32_t
     54 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
     55     *token = buffer[idx];
     56     int32_t max;
     57     switch (buffer[idx]) {
     58     case 0x27:
     59         max = 2;
     60         break;
     61     case 0xA4:
     62         max = 3;
     63         break;
     64     default:
     65         max = 1;
     66         break;
     67     }
     68     int32_t i = 1;
     69     for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i);
     70     return i;
     71 }
     72 
     73 CurrencyAffixInfo::CurrencyAffixInfo()
     74         : fSymbol(gDefaultSymbols, 1),
     75           fISO(gDefaultSymbols, 2),
     76           fLong(DigitAffix(gDefaultSymbols, 3)),
     77           fIsDefault(TRUE) {
     78 }
     79 
     80 void
     81 CurrencyAffixInfo::set(
     82         const char *locale,
     83         const PluralRules *rules,
     84         const UChar *currency,
     85         UErrorCode &status) {
     86     if (U_FAILURE(status)) {
     87         return;
     88     }
     89     fIsDefault = FALSE;
     90     if (currency == NULL) {
     91         fSymbol.setTo(gDefaultSymbols, 1);
     92         fISO.setTo(gDefaultSymbols, 2);
     93         fLong.remove();
     94         fLong.append(gDefaultSymbols, 3);
     95         fIsDefault = TRUE;
     96         return;
     97     }
     98     int32_t len;
     99     UBool unusedIsChoice;
    100     const UChar *symbol = ucurr_getName(
    101             currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
    102             &len, &status);
    103     if (U_FAILURE(status)) {
    104         return;
    105     }
    106     fSymbol.setTo(symbol, len);
    107     fISO.setTo(currency, u_strlen(currency));
    108     fLong.remove();
    109     StringEnumeration* keywords = rules->getKeywords(status);
    110     if (U_FAILURE(status)) {
    111         return;
    112     }
    113     const UnicodeString* pluralCount;
    114     while ((pluralCount = keywords->snext(status)) != NULL) {
    115         CharString pCount;
    116         pCount.appendInvariantChars(*pluralCount, status);
    117         const UChar *pluralName = ucurr_getPluralName(
    118             currency, locale, &unusedIsChoice, pCount.data(),
    119             &len, &status);
    120         fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
    121     }
    122     delete keywords;
    123 }
    124 
    125 void
    126 CurrencyAffixInfo::adjustPrecision(
    127         const UChar *currency, const UCurrencyUsage usage,
    128         FixedPrecision &precision, UErrorCode &status) {
    129     if (U_FAILURE(status)) {
    130         return;
    131     }
    132 
    133     int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
    134             currency, usage, &status);
    135     precision.fMin.setFracDigitCount(digitCount);
    136     precision.fMax.setFracDigitCount(digitCount);
    137     double increment = ucurr_getRoundingIncrementForUsage(
    138             currency, usage, &status);
    139     if (increment == 0.0) {
    140         precision.fRoundingIncrement.clear();
    141     } else {
    142         precision.fRoundingIncrement.set(increment);
    143         // guard against round-off error
    144         precision.fRoundingIncrement.round(6);
    145     }
    146 }
    147 
    148 void
    149 AffixPattern::addLiteral(
    150         const UChar *literal, int32_t start, int32_t len) {
    151     char32Count += u_countChar32(literal + start, len);
    152     literals.append(literal, start, len);
    153     int32_t tlen = tokens.length();
    154     // Takes 4 UChars to encode maximum literal length.
    155     UChar *tokenChars = tokens.getBuffer(tlen + 4);
    156 
    157     // find start of literal size. May be tlen if there is no literal.
    158     // While finding start of literal size, compute literal length
    159     int32_t literalLength = 0;
    160     int32_t tLiteralStart = tlen;
    161     while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
    162         tLiteralStart--;
    163         literalLength <<= 8;
    164         literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
    165     }
    166     // Add number of chars we just added to literal
    167     literalLength += len;
    168 
    169     // Now encode the new length starting at tLiteralStart
    170     tlen = tLiteralStart;
    171     tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
    172     literalLength >>= 8;
    173     while (literalLength) {
    174         tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
    175         literalLength >>= 8;
    176     }
    177     tokens.releaseBuffer(tlen);
    178 }
    179 
    180 void
    181 AffixPattern::add(ETokenType t) {
    182     add(t, 1);
    183 }
    184 
    185 void
    186 AffixPattern::addCurrency(uint8_t count) {
    187     add(kCurrency, count);
    188 }
    189 
    190 void
    191 AffixPattern::add(ETokenType t, uint8_t count) {
    192     U_ASSERT(t != kLiteral);
    193     char32Count += count;
    194     switch (t) {
    195     case kCurrency:
    196         hasCurrencyToken = TRUE;
    197         break;
    198     case kPercent:
    199         hasPercentToken = TRUE;
    200         break;
    201     case kPerMill:
    202         hasPermillToken = TRUE;
    203         break;
    204     default:
    205         // Do nothing
    206         break;
    207     }
    208     tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
    209 }
    210 
    211 AffixPattern &
    212 AffixPattern::append(const AffixPattern &other) {
    213     AffixPatternIterator iter;
    214     other.iterator(iter);
    215     UnicodeString literal;
    216     while (iter.nextToken()) {
    217         switch (iter.getTokenType()) {
    218         case kLiteral:
    219             iter.getLiteral(literal);
    220             addLiteral(literal.getBuffer(), 0, literal.length());
    221             break;
    222         case kCurrency:
    223             addCurrency(iter.getTokenLength());
    224             break;
    225         default:
    226             add(iter.getTokenType());
    227             break;
    228         }
    229     }
    230     return *this;
    231 }
    232 
    233 void
    234 AffixPattern::remove() {
    235     tokens.remove();
    236     literals.remove();
    237     hasCurrencyToken = FALSE;
    238     hasPercentToken = FALSE;
    239     hasPermillToken = FALSE;
    240     char32Count = 0;
    241 }
    242 
    243 // escapes literals for strings where special characters are NOT escaped
    244 // except for apostrophe.
    245 static void escapeApostropheInLiteral(
    246         const UnicodeString &literal, UnicodeStringAppender &appender) {
    247     int32_t len = literal.length();
    248     const UChar *buffer = literal.getBuffer();
    249     for (int32_t i = 0; i < len; ++i) {
    250         UChar ch = buffer[i];
    251         switch (ch) {
    252             case 0x27:
    253                 appender.append((UChar) 0x27);
    254                 appender.append((UChar) 0x27);
    255                 break;
    256             default:
    257                 appender.append(ch);
    258                 break;
    259         }
    260     }
    261 }
    262 
    263 
    264 // escapes literals for user strings where special characters in literals
    265 // are escaped with apostrophe.
    266 static void escapeLiteral(
    267         const UnicodeString &literal, UnicodeStringAppender &appender) {
    268     int32_t len = literal.length();
    269     const UChar *buffer = literal.getBuffer();
    270     for (int32_t i = 0; i < len; ++i) {
    271         UChar ch = buffer[i];
    272         switch (ch) {
    273             case 0x27:
    274                 appender.append((UChar) 0x27);
    275                 appender.append((UChar) 0x27);
    276                 break;
    277             case 0x25:
    278                 appender.append((UChar) 0x27);
    279                 appender.append((UChar) 0x25);
    280                 appender.append((UChar) 0x27);
    281                 break;
    282             case 0x2030:
    283                 appender.append((UChar) 0x27);
    284                 appender.append((UChar) 0x2030);
    285                 appender.append((UChar) 0x27);
    286                 break;
    287             case 0xA4:
    288                 appender.append((UChar) 0x27);
    289                 appender.append((UChar) 0xA4);
    290                 appender.append((UChar) 0x27);
    291                 break;
    292             case 0x2D:
    293                 appender.append((UChar) 0x27);
    294                 appender.append((UChar) 0x2D);
    295                 appender.append((UChar) 0x27);
    296                 break;
    297             case 0x2B:
    298                 appender.append((UChar) 0x27);
    299                 appender.append((UChar) 0x2B);
    300                 appender.append((UChar) 0x27);
    301                 break;
    302             default:
    303                 appender.append(ch);
    304                 break;
    305         }
    306     }
    307 }
    308 
    309 UnicodeString &
    310 AffixPattern::toString(UnicodeString &appendTo) const {
    311     AffixPatternIterator iter;
    312     iterator(iter);
    313     UnicodeStringAppender appender(appendTo);
    314     UnicodeString literal;
    315     while (iter.nextToken()) {
    316         switch (iter.getTokenType()) {
    317         case kLiteral:
    318             escapeApostropheInLiteral(iter.getLiteral(literal), appender);
    319             break;
    320         case kPercent:
    321             appender.append((UChar) 0x27);
    322             appender.append((UChar) 0x25);
    323             break;
    324         case kPerMill:
    325             appender.append((UChar) 0x27);
    326             appender.append((UChar) 0x2030);
    327             break;
    328         case kCurrency:
    329             {
    330                 appender.append((UChar) 0x27);
    331                 int32_t cl = iter.getTokenLength();
    332                 for (int32_t i = 0; i < cl; ++i) {
    333                     appender.append((UChar) 0xA4);
    334                 }
    335             }
    336             break;
    337         case kNegative:
    338             appender.append((UChar) 0x27);
    339             appender.append((UChar) 0x2D);
    340             break;
    341         case kPositive:
    342             appender.append((UChar) 0x27);
    343             appender.append((UChar) 0x2B);
    344             break;
    345         default:
    346             U_ASSERT(FALSE);
    347             break;
    348         }
    349     }
    350     return appendTo;
    351 }
    352 
    353 UnicodeString &
    354 AffixPattern::toUserString(UnicodeString &appendTo) const {
    355     AffixPatternIterator iter;
    356     iterator(iter);
    357     UnicodeStringAppender appender(appendTo);
    358     UnicodeString literal;
    359     while (iter.nextToken()) {
    360         switch (iter.getTokenType()) {
    361         case kLiteral:
    362             escapeLiteral(iter.getLiteral(literal), appender);
    363             break;
    364         case kPercent:
    365             appender.append((UChar) 0x25);
    366             break;
    367         case kPerMill:
    368             appender.append((UChar) 0x2030);
    369             break;
    370         case kCurrency:
    371             {
    372                 int32_t cl = iter.getTokenLength();
    373                 for (int32_t i = 0; i < cl; ++i) {
    374                     appender.append((UChar) 0xA4);
    375                 }
    376             }
    377             break;
    378         case kNegative:
    379             appender.append((UChar) 0x2D);
    380             break;
    381         case kPositive:
    382             appender.append((UChar) 0x2B);
    383             break;
    384         default:
    385             U_ASSERT(FALSE);
    386             break;
    387         }
    388     }
    389     return appendTo;
    390 }
    391 
    392 class AffixPatternAppender : public UMemory {
    393 public:
    394     AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
    395 
    396     inline void append(UChar x) {
    397         if (fIdx == UPRV_LENGTHOF(fBuffer)) {
    398             fDest->addLiteral(fBuffer, 0, fIdx);
    399             fIdx = 0;
    400         }
    401         fBuffer[fIdx++] = x;
    402     }
    403 
    404     inline void append(UChar32 x) {
    405         if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
    406             fDest->addLiteral(fBuffer, 0, fIdx);
    407             fIdx = 0;
    408         }
    409         U16_APPEND_UNSAFE(fBuffer, fIdx, x);
    410     }
    411 
    412     inline void flush() {
    413         if (fIdx) {
    414             fDest->addLiteral(fBuffer, 0, fIdx);
    415         }
    416         fIdx = 0;
    417     }
    418 
    419     /**
    420      * flush the buffer when we go out of scope.
    421      */
    422     ~AffixPatternAppender() {
    423         flush();
    424     }
    425 private:
    426     AffixPattern *fDest;
    427     int32_t fIdx;
    428     UChar fBuffer[32];
    429     AffixPatternAppender(const AffixPatternAppender &other);
    430     AffixPatternAppender &operator=(const AffixPatternAppender &other);
    431 };
    432 
    433 
    434 AffixPattern &
    435 AffixPattern::parseUserAffixString(
    436         const UnicodeString &affixStr,
    437         AffixPattern &appendTo,
    438         UErrorCode &status) {
    439     if (U_FAILURE(status)) {
    440         return appendTo;
    441     }
    442     int32_t len = affixStr.length();
    443     const UChar *buffer = affixStr.getBuffer();
    444     // 0 = not quoted; 1 = quoted.
    445     int32_t state = 0;
    446     AffixPatternAppender appender(appendTo);
    447     for (int32_t i = 0; i < len; ) {
    448         UChar token;
    449         int32_t tokenSize = nextUserToken(buffer, i, len, &token);
    450         i += tokenSize;
    451         if (token == 0x27 && tokenSize == 1) { // quote
    452             state = 1 - state;
    453             continue;
    454         }
    455         if (state == 0) {
    456             switch (token) {
    457             case 0x25:
    458                 appender.flush();
    459                 appendTo.add(kPercent, 1);
    460                 break;
    461             case 0x27:  // double quote
    462                 appender.append((UChar) 0x27);
    463                 break;
    464             case 0x2030:
    465                 appender.flush();
    466                 appendTo.add(kPerMill, 1);
    467                 break;
    468             case 0x2D:
    469                 appender.flush();
    470                 appendTo.add(kNegative, 1);
    471                 break;
    472             case 0x2B:
    473                 appender.flush();
    474                 appendTo.add(kPositive, 1);
    475                 break;
    476             case 0xA4:
    477                 appender.flush();
    478                 appendTo.add(kCurrency, tokenSize);
    479                 break;
    480             default:
    481                 appender.append(token);
    482                 break;
    483             }
    484         } else {
    485             switch (token) {
    486             case 0x27:  // double quote
    487                 appender.append((UChar) 0x27);
    488                 break;
    489             case 0xA4: // included b/c tokenSize can be > 1
    490                 for (int32_t j = 0; j < tokenSize; ++j) {
    491                     appender.append((UChar) 0xA4);
    492                 }
    493                 break;
    494             default:
    495                 appender.append(token);
    496                 break;
    497             }
    498         }
    499     }
    500     return appendTo;
    501 }
    502 
    503 AffixPattern &
    504 AffixPattern::parseAffixString(
    505         const UnicodeString &affixStr,
    506         AffixPattern &appendTo,
    507         UErrorCode &status) {
    508     if (U_FAILURE(status)) {
    509         return appendTo;
    510     }
    511     int32_t len = affixStr.length();
    512     const UChar *buffer = affixStr.getBuffer();
    513     for (int32_t i = 0; i < len; ) {
    514         UChar token;
    515         int32_t tokenSize = nextToken(buffer, i, len, &token);
    516         if (tokenSize == 1) {
    517             int32_t literalStart = i;
    518             ++i;
    519             while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
    520                 ++i;
    521             }
    522             appendTo.addLiteral(buffer, literalStart, i - literalStart);
    523 
    524             // If we reached end of string, we are done
    525             if (i == len) {
    526                 return appendTo;
    527             }
    528         }
    529         i += tokenSize;
    530         switch (token) {
    531         case 0x25:
    532             appendTo.add(kPercent, 1);
    533             break;
    534         case 0x2030:
    535             appendTo.add(kPerMill, 1);
    536             break;
    537         case 0x2D:
    538             appendTo.add(kNegative, 1);
    539             break;
    540         case 0x2B:
    541             appendTo.add(kPositive, 1);
    542             break;
    543         case 0xA4:
    544             {
    545                 if (tokenSize - 1 > 3) {
    546                     status = U_PARSE_ERROR;
    547                     return appendTo;
    548                 }
    549                 appendTo.add(kCurrency, tokenSize - 1);
    550             }
    551             break;
    552         default:
    553             appendTo.addLiteral(&token, 0, 1);
    554             break;
    555         }
    556     }
    557     return appendTo;
    558 }
    559 
    560 AffixPatternIterator &
    561 AffixPattern::iterator(AffixPatternIterator &result) const {
    562     result.nextLiteralIndex = 0;
    563     result.lastLiteralLength = 0;
    564     result.nextTokenIndex = 0;
    565     result.tokens = &tokens;
    566     result.literals = &literals;
    567     return result;
    568 }
    569 
    570 UBool
    571 AffixPatternIterator::nextToken() {
    572     int32_t tlen = tokens->length();
    573     if (nextTokenIndex == tlen) {
    574         return FALSE;
    575     }
    576     ++nextTokenIndex;
    577     const UChar *tokenBuffer = tokens->getBuffer();
    578     if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
    579             AffixPattern::kLiteral) {
    580         while (nextTokenIndex < tlen &&
    581                 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
    582             ++nextTokenIndex;
    583         }
    584         lastLiteralLength = 0;
    585         int32_t i = nextTokenIndex - 1;
    586         for (; UNPACK_LONG(tokenBuffer[i]); --i) {
    587             lastLiteralLength <<= 8;
    588             lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
    589         }
    590         lastLiteralLength <<= 8;
    591         lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
    592         nextLiteralIndex += lastLiteralLength;
    593     }
    594     return TRUE;
    595 }
    596 
    597 AffixPattern::ETokenType
    598 AffixPatternIterator::getTokenType() const {
    599     return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
    600 }
    601 
    602 UnicodeString &
    603 AffixPatternIterator::getLiteral(UnicodeString &result) const {
    604     const UChar *buffer = literals->getBuffer();
    605     result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
    606     return result;
    607 }
    608 
    609 int32_t
    610 AffixPatternIterator::getTokenLength() const {
    611     const UChar *tokenBuffer = tokens->getBuffer();
    612     AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
    613     return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
    614 }
    615 
    616 AffixPatternParser::AffixPatternParser()
    617         : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
    618 }
    619 
    620 AffixPatternParser::AffixPatternParser(
    621         const DecimalFormatSymbols &symbols) {
    622     setDecimalFormatSymbols(symbols);
    623 }
    624 
    625 void
    626 AffixPatternParser::setDecimalFormatSymbols(
    627         const DecimalFormatSymbols &symbols) {
    628     fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
    629     fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
    630     fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
    631     fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
    632 }
    633 
    634 PluralAffix &
    635 AffixPatternParser::parse(
    636         const AffixPattern &affixPattern,
    637         const CurrencyAffixInfo &currencyAffixInfo,
    638         PluralAffix &appendTo,
    639         UErrorCode &status) const {
    640     if (U_FAILURE(status)) {
    641         return appendTo;
    642     }
    643     AffixPatternIterator iter;
    644     affixPattern.iterator(iter);
    645     UnicodeString literal;
    646     while (iter.nextToken()) {
    647         switch (iter.getTokenType()) {
    648         case AffixPattern::kPercent:
    649             appendTo.append(fPercent, UNUM_PERCENT_FIELD);
    650             break;
    651         case AffixPattern::kPerMill:
    652             appendTo.append(fPermill, UNUM_PERMILL_FIELD);
    653             break;
    654         case AffixPattern::kNegative:
    655             appendTo.append(fNegative, UNUM_SIGN_FIELD);
    656             break;
    657         case AffixPattern::kPositive:
    658             appendTo.append(fPositive, UNUM_SIGN_FIELD);
    659             break;
    660         case AffixPattern::kCurrency:
    661             switch (iter.getTokenLength()) {
    662                 case 1:
    663                     appendTo.append(
    664                             currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
    665                     break;
    666                 case 2:
    667                     appendTo.append(
    668                             currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
    669                     break;
    670                 case 3:
    671                     appendTo.append(
    672                             currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
    673                     break;
    674                 default:
    675                     U_ASSERT(FALSE);
    676                     break;
    677             }
    678             break;
    679         case AffixPattern::kLiteral:
    680             appendTo.append(iter.getLiteral(literal));
    681             break;
    682         default:
    683             U_ASSERT(FALSE);
    684             break;
    685         }
    686     }
    687     return appendTo;
    688 }
    689 
    690 
    691 U_NAMESPACE_END
    692 #endif /* #if !UCONFIG_NO_FORMATTING */
    693