1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 * Copyright (C) 2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 * file name: affixpatternparser.cpp 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_FORMATTING 13 14 #include "unicode/dcfmtsym.h" 15 #include "unicode/plurrule.h" 16 #include "unicode/ucurr.h" 17 #include "affixpatternparser.h" 18 #include "charstr.h" 19 #include "precision.h" 20 #include "uassert.h" 21 #include "unistrappender.h" 22 23 static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4}; 24 25 static UChar gPercent = 0x25; 26 static UChar gPerMill = 0x2030; 27 static UChar gNegative = 0x2D; 28 static UChar gPositive = 0x2B; 29 30 #define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | (l & 0xFF))) 31 32 #define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F)) 33 34 #define UNPACK_LONG(c) (((c) >> 8) & 0x80) 35 36 #define UNPACK_LENGTH(c) ((c) & 0xFF) 37 38 U_NAMESPACE_BEGIN 39 40 static int32_t 41 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) { 42 if (buffer[idx] != 0x27 || idx + 1 == len) { 43 *token = buffer[idx]; 44 return 1; 45 } 46 *token = buffer[idx + 1]; 47 if (buffer[idx + 1] == 0xA4) { 48 int32_t i = 2; 49 for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i) 50 ; 51 return i; 52 } 53 return 2; 54 } 55 56 static int32_t 57 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) { 58 *token = buffer[idx]; 59 int32_t max; 60 switch (buffer[idx]) { 61 case 0x27: 62 max = 2; 63 break; 64 case 0xA4: 65 max = 3; 66 break; 67 default: 68 max = 1; 69 break; 70 } 71 int32_t i = 1; 72 for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i) 73 ; 74 return i; 75 } 76 77 CurrencyAffixInfo::CurrencyAffixInfo() 78 : fSymbol(gDefaultSymbols, 1), 79 fISO(gDefaultSymbols, 2), 80 fLong(DigitAffix(gDefaultSymbols, 3)), 81 fIsDefault(TRUE) { 82 } 83 84 void 85 CurrencyAffixInfo::set( 86 const char *locale, 87 const PluralRules *rules, 88 const UChar *currency, 89 UErrorCode &status) { 90 if (U_FAILURE(status)) { 91 return; 92 } 93 fIsDefault = FALSE; 94 if (currency == NULL) { 95 fSymbol.setTo(gDefaultSymbols, 1); 96 fISO.setTo(gDefaultSymbols, 2); 97 fLong.remove(); 98 fLong.append(gDefaultSymbols, 3); 99 fIsDefault = TRUE; 100 return; 101 } 102 int32_t len; 103 UBool unusedIsChoice; 104 const UChar *symbol = ucurr_getName( 105 currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice, 106 &len, &status); 107 if (U_FAILURE(status)) { 108 return; 109 } 110 fSymbol.setTo(symbol, len); 111 fISO.setTo(currency, u_strlen(currency)); 112 fLong.remove(); 113 StringEnumeration* keywords = rules->getKeywords(status); 114 if (U_FAILURE(status)) { 115 return; 116 } 117 const UnicodeString* pluralCount; 118 while ((pluralCount = keywords->snext(status)) != NULL) { 119 CharString pCount; 120 pCount.appendInvariantChars(*pluralCount, status); 121 const UChar *pluralName = ucurr_getPluralName( 122 currency, locale, &unusedIsChoice, pCount.data(), 123 &len, &status); 124 fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status); 125 } 126 delete keywords; 127 } 128 129 void 130 CurrencyAffixInfo::adjustPrecision( 131 const UChar *currency, const UCurrencyUsage usage, 132 FixedPrecision &precision, UErrorCode &status) { 133 if (U_FAILURE(status)) { 134 return; 135 } 136 137 int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage( 138 currency, usage, &status); 139 precision.fMin.setFracDigitCount(digitCount); 140 precision.fMax.setFracDigitCount(digitCount); 141 double increment = ucurr_getRoundingIncrementForUsage( 142 currency, usage, &status); 143 if (increment == 0.0) { 144 precision.fRoundingIncrement.clear(); 145 } else { 146 precision.fRoundingIncrement.set(increment); 147 // guard against round-off error 148 precision.fRoundingIncrement.round(6); 149 } 150 } 151 152 void 153 AffixPattern::addLiteral( 154 const UChar *literal, int32_t start, int32_t len) { 155 char32Count += u_countChar32(literal + start, len); 156 literals.append(literal, start, len); 157 int32_t tlen = tokens.length(); 158 // Takes 4 UChars to encode maximum literal length. 159 UChar *tokenChars = tokens.getBuffer(tlen + 4); 160 161 // find start of literal size. May be tlen if there is no literal. 162 // While finding start of literal size, compute literal length 163 int32_t literalLength = 0; 164 int32_t tLiteralStart = tlen; 165 while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) { 166 tLiteralStart--; 167 literalLength <<= 8; 168 literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]); 169 } 170 // Add number of chars we just added to literal 171 literalLength += len; 172 173 // Now encode the new length starting at tLiteralStart 174 tlen = tLiteralStart; 175 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF); 176 literalLength >>= 8; 177 while (literalLength) { 178 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF); 179 literalLength >>= 8; 180 } 181 tokens.releaseBuffer(tlen); 182 } 183 184 void 185 AffixPattern::add(ETokenType t) { 186 add(t, 1); 187 } 188 189 void 190 AffixPattern::addCurrency(uint8_t count) { 191 add(kCurrency, count); 192 } 193 194 void 195 AffixPattern::add(ETokenType t, uint8_t count) { 196 U_ASSERT(t != kLiteral); 197 char32Count += count; 198 switch (t) { 199 case kCurrency: 200 hasCurrencyToken = TRUE; 201 break; 202 case kPercent: 203 hasPercentToken = TRUE; 204 break; 205 case kPerMill: 206 hasPermillToken = TRUE; 207 break; 208 default: 209 // Do nothing 210 break; 211 } 212 tokens.append(PACK_TOKEN_AND_LENGTH(t, count)); 213 } 214 215 AffixPattern & 216 AffixPattern::append(const AffixPattern &other) { 217 AffixPatternIterator iter; 218 other.iterator(iter); 219 UnicodeString literal; 220 while (iter.nextToken()) { 221 switch (iter.getTokenType()) { 222 case kLiteral: 223 iter.getLiteral(literal); 224 addLiteral(literal.getBuffer(), 0, literal.length()); 225 break; 226 case kCurrency: 227 addCurrency(iter.getTokenLength()); 228 break; 229 default: 230 add(iter.getTokenType()); 231 break; 232 } 233 } 234 return *this; 235 } 236 237 void 238 AffixPattern::remove() { 239 tokens.remove(); 240 literals.remove(); 241 hasCurrencyToken = FALSE; 242 hasPercentToken = FALSE; 243 hasPermillToken = FALSE; 244 char32Count = 0; 245 } 246 247 // escapes literals for strings where special characters are NOT escaped 248 // except for apostrophe. 249 static void escapeApostropheInLiteral( 250 const UnicodeString &literal, UnicodeStringAppender &appender) { 251 int32_t len = literal.length(); 252 const UChar *buffer = literal.getBuffer(); 253 for (int32_t i = 0; i < len; ++i) { 254 UChar ch = buffer[i]; 255 switch (ch) { 256 case 0x27: 257 appender.append((UChar) 0x27); 258 appender.append((UChar) 0x27); 259 break; 260 default: 261 appender.append(ch); 262 break; 263 } 264 } 265 } 266 267 268 // escapes literals for user strings where special characters in literals 269 // are escaped with apostrophe. 270 static void escapeLiteral( 271 const UnicodeString &literal, UnicodeStringAppender &appender) { 272 int32_t len = literal.length(); 273 const UChar *buffer = literal.getBuffer(); 274 for (int32_t i = 0; i < len; ++i) { 275 UChar ch = buffer[i]; 276 switch (ch) { 277 case 0x27: 278 appender.append((UChar) 0x27); 279 appender.append((UChar) 0x27); 280 break; 281 case 0x25: 282 appender.append((UChar) 0x27); 283 appender.append((UChar) 0x25); 284 appender.append((UChar) 0x27); 285 break; 286 case 0x2030: 287 appender.append((UChar) 0x27); 288 appender.append((UChar) 0x2030); 289 appender.append((UChar) 0x27); 290 break; 291 case 0xA4: 292 appender.append((UChar) 0x27); 293 appender.append((UChar) 0xA4); 294 appender.append((UChar) 0x27); 295 break; 296 case 0x2D: 297 appender.append((UChar) 0x27); 298 appender.append((UChar) 0x2D); 299 appender.append((UChar) 0x27); 300 break; 301 case 0x2B: 302 appender.append((UChar) 0x27); 303 appender.append((UChar) 0x2B); 304 appender.append((UChar) 0x27); 305 break; 306 default: 307 appender.append(ch); 308 break; 309 } 310 } 311 } 312 313 UnicodeString & 314 AffixPattern::toString(UnicodeString &appendTo) const { 315 AffixPatternIterator iter; 316 iterator(iter); 317 UnicodeStringAppender appender(appendTo); 318 UnicodeString literal; 319 while (iter.nextToken()) { 320 switch (iter.getTokenType()) { 321 case kLiteral: 322 escapeApostropheInLiteral(iter.getLiteral(literal), appender); 323 break; 324 case kPercent: 325 appender.append((UChar) 0x27); 326 appender.append((UChar) 0x25); 327 break; 328 case kPerMill: 329 appender.append((UChar) 0x27); 330 appender.append((UChar) 0x2030); 331 break; 332 case kCurrency: 333 { 334 appender.append((UChar) 0x27); 335 int32_t cl = iter.getTokenLength(); 336 for (int32_t i = 0; i < cl; ++i) { 337 appender.append((UChar) 0xA4); 338 } 339 } 340 break; 341 case kNegative: 342 appender.append((UChar) 0x27); 343 appender.append((UChar) 0x2D); 344 break; 345 case kPositive: 346 appender.append((UChar) 0x27); 347 appender.append((UChar) 0x2B); 348 break; 349 default: 350 U_ASSERT(FALSE); 351 break; 352 } 353 } 354 return appendTo; 355 } 356 357 UnicodeString & 358 AffixPattern::toUserString(UnicodeString &appendTo) const { 359 AffixPatternIterator iter; 360 iterator(iter); 361 UnicodeStringAppender appender(appendTo); 362 UnicodeString literal; 363 while (iter.nextToken()) { 364 switch (iter.getTokenType()) { 365 case kLiteral: 366 escapeLiteral(iter.getLiteral(literal), appender); 367 break; 368 case kPercent: 369 appender.append((UChar) 0x25); 370 break; 371 case kPerMill: 372 appender.append((UChar) 0x2030); 373 break; 374 case kCurrency: 375 { 376 int32_t cl = iter.getTokenLength(); 377 for (int32_t i = 0; i < cl; ++i) { 378 appender.append((UChar) 0xA4); 379 } 380 } 381 break; 382 case kNegative: 383 appender.append((UChar) 0x2D); 384 break; 385 case kPositive: 386 appender.append((UChar) 0x2B); 387 break; 388 default: 389 U_ASSERT(FALSE); 390 break; 391 } 392 } 393 return appendTo; 394 } 395 396 class AffixPatternAppender : public UMemory { 397 public: 398 AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { } 399 400 inline void append(UChar x) { 401 if (fIdx == UPRV_LENGTHOF(fBuffer)) { 402 fDest->addLiteral(fBuffer, 0, fIdx); 403 fIdx = 0; 404 } 405 fBuffer[fIdx++] = x; 406 } 407 408 inline void append(UChar32 x) { 409 if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) { 410 fDest->addLiteral(fBuffer, 0, fIdx); 411 fIdx = 0; 412 } 413 U16_APPEND_UNSAFE(fBuffer, fIdx, x); 414 } 415 416 inline void flush() { 417 if (fIdx) { 418 fDest->addLiteral(fBuffer, 0, fIdx); 419 } 420 fIdx = 0; 421 } 422 423 /** 424 * flush the buffer when we go out of scope. 425 */ 426 ~AffixPatternAppender() { 427 flush(); 428 } 429 private: 430 AffixPattern *fDest; 431 int32_t fIdx; 432 UChar fBuffer[32]; 433 AffixPatternAppender(const AffixPatternAppender &other); 434 AffixPatternAppender &operator=(const AffixPatternAppender &other); 435 }; 436 437 438 AffixPattern & 439 AffixPattern::parseUserAffixString( 440 const UnicodeString &affixStr, 441 AffixPattern &appendTo, 442 UErrorCode &status) { 443 if (U_FAILURE(status)) { 444 return appendTo; 445 } 446 int32_t len = affixStr.length(); 447 const UChar *buffer = affixStr.getBuffer(); 448 // 0 = not quoted; 1 = quoted. 449 int32_t state = 0; 450 AffixPatternAppender appender(appendTo); 451 for (int32_t i = 0; i < len; ) { 452 UChar token; 453 int32_t tokenSize = nextUserToken(buffer, i, len, &token); 454 i += tokenSize; 455 if (token == 0x27 && tokenSize == 1) { // quote 456 state = 1 - state; 457 continue; 458 } 459 if (state == 0) { 460 switch (token) { 461 case 0x25: 462 appender.flush(); 463 appendTo.add(kPercent, 1); 464 break; 465 case 0x27: // double quote 466 appender.append((UChar) 0x27); 467 break; 468 case 0x2030: 469 appender.flush(); 470 appendTo.add(kPerMill, 1); 471 break; 472 case 0x2D: 473 appender.flush(); 474 appendTo.add(kNegative, 1); 475 break; 476 case 0x2B: 477 appender.flush(); 478 appendTo.add(kPositive, 1); 479 break; 480 case 0xA4: 481 appender.flush(); 482 appendTo.add(kCurrency, tokenSize); 483 break; 484 default: 485 appender.append(token); 486 break; 487 } 488 } else { 489 switch (token) { 490 case 0x27: // double quote 491 appender.append((UChar) 0x27); 492 break; 493 case 0xA4: // included b/c tokenSize can be > 1 494 for (int32_t j = 0; j < tokenSize; ++j) { 495 appender.append((UChar) 0xA4); 496 } 497 break; 498 default: 499 appender.append(token); 500 break; 501 } 502 } 503 } 504 return appendTo; 505 } 506 507 AffixPattern & 508 AffixPattern::parseAffixString( 509 const UnicodeString &affixStr, 510 AffixPattern &appendTo, 511 UErrorCode &status) { 512 if (U_FAILURE(status)) { 513 return appendTo; 514 } 515 int32_t len = affixStr.length(); 516 const UChar *buffer = affixStr.getBuffer(); 517 for (int32_t i = 0; i < len; ) { 518 UChar token; 519 int32_t tokenSize = nextToken(buffer, i, len, &token); 520 if (tokenSize == 1) { 521 int32_t literalStart = i; 522 ++i; 523 while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) { 524 ++i; 525 } 526 appendTo.addLiteral(buffer, literalStart, i - literalStart); 527 528 // If we reached end of string, we are done 529 if (i == len) { 530 return appendTo; 531 } 532 } 533 i += tokenSize; 534 switch (token) { 535 case 0x25: 536 appendTo.add(kPercent, 1); 537 break; 538 case 0x2030: 539 appendTo.add(kPerMill, 1); 540 break; 541 case 0x2D: 542 appendTo.add(kNegative, 1); 543 break; 544 case 0x2B: 545 appendTo.add(kPositive, 1); 546 break; 547 case 0xA4: 548 { 549 if (tokenSize - 1 > 3) { 550 status = U_PARSE_ERROR; 551 return appendTo; 552 } 553 appendTo.add(kCurrency, tokenSize - 1); 554 } 555 break; 556 default: 557 appendTo.addLiteral(&token, 0, 1); 558 break; 559 } 560 } 561 return appendTo; 562 } 563 564 AffixPatternIterator & 565 AffixPattern::iterator(AffixPatternIterator &result) const { 566 result.nextLiteralIndex = 0; 567 result.lastLiteralLength = 0; 568 result.nextTokenIndex = 0; 569 result.tokens = &tokens; 570 result.literals = &literals; 571 return result; 572 } 573 574 UBool 575 AffixPatternIterator::nextToken() { 576 int32_t tlen = tokens->length(); 577 if (nextTokenIndex == tlen) { 578 return FALSE; 579 } 580 ++nextTokenIndex; 581 const UChar *tokenBuffer = tokens->getBuffer(); 582 if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) == 583 AffixPattern::kLiteral) { 584 while (nextTokenIndex < tlen && 585 UNPACK_LONG(tokenBuffer[nextTokenIndex])) { 586 ++nextTokenIndex; 587 } 588 lastLiteralLength = 0; 589 int32_t i = nextTokenIndex - 1; 590 for (; UNPACK_LONG(tokenBuffer[i]); --i) { 591 lastLiteralLength <<= 8; 592 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]); 593 } 594 lastLiteralLength <<= 8; 595 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]); 596 nextLiteralIndex += lastLiteralLength; 597 } 598 return TRUE; 599 } 600 601 AffixPattern::ETokenType 602 AffixPatternIterator::getTokenType() const { 603 return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1)); 604 } 605 606 UnicodeString & 607 AffixPatternIterator::getLiteral(UnicodeString &result) const { 608 const UChar *buffer = literals->getBuffer(); 609 result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength); 610 return result; 611 } 612 613 int32_t 614 AffixPatternIterator::getTokenLength() const { 615 const UChar *tokenBuffer = tokens->getBuffer(); 616 AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]); 617 return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]); 618 } 619 620 AffixPatternParser::AffixPatternParser() 621 : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) { 622 } 623 624 AffixPatternParser::AffixPatternParser( 625 const DecimalFormatSymbols &symbols) { 626 setDecimalFormatSymbols(symbols); 627 } 628 629 void 630 AffixPatternParser::setDecimalFormatSymbols( 631 const DecimalFormatSymbols &symbols) { 632 fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol); 633 fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol); 634 fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol); 635 fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol); 636 } 637 638 PluralAffix & 639 AffixPatternParser::parse( 640 const AffixPattern &affixPattern, 641 const CurrencyAffixInfo ¤cyAffixInfo, 642 PluralAffix &appendTo, 643 UErrorCode &status) const { 644 if (U_FAILURE(status)) { 645 return appendTo; 646 } 647 AffixPatternIterator iter; 648 affixPattern.iterator(iter); 649 UnicodeString literal; 650 while (iter.nextToken()) { 651 switch (iter.getTokenType()) { 652 case AffixPattern::kPercent: 653 appendTo.append(fPercent, UNUM_PERCENT_FIELD); 654 break; 655 case AffixPattern::kPerMill: 656 appendTo.append(fPermill, UNUM_PERMILL_FIELD); 657 break; 658 case AffixPattern::kNegative: 659 appendTo.append(fNegative, UNUM_SIGN_FIELD); 660 break; 661 case AffixPattern::kPositive: 662 appendTo.append(fPositive, UNUM_SIGN_FIELD); 663 break; 664 case AffixPattern::kCurrency: 665 switch (iter.getTokenLength()) { 666 case 1: 667 appendTo.append( 668 currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD); 669 break; 670 case 2: 671 appendTo.append( 672 currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD); 673 break; 674 case 3: 675 appendTo.append( 676 currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status); 677 break; 678 default: 679 U_ASSERT(FALSE); 680 break; 681 } 682 break; 683 case AffixPattern::kLiteral: 684 appendTo.append(iter.getLiteral(literal)); 685 break; 686 default: 687 U_ASSERT(FALSE); 688 break; 689 } 690 } 691 return appendTo; 692 } 693 694 695 U_NAMESPACE_END 696 #endif /* #if !UCONFIG_NO_FORMATTING */ 697