1 // 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 #include "number_affixutils.h" 9 #include "unicode/utf16.h" 10 #include "unicode/uniset.h" 11 12 using namespace icu; 13 using namespace icu::number; 14 using namespace icu::number::impl; 15 16 TokenConsumer::~TokenConsumer() = default; 17 SymbolProvider::~SymbolProvider() = default; 18 19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { 20 AffixPatternState state = STATE_BASE; 21 int32_t offset = 0; 22 int32_t length = 0; 23 for (; offset < patternString.length();) { 24 UChar32 cp = patternString.char32At(offset); 25 26 switch (state) { 27 case STATE_BASE: 28 if (cp == u'\'') { 29 // First quote 30 state = STATE_FIRST_QUOTE; 31 } else { 32 // Unquoted symbol 33 length++; 34 } 35 break; 36 case STATE_FIRST_QUOTE: 37 if (cp == u'\'') { 38 // Repeated quote 39 length++; 40 state = STATE_BASE; 41 } else { 42 // Quoted code point 43 length++; 44 state = STATE_INSIDE_QUOTE; 45 } 46 break; 47 case STATE_INSIDE_QUOTE: 48 if (cp == u'\'') { 49 // End of quoted sequence 50 state = STATE_AFTER_QUOTE; 51 } else { 52 // Quoted code point 53 length++; 54 } 55 break; 56 case STATE_AFTER_QUOTE: 57 if (cp == u'\'') { 58 // Double quote inside of quoted sequence 59 length++; 60 state = STATE_INSIDE_QUOTE; 61 } else { 62 // Unquoted symbol 63 length++; 64 } 65 break; 66 default: 67 U_ASSERT(false); 68 } 69 70 offset += U16_LENGTH(cp); 71 } 72 73 switch (state) { 74 case STATE_FIRST_QUOTE: 75 case STATE_INSIDE_QUOTE: 76 status = U_ILLEGAL_ARGUMENT_ERROR; 77 break; 78 default: 79 break; 80 } 81 82 return length; 83 } 84 85 UnicodeString AffixUtils::escape(const UnicodeString &input) { 86 AffixPatternState state = STATE_BASE; 87 int32_t offset = 0; 88 UnicodeString output; 89 for (; offset < input.length();) { 90 UChar32 cp = input.char32At(offset); 91 92 switch (cp) { 93 case u'\'': 94 output.append(u"''", -1); 95 break; 96 97 case u'-': 98 case u'+': 99 case u'%': 100 case u'': 101 case u'': 102 if (state == STATE_BASE) { 103 output.append(u'\''); 104 output.append(cp); 105 state = STATE_INSIDE_QUOTE; 106 } else { 107 output.append(cp); 108 } 109 break; 110 111 default: 112 if (state == STATE_INSIDE_QUOTE) { 113 output.append(u'\''); 114 output.append(cp); 115 state = STATE_BASE; 116 } else { 117 output.append(cp); 118 } 119 break; 120 } 121 offset += U16_LENGTH(cp); 122 } 123 124 if (state == STATE_INSIDE_QUOTE) { 125 output.append(u'\''); 126 } 127 128 return output; 129 } 130 131 Field AffixUtils::getFieldForType(AffixPatternType type) { 132 switch (type) { 133 case TYPE_MINUS_SIGN: 134 return Field::UNUM_SIGN_FIELD; 135 case TYPE_PLUS_SIGN: 136 return Field::UNUM_SIGN_FIELD; 137 case TYPE_PERCENT: 138 return Field::UNUM_PERCENT_FIELD; 139 case TYPE_PERMILLE: 140 return Field::UNUM_PERMILL_FIELD; 141 case TYPE_CURRENCY_SINGLE: 142 return Field::UNUM_CURRENCY_FIELD; 143 case TYPE_CURRENCY_DOUBLE: 144 return Field::UNUM_CURRENCY_FIELD; 145 case TYPE_CURRENCY_TRIPLE: 146 return Field::UNUM_CURRENCY_FIELD; 147 case TYPE_CURRENCY_QUAD: 148 return Field::UNUM_CURRENCY_FIELD; 149 case TYPE_CURRENCY_QUINT: 150 return Field::UNUM_CURRENCY_FIELD; 151 case TYPE_CURRENCY_OVERFLOW: 152 return Field::UNUM_CURRENCY_FIELD; 153 default: 154 U_ASSERT(false); 155 return Field::UNUM_FIELD_COUNT; // suppress "control reaches end of non-void function" 156 } 157 } 158 159 int32_t 160 AffixUtils::unescape(const UnicodeString &affixPattern, NumberStringBuilder &output, int32_t position, 161 const SymbolProvider &provider, UErrorCode &status) { 162 int32_t length = 0; 163 AffixTag tag; 164 while (hasNext(tag, affixPattern)) { 165 tag = nextToken(tag, affixPattern, status); 166 if (U_FAILURE(status)) { return length; } 167 if (tag.type == TYPE_CURRENCY_OVERFLOW) { 168 // Don't go to the provider for this special case 169 length += output.insertCodePoint(position + length, 0xFFFD, UNUM_CURRENCY_FIELD, status); 170 } else if (tag.type < 0) { 171 length += output.insert( 172 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); 173 } else { 174 length += output.insertCodePoint(position + length, tag.codePoint, UNUM_FIELD_COUNT, status); 175 } 176 } 177 return length; 178 } 179 180 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, 181 const SymbolProvider &provider, UErrorCode &status) { 182 int32_t length = 0; 183 AffixTag tag; 184 while (hasNext(tag, affixPattern)) { 185 tag = nextToken(tag, affixPattern, status); 186 if (U_FAILURE(status)) { return length; } 187 if (tag.type == TYPE_CURRENCY_OVERFLOW) { 188 length += 1; 189 } else if (tag.type < 0) { 190 length += provider.getSymbol(tag.type).length(); 191 } else { 192 length += U16_LENGTH(tag.codePoint); 193 } 194 } 195 return length; 196 } 197 198 bool 199 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { 200 if (affixPattern.length() == 0) { 201 return false; 202 } 203 AffixTag tag; 204 while (hasNext(tag, affixPattern)) { 205 tag = nextToken(tag, affixPattern, status); 206 if (U_FAILURE(status)) { return false; } 207 if (tag.type == type) { 208 return true; 209 } 210 } 211 return false; 212 } 213 214 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { 215 if (affixPattern.length() == 0) { 216 return false; 217 } 218 AffixTag tag; 219 while (hasNext(tag, affixPattern)) { 220 tag = nextToken(tag, affixPattern, status); 221 if (U_FAILURE(status)) { return false; } 222 if (tag.type < 0 && getFieldForType(tag.type) == UNUM_CURRENCY_FIELD) { 223 return true; 224 } 225 } 226 return false; 227 } 228 229 UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, 230 char16_t replacementChar, UErrorCode &status) { 231 UnicodeString output(affixPattern); // copy 232 if (affixPattern.length() == 0) { 233 return output; 234 }; 235 AffixTag tag; 236 while (hasNext(tag, affixPattern)) { 237 tag = nextToken(tag, affixPattern, status); 238 if (U_FAILURE(status)) { return output; } 239 if (tag.type == type) { 240 output.replace(tag.offset - 1, 1, replacementChar); 241 } 242 } 243 return output; 244 } 245 246 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, 247 const UnicodeSet& ignorables, UErrorCode& status) { 248 if (affixPattern.length() == 0) { 249 return true; 250 }; 251 AffixTag tag; 252 while (hasNext(tag, affixPattern)) { 253 tag = nextToken(tag, affixPattern, status); 254 if (U_FAILURE(status)) { return false; } 255 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { 256 return false; 257 } 258 } 259 return true; 260 } 261 262 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, 263 UErrorCode& status) { 264 if (affixPattern.length() == 0) { 265 return; 266 }; 267 AffixTag tag; 268 while (hasNext(tag, affixPattern)) { 269 tag = nextToken(tag, affixPattern, status); 270 if (U_FAILURE(status)) { return; } 271 consumer.consumeToken(tag.type, tag.codePoint, status); 272 if (U_FAILURE(status)) { return; } 273 } 274 } 275 276 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { 277 int32_t offset = tag.offset; 278 int32_t state = tag.state; 279 for (; offset < patternString.length();) { 280 UChar32 cp = patternString.char32At(offset); 281 int32_t count = U16_LENGTH(cp); 282 283 switch (state) { 284 case STATE_BASE: 285 switch (cp) { 286 case u'\'': 287 state = STATE_FIRST_QUOTE; 288 offset += count; 289 // continue to the next code point 290 break; 291 case u'-': 292 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); 293 case u'+': 294 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); 295 case u'%': 296 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); 297 case u'': 298 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); 299 case u'': 300 state = STATE_FIRST_CURR; 301 offset += count; 302 // continue to the next code point 303 break; 304 default: 305 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 306 } 307 break; 308 case STATE_FIRST_QUOTE: 309 if (cp == u'\'') { 310 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 311 } else { 312 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 313 } 314 case STATE_INSIDE_QUOTE: 315 if (cp == u'\'') { 316 state = STATE_AFTER_QUOTE; 317 offset += count; 318 // continue to the next code point 319 break; 320 } else { 321 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 322 } 323 case STATE_AFTER_QUOTE: 324 if (cp == u'\'') { 325 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 326 } else { 327 state = STATE_BASE; 328 // re-evaluate this code point 329 break; 330 } 331 case STATE_FIRST_CURR: 332 if (cp == u'') { 333 state = STATE_SECOND_CURR; 334 offset += count; 335 // continue to the next code point 336 break; 337 } else { 338 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 339 } 340 case STATE_SECOND_CURR: 341 if (cp == u'') { 342 state = STATE_THIRD_CURR; 343 offset += count; 344 // continue to the next code point 345 break; 346 } else { 347 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 348 } 349 case STATE_THIRD_CURR: 350 if (cp == u'') { 351 state = STATE_FOURTH_CURR; 352 offset += count; 353 // continue to the next code point 354 break; 355 } else { 356 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 357 } 358 case STATE_FOURTH_CURR: 359 if (cp == u'') { 360 state = STATE_FIFTH_CURR; 361 offset += count; 362 // continue to the next code point 363 break; 364 } else { 365 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 366 } 367 case STATE_FIFTH_CURR: 368 if (cp == u'') { 369 state = STATE_OVERFLOW_CURR; 370 offset += count; 371 // continue to the next code point 372 break; 373 } else { 374 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 375 } 376 case STATE_OVERFLOW_CURR: 377 if (cp == u'') { 378 offset += count; 379 // continue to the next code point and loop back to this state 380 break; 381 } else { 382 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 383 } 384 default: 385 U_ASSERT(false); 386 } 387 } 388 // End of string 389 switch (state) { 390 case STATE_BASE: 391 // No more tokens in string. 392 return {-1}; 393 case STATE_FIRST_QUOTE: 394 case STATE_INSIDE_QUOTE: 395 // For consistent behavior with the JDK and ICU 58, set an error here. 396 status = U_ILLEGAL_ARGUMENT_ERROR; 397 return {-1}; 398 case STATE_AFTER_QUOTE: 399 // No more tokens in string. 400 return {-1}; 401 case STATE_FIRST_CURR: 402 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 403 case STATE_SECOND_CURR: 404 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 405 case STATE_THIRD_CURR: 406 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 407 case STATE_FOURTH_CURR: 408 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 409 case STATE_FIFTH_CURR: 410 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 411 case STATE_OVERFLOW_CURR: 412 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 413 default: 414 U_ASSERT(false); 415 return {-1}; // suppress "control reaches end of non-void function" 416 } 417 } 418 419 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { 420 // First check for the {-1} and default initializer syntax. 421 if (tag.offset < 0) { 422 return false; 423 } else if (tag.offset == 0) { 424 return string.length() > 0; 425 } 426 // The rest of the fields are safe to use now. 427 // Special case: the last character in string is an end quote. 428 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && 429 string.charAt(tag.offset) == u'\'') { 430 return false; 431 } else if (tag.state != STATE_BASE) { 432 return true; 433 } else { 434 return tag.offset < string.length(); 435 } 436 } 437 438 #endif /* #if !UCONFIG_NO_FORMATTING */ 439