1 /* 2 ****************************************************************************** 3 * Copyright (C) 1997-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 * file name: nfrule.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 10/11/2001 Doug Ported from ICU4J 14 */ 15 16 #include "nfrule.h" 17 18 #if U_HAVE_RBNF 19 20 #include "unicode/localpointer.h" 21 #include "unicode/rbnf.h" 22 #include "unicode/tblcoll.h" 23 #include "unicode/plurfmt.h" 24 #include "unicode/upluralrules.h" 25 #include "unicode/coleitr.h" 26 #include "unicode/uchar.h" 27 #include "nfrs.h" 28 #include "nfrlist.h" 29 #include "nfsubs.h" 30 #include "patternprops.h" 31 32 U_NAMESPACE_BEGIN 33 34 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) 35 : baseValue((int32_t)0) 36 , radix(0) 37 , exponent(0) 38 , ruleText() 39 , sub1(NULL) 40 , sub2(NULL) 41 , formatter(_rbnf) 42 , rulePatternFormat(NULL) 43 { 44 } 45 46 NFRule::~NFRule() 47 { 48 if (sub1 != sub2) { 49 delete sub2; 50 } 51 delete sub1; 52 delete rulePatternFormat; 53 } 54 55 static const UChar gLeftBracket = 0x005b; 56 static const UChar gRightBracket = 0x005d; 57 static const UChar gColon = 0x003a; 58 static const UChar gZero = 0x0030; 59 static const UChar gNine = 0x0039; 60 static const UChar gSpace = 0x0020; 61 static const UChar gSlash = 0x002f; 62 static const UChar gGreaterThan = 0x003e; 63 static const UChar gLessThan = 0x003c; 64 static const UChar gComma = 0x002c; 65 static const UChar gDot = 0x002e; 66 static const UChar gTick = 0x0027; 67 //static const UChar gMinus = 0x002d; 68 static const UChar gSemicolon = 0x003b; 69 70 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ 71 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ 72 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ 73 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ 74 75 static const UChar gDollarOpenParenthesis[] = {0x24, 0x28, 0}; /* "$(" */ 76 static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */ 77 78 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ 79 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ 80 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ 81 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ 82 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ 83 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ 84 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ 85 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ 86 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ 87 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ 88 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ 89 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ 90 91 static const UChar * const tokenStrings[] = { 92 gLessLess, gLessPercent, gLessHash, gLessZero, 93 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, 94 gEqualPercent, gEqualHash, gEqualZero, NULL 95 }; 96 97 void 98 NFRule::makeRules(UnicodeString& description, 99 const NFRuleSet *ruleSet, 100 const NFRule *predecessor, 101 const RuleBasedNumberFormat *rbnf, 102 NFRuleList& rules, 103 UErrorCode& status) 104 { 105 // we know we're making at least one rule, so go ahead and 106 // new it up and initialize its basevalue and divisor 107 // (this also strips the rule descriptor, if any, off the 108 // descripton string) 109 NFRule* rule1 = new NFRule(rbnf); 110 /* test for NULL */ 111 if (rule1 == 0) { 112 status = U_MEMORY_ALLOCATION_ERROR; 113 return; 114 } 115 rule1->parseRuleDescriptor(description, status); 116 117 // check the description to see whether there's text enclosed 118 // in brackets 119 int32_t brack1 = description.indexOf(gLeftBracket); 120 int32_t brack2 = description.indexOf(gRightBracket); 121 122 // if the description doesn't contain a matched pair of brackets, 123 // or if it's of a type that doesn't recognize bracketed text, 124 // then leave the description alone, initialize the rule's 125 // rule text and substitutions, and return that rule 126 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 127 || rule1->getType() == kProperFractionRule 128 || rule1->getType() == kNegativeNumberRule) { 129 rule1->extractSubstitutions(ruleSet, description, predecessor, status); 130 rules.add(rule1); 131 } else { 132 // if the description does contain a matched pair of brackets, 133 // then it's really shorthand for two rules (with one exception) 134 NFRule* rule2 = NULL; 135 UnicodeString sbuf; 136 137 // we'll actually only split the rule into two rules if its 138 // base value is an even multiple of its divisor (or it's one 139 // of the special rules) 140 if ((rule1->baseValue > 0 141 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) 142 || rule1->getType() == kImproperFractionRule 143 || rule1->getType() == kMasterRule) { 144 145 // if it passes that test, new up the second rule. If the 146 // rule set both rules will belong to is a fraction rule 147 // set, they both have the same base value; otherwise, 148 // increment the original rule's base value ("rule1" actually 149 // goes SECOND in the rule set's rule list) 150 rule2 = new NFRule(rbnf); 151 /* test for NULL */ 152 if (rule2 == 0) { 153 status = U_MEMORY_ALLOCATION_ERROR; 154 return; 155 } 156 if (rule1->baseValue >= 0) { 157 rule2->baseValue = rule1->baseValue; 158 if (!ruleSet->isFractionRuleSet()) { 159 ++rule1->baseValue; 160 } 161 } 162 163 // if the description began with "x.x" and contains bracketed 164 // text, it describes both the improper fraction rule and 165 // the proper fraction rule 166 else if (rule1->getType() == kImproperFractionRule) { 167 rule2->setType(kProperFractionRule); 168 } 169 170 // if the description began with "x.0" and contains bracketed 171 // text, it describes both the master rule and the 172 // improper fraction rule 173 else if (rule1->getType() == kMasterRule) { 174 rule2->baseValue = rule1->baseValue; 175 rule1->setType(kImproperFractionRule); 176 } 177 178 // both rules have the same radix and exponent (i.e., the 179 // same divisor) 180 rule2->radix = rule1->radix; 181 rule2->exponent = rule1->exponent; 182 183 // rule2's rule text omits the stuff in brackets: initalize 184 // its rule text and substitutions accordingly 185 sbuf.append(description, 0, brack1); 186 if (brack2 + 1 < description.length()) { 187 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 188 } 189 rule2->extractSubstitutions(ruleSet, sbuf, predecessor, status); 190 } 191 192 // rule1's text includes the text in the brackets but omits 193 // the brackets themselves: initialize _its_ rule text and 194 // substitutions accordingly 195 sbuf.setTo(description, 0, brack1); 196 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); 197 if (brack2 + 1 < description.length()) { 198 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 199 } 200 rule1->extractSubstitutions(ruleSet, sbuf, predecessor, status); 201 202 // if we only have one rule, return it; if we have two, return 203 // a two-element array containing them (notice that rule2 goes 204 // BEFORE rule1 in the list: in all cases, rule2 OMITS the 205 // material in the brackets and rule1 INCLUDES the material 206 // in the brackets) 207 if (rule2 != NULL) { 208 rules.add(rule2); 209 } 210 rules.add(rule1); 211 } 212 } 213 214 /** 215 * This function parses the rule's rule descriptor (i.e., the base 216 * value and/or other tokens that precede the rule's rule text 217 * in the description) and sets the rule's base value, radix, and 218 * exponent according to the descriptor. (If the description doesn't 219 * include a rule descriptor, then this function sets everything to 220 * default values and the rule set sets the rule's real base value). 221 * @param description The rule's description 222 * @return If "description" included a rule descriptor, this is 223 * "description" with the descriptor and any trailing whitespace 224 * stripped off. Otherwise; it's "descriptor" unchangd. 225 */ 226 void 227 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) 228 { 229 // the description consists of a rule descriptor and a rule body, 230 // separated by a colon. The rule descriptor is optional. If 231 // it's omitted, just set the base value to 0. 232 int32_t p = description.indexOf(gColon); 233 if (p == -1) { 234 setBaseValue((int32_t)0, status); 235 } else { 236 // copy the descriptor out into its own string and strip it, 237 // along with any trailing whitespace, out of the original 238 // description 239 UnicodeString descriptor; 240 descriptor.setTo(description, 0, p); 241 242 ++p; 243 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { 244 ++p; 245 } 246 description.removeBetween(0, p); 247 248 // check first to see if the rule descriptor matches the token 249 // for one of the special rules. If it does, set the base 250 // value to the correct identfier value 251 if (0 == descriptor.compare(gMinusX, 2)) { 252 setType(kNegativeNumberRule); 253 } 254 else if (0 == descriptor.compare(gXDotX, 3)) { 255 setType(kImproperFractionRule); 256 } 257 else if (0 == descriptor.compare(gZeroDotX, 3)) { 258 setType(kProperFractionRule); 259 } 260 else if (0 == descriptor.compare(gXDotZero, 3)) { 261 setType(kMasterRule); 262 } 263 264 // if the rule descriptor begins with a digit, it's a descriptor 265 // for a normal rule 266 // since we don't have Long.parseLong, and this isn't much work anyway, 267 // just build up the value as we encounter the digits. 268 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { 269 int64_t val = 0; 270 p = 0; 271 UChar c = gSpace; 272 273 // begin parsing the descriptor: copy digits 274 // into "tempValue", skip periods, commas, and spaces, 275 // stop on a slash or > sign (or at the end of the string), 276 // and throw an exception on any other character 277 int64_t ll_10 = 10; 278 while (p < descriptor.length()) { 279 c = descriptor.charAt(p); 280 if (c >= gZero && c <= gNine) { 281 val = val * ll_10 + (int32_t)(c - gZero); 282 } 283 else if (c == gSlash || c == gGreaterThan) { 284 break; 285 } 286 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 287 } 288 else { 289 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 290 status = U_PARSE_ERROR; 291 return; 292 } 293 ++p; 294 } 295 296 // we have the base value, so set it 297 setBaseValue(val, status); 298 299 // if we stopped the previous loop on a slash, we're 300 // now parsing the rule's radix. Again, accumulate digits 301 // in tempValue, skip punctuation, stop on a > mark, and 302 // throw an exception on anything else 303 if (c == gSlash) { 304 val = 0; 305 ++p; 306 int64_t ll_10 = 10; 307 while (p < descriptor.length()) { 308 c = descriptor.charAt(p); 309 if (c >= gZero && c <= gNine) { 310 val = val * ll_10 + (int32_t)(c - gZero); 311 } 312 else if (c == gGreaterThan) { 313 break; 314 } 315 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 316 } 317 else { 318 // throw new IllegalArgumentException("Illegal character is rule descriptor"); 319 status = U_PARSE_ERROR; 320 return; 321 } 322 ++p; 323 } 324 325 // tempValue now contain's the rule's radix. Set it 326 // accordingly, and recalculate the rule's exponent 327 radix = (int32_t)val; 328 if (radix == 0) { 329 // throw new IllegalArgumentException("Rule can't have radix of 0"); 330 status = U_PARSE_ERROR; 331 } 332 333 exponent = expectedExponent(); 334 } 335 336 // if we stopped the previous loop on a > sign, then continue 337 // for as long as we still see > signs. For each one, 338 // decrement the exponent (unless the exponent is already 0). 339 // If we see another character before reaching the end of 340 // the descriptor, that's also a syntax error. 341 if (c == gGreaterThan) { 342 while (p < descriptor.length()) { 343 c = descriptor.charAt(p); 344 if (c == gGreaterThan && exponent > 0) { 345 --exponent; 346 } else { 347 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 348 status = U_PARSE_ERROR; 349 return; 350 } 351 ++p; 352 } 353 } 354 } 355 } 356 357 // finally, if the rule body begins with an apostrophe, strip it off 358 // (this is generally used to put whitespace at the beginning of 359 // a rule's rule text) 360 if (description.length() > 0 && description.charAt(0) == gTick) { 361 description.removeBetween(0, 1); 362 } 363 364 // return the description with all the stuff we've just waded through 365 // stripped off the front. It now contains just the rule body. 366 // return description; 367 } 368 369 /** 370 * Searches the rule's rule text for the substitution tokens, 371 * creates the substitutions, and removes the substitution tokens 372 * from the rule's rule text. 373 * @param owner The rule set containing this rule 374 * @param predecessor The rule preseding this one in "owners" rule list 375 * @param ownersOwner The RuleBasedFormat that owns this rule 376 */ 377 void 378 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, 379 const UnicodeString &ruleText, 380 const NFRule* predecessor, 381 UErrorCode& status) 382 { 383 if (U_FAILURE(status)) { 384 return; 385 } 386 this->ruleText = ruleText; 387 this->rulePatternFormat = NULL; 388 sub1 = extractSubstitution(ruleSet, predecessor, status); 389 if (sub1 == NULL || sub1->isNullSubstitution()) { 390 // Small optimization. There is no need to create a redundant NullSubstitution. 391 sub2 = sub1; 392 } 393 else { 394 sub2 = extractSubstitution(ruleSet, predecessor, status); 395 } 396 int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 397 int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1); 398 if (pluralRuleEnd >= 0) { 399 int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart); 400 if (endType < 0) { 401 status = U_PARSE_ERROR; 402 return; 403 } 404 UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2)); 405 UPluralType pluralType; 406 if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) { 407 pluralType = UPLURAL_TYPE_CARDINAL; 408 } 409 else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) { 410 pluralType = UPLURAL_TYPE_ORDINAL; 411 } 412 else { 413 status = U_ILLEGAL_ARGUMENT_ERROR; 414 return; 415 } 416 rulePatternFormat = formatter->createPluralFormat(pluralType, 417 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status); 418 } 419 } 420 421 /** 422 * Searches the rule's rule text for the first substitution token, 423 * creates a substitution based on it, and removes the token from 424 * the rule's rule text. 425 * @param owner The rule set containing this rule 426 * @param predecessor The rule preceding this one in the rule set's 427 * rule list 428 * @param ownersOwner The RuleBasedNumberFormat that owns this rule 429 * @return The newly-created substitution. This is never null; if 430 * the rule text doesn't contain any substitution tokens, this will 431 * be a NullSubstitution. 432 */ 433 NFSubstitution * 434 NFRule::extractSubstitution(const NFRuleSet* ruleSet, 435 const NFRule* predecessor, 436 UErrorCode& status) 437 { 438 NFSubstitution* result = NULL; 439 440 // search the rule's rule text for the first two characters of 441 // a substitution token 442 int32_t subStart = indexOfAny(tokenStrings); 443 int32_t subEnd = subStart; 444 445 // if we didn't find one, create a null substitution positioned 446 // at the end of the rule text 447 if (subStart == -1) { 448 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 449 ruleSet, this->formatter, UnicodeString(), status); 450 } 451 452 // special-case the ">>>" token, since searching for the > at the 453 // end will actually find the > in the middle 454 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { 455 subEnd = subStart + 2; 456 457 // otherwise the substitution token ends with the same character 458 // it began with 459 } else { 460 UChar c = ruleText.charAt(subStart); 461 subEnd = ruleText.indexOf(c, subStart + 1); 462 // special case for '<%foo<<' 463 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { 464 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle 465 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack 466 // to get around this. Having the duplicate at the front would cause problems with 467 // rules like "<<%" to format, say, percents... 468 ++subEnd; 469 } 470 } 471 472 // if we don't find the end of the token (i.e., if we're on a single, 473 // unmatched token character), create a null substitution positioned 474 // at the end of the rule 475 if (subEnd == -1) { 476 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 477 ruleSet, this->formatter, UnicodeString(), status); 478 } 479 480 // if we get here, we have a real substitution token (or at least 481 // some text bounded by substitution token characters). Use 482 // makeSubstitution() to create the right kind of substitution 483 UnicodeString subToken; 484 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); 485 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, 486 this->formatter, subToken, status); 487 488 // remove the substitution from the rule text 489 ruleText.removeBetween(subStart, subEnd+1); 490 491 return result; 492 } 493 494 /** 495 * Sets the rule's base value, and causes the radix and exponent 496 * to be recalculated. This is used during construction when we 497 * don't know the rule's base value until after it's been 498 * constructed. It should be used at any other time. 499 * @param The new base value for the rule. 500 */ 501 void 502 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) 503 { 504 // set the base value 505 baseValue = newBaseValue; 506 507 // if this isn't a special rule, recalculate the radix and exponent 508 // (the radix always defaults to 10; if it's supposed to be something 509 // else, it's cleaned up by the caller and the exponent is 510 // recalculated again-- the only function that does this is 511 // NFRule.parseRuleDescriptor() ) 512 if (baseValue >= 1) { 513 radix = 10; 514 exponent = expectedExponent(); 515 516 // this function gets called on a fully-constructed rule whose 517 // description didn't specify a base value. This means it 518 // has substitutions, and some substitutions hold on to copies 519 // of the rule's divisor. Fix their copies of the divisor. 520 if (sub1 != NULL) { 521 sub1->setDivisor(radix, exponent, status); 522 } 523 if (sub2 != NULL) { 524 sub2->setDivisor(radix, exponent, status); 525 } 526 527 // if this is a special rule, its radix and exponent are basically 528 // ignored. Set them to "safe" default values 529 } else { 530 radix = 10; 531 exponent = 0; 532 } 533 } 534 535 /** 536 * This calculates the rule's exponent based on its radix and base 537 * value. This will be the highest power the radix can be raised to 538 * and still produce a result less than or equal to the base value. 539 */ 540 int16_t 541 NFRule::expectedExponent() const 542 { 543 // since the log of 0, or the log base 0 of something, causes an 544 // error, declare the exponent in these cases to be 0 (we also 545 // deal with the special-rule identifiers here) 546 if (radix == 0 || baseValue < 1) { 547 return 0; 548 } 549 550 // we get rounding error in some cases-- for example, log 1000 / log 10 551 // gives us 1.9999999996 instead of 2. The extra logic here is to take 552 // that into account 553 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); 554 int64_t temp = util64_pow(radix, tempResult + 1); 555 if (temp <= baseValue) { 556 tempResult += 1; 557 } 558 return tempResult; 559 } 560 561 /** 562 * Searches the rule's rule text for any of the specified strings. 563 * @param strings An array of strings to search the rule's rule 564 * text for 565 * @return The index of the first match in the rule's rule text 566 * (i.e., the first substring in the rule's rule text that matches 567 * _any_ of the strings in "strings"). If none of the strings in 568 * "strings" is found in the rule's rule text, returns -1. 569 */ 570 int32_t 571 NFRule::indexOfAny(const UChar* const strings[]) const 572 { 573 int result = -1; 574 for (int i = 0; strings[i]; i++) { 575 int32_t pos = ruleText.indexOf(*strings[i]); 576 if (pos != -1 && (result == -1 || pos < result)) { 577 result = pos; 578 } 579 } 580 return result; 581 } 582 583 //----------------------------------------------------------------------- 584 // boilerplate 585 //----------------------------------------------------------------------- 586 587 /** 588 * Tests two rules for equality. 589 * @param that The rule to compare this one against 590 * @return True is the two rules are functionally equivalent 591 */ 592 UBool 593 NFRule::operator==(const NFRule& rhs) const 594 { 595 return baseValue == rhs.baseValue 596 && radix == rhs.radix 597 && exponent == rhs.exponent 598 && ruleText == rhs.ruleText 599 && *sub1 == *rhs.sub1 600 && *sub2 == *rhs.sub2; 601 } 602 603 /** 604 * Returns a textual representation of the rule. This won't 605 * necessarily be the same as the description that this rule 606 * was created with, but it will produce the same result. 607 * @return A textual description of the rule 608 */ 609 static void util_append64(UnicodeString& result, int64_t n) 610 { 611 UChar buffer[256]; 612 int32_t len = util64_tou(n, buffer, sizeof(buffer)); 613 UnicodeString temp(buffer, len); 614 result.append(temp); 615 } 616 617 void 618 NFRule::_appendRuleText(UnicodeString& result) const 619 { 620 switch (getType()) { 621 case kNegativeNumberRule: result.append(gMinusX, 2); break; 622 case kImproperFractionRule: result.append(gXDotX, 3); break; 623 case kProperFractionRule: result.append(gZeroDotX, 3); break; 624 case kMasterRule: result.append(gXDotZero, 3); break; 625 default: 626 // for a normal rule, write out its base value, and if the radix is 627 // something other than 10, write out the radix (with the preceding 628 // slash, of course). Then calculate the expected exponent and if 629 // if isn't the same as the actual exponent, write an appropriate 630 // number of > signs. Finally, terminate the whole thing with 631 // a colon. 632 util_append64(result, baseValue); 633 if (radix != 10) { 634 result.append(gSlash); 635 util_append64(result, radix); 636 } 637 int numCarets = expectedExponent() - exponent; 638 for (int i = 0; i < numCarets; i++) { 639 result.append(gGreaterThan); 640 } 641 break; 642 } 643 result.append(gColon); 644 result.append(gSpace); 645 646 // if the rule text begins with a space, write an apostrophe 647 // (whitespace after the rule descriptor is ignored; the 648 // apostrophe is used to make the whitespace significant) 649 if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) { 650 result.append(gTick); 651 } 652 653 // now, write the rule's rule text, inserting appropriate 654 // substitution tokens in the appropriate places 655 UnicodeString ruleTextCopy; 656 ruleTextCopy.setTo(ruleText); 657 658 UnicodeString temp; 659 sub2->toString(temp); 660 ruleTextCopy.insert(sub2->getPos(), temp); 661 sub1->toString(temp); 662 ruleTextCopy.insert(sub1->getPos(), temp); 663 664 result.append(ruleTextCopy); 665 666 // and finally, top the whole thing off with a semicolon and 667 // return the result 668 result.append(gSemicolon); 669 } 670 671 //----------------------------------------------------------------------- 672 // formatting 673 //----------------------------------------------------------------------- 674 675 /** 676 * Formats the number, and inserts the resulting text into 677 * toInsertInto. 678 * @param number The number being formatted 679 * @param toInsertInto The string where the resultant text should 680 * be inserted 681 * @param pos The position in toInsertInto where the resultant text 682 * should be inserted 683 */ 684 void 685 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, UErrorCode& status) const 686 { 687 // first, insert the rule's rule text into toInsertInto at the 688 // specified position, then insert the results of the substitutions 689 // into the right places in toInsertInto (notice we do the 690 // substitutions in reverse order so that the offsets don't get 691 // messed up) 692 int32_t pluralRuleStart = ruleText.length(); 693 int32_t lengthOffset = 0; 694 if (!rulePatternFormat) { 695 toInsertInto.insert(pos, ruleText); 696 } 697 else { 698 pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 699 int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart); 700 int initialLength = toInsertInto.length(); 701 if (pluralRuleEnd < ruleText.length() - 1) { 702 toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2)); 703 } 704 toInsertInto.insert(pos, 705 rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status)); 706 if (pluralRuleStart > 0) { 707 toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart)); 708 } 709 lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength); 710 } 711 712 if (!sub2->isNullSubstitution()) { 713 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), status); 714 } 715 if (!sub1->isNullSubstitution()) { 716 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), status); 717 } 718 } 719 720 /** 721 * Formats the number, and inserts the resulting text into 722 * toInsertInto. 723 * @param number The number being formatted 724 * @param toInsertInto The string where the resultant text should 725 * be inserted 726 * @param pos The position in toInsertInto where the resultant text 727 * should be inserted 728 */ 729 void 730 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, UErrorCode& status) const 731 { 732 // first, insert the rule's rule text into toInsertInto at the 733 // specified position, then insert the results of the substitutions 734 // into the right places in toInsertInto 735 // [again, we have two copies of this routine that do the same thing 736 // so that we don't sacrifice precision in a long by casting it 737 // to a double] 738 int32_t pluralRuleStart = ruleText.length(); 739 int32_t lengthOffset = 0; 740 if (!rulePatternFormat) { 741 toInsertInto.insert(pos, ruleText); 742 } 743 else { 744 pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 745 int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart); 746 int initialLength = toInsertInto.length(); 747 if (pluralRuleEnd < ruleText.length() - 1) { 748 toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2)); 749 } 750 toInsertInto.insert(pos, 751 rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status)); 752 if (pluralRuleStart > 0) { 753 toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart)); 754 } 755 lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength); 756 } 757 758 if (!sub2->isNullSubstitution()) { 759 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), status); 760 } 761 if (!sub1->isNullSubstitution()) { 762 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), status); 763 } 764 } 765 766 /** 767 * Used by the owning rule set to determine whether to invoke the 768 * rollback rule (i.e., whether this rule or the one that precedes 769 * it in the rule set's list should be used to format the number) 770 * @param The number being formatted 771 * @return True if the rule set should use the rule that precedes 772 * this one in its list; false if it should use this rule 773 */ 774 UBool 775 NFRule::shouldRollBack(double number) const 776 { 777 // we roll back if the rule contains a modulus substitution, 778 // the number being formatted is an even multiple of the rule's 779 // divisor, and the rule's base value is NOT an even multiple 780 // of its divisor 781 // In other words, if the original description had 782 // 100: << hundred[ >>]; 783 // that expands into 784 // 100: << hundred; 785 // 101: << hundred >>; 786 // internally. But when we're formatting 200, if we use the rule 787 // at 101, which would normally apply, we get "two hundred zero". 788 // To prevent this, we roll back and use the rule at 100 instead. 789 // This is the logic that makes this happen: the rule at 101 has 790 // a modulus substitution, its base value isn't an even multiple 791 // of 100, and the value we're trying to format _is_ an even 792 // multiple of 100. This is called the "rollback rule." 793 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { 794 int64_t re = util64_pow(radix, exponent); 795 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; 796 } 797 return FALSE; 798 } 799 800 //----------------------------------------------------------------------- 801 // parsing 802 //----------------------------------------------------------------------- 803 804 /** 805 * Attempts to parse the string with this rule. 806 * @param text The string being parsed 807 * @param parsePosition On entry, the value is ignored and assumed to 808 * be 0. On exit, this has been updated with the position of the first 809 * character not consumed by matching the text against this rule 810 * (if this rule doesn't match the text at all, the parse position 811 * if left unchanged (presumably at 0) and the function returns 812 * new Long(0)). 813 * @param isFractionRule True if this rule is contained within a 814 * fraction rule set. This is only used if the rule has no 815 * substitutions. 816 * @return If this rule matched the text, this is the rule's base value 817 * combined appropriately with the results of parsing the substitutions. 818 * If nothing matched, this is new Long(0) and the parse position is 819 * left unchanged. The result will be an instance of Long if the 820 * result is an integer and Double otherwise. The result is never null. 821 */ 822 #ifdef RBNF_DEBUG 823 #include <stdio.h> 824 825 static void dumpUS(FILE* f, const UnicodeString& us) { 826 int len = us.length(); 827 char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1]; 828 if (buf != NULL) { 829 us.extract(0, len, buf); 830 buf[len] = 0; 831 fprintf(f, "%s", buf); 832 uprv_free(buf); //delete[] buf; 833 } 834 } 835 #endif 836 837 UBool 838 NFRule::doParse(const UnicodeString& text, 839 ParsePosition& parsePosition, 840 UBool isFractionRule, 841 double upperBound, 842 Formattable& resVal) const 843 { 844 // internally we operate on a copy of the string being parsed 845 // (because we're going to change it) and use our own ParsePosition 846 ParsePosition pp; 847 UnicodeString workText(text); 848 849 // check to see whether the text before the first substitution 850 // matches the text at the beginning of the string being 851 // parsed. If it does, strip that off the front of workText; 852 // otherwise, dump out with a mismatch 853 UnicodeString prefix; 854 prefix.setTo(ruleText, 0, sub1->getPos()); 855 856 #ifdef RBNF_DEBUG 857 fprintf(stderr, "doParse %x ", this); 858 { 859 UnicodeString rt; 860 _appendRuleText(rt); 861 dumpUS(stderr, rt); 862 } 863 864 fprintf(stderr, " text: '", this); 865 dumpUS(stderr, text); 866 fprintf(stderr, "' prefix: '"); 867 dumpUS(stderr, prefix); 868 #endif 869 stripPrefix(workText, prefix, pp); 870 int32_t prefixLength = text.length() - workText.length(); 871 872 #ifdef RBNF_DEBUG 873 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); 874 #endif 875 876 if (pp.getIndex() == 0 && sub1->getPos() != 0) { 877 // commented out because ParsePosition doesn't have error index in 1.1.x 878 // restored for ICU4C port 879 parsePosition.setErrorIndex(pp.getErrorIndex()); 880 resVal.setLong(0); 881 return TRUE; 882 } 883 884 // this is the fun part. The basic guts of the rule-matching 885 // logic is matchToDelimiter(), which is called twice. The first 886 // time it searches the input string for the rule text BETWEEN 887 // the substitutions and tries to match the intervening text 888 // in the input string with the first substitution. If that 889 // succeeds, it then calls it again, this time to look for the 890 // rule text after the second substitution and to match the 891 // intervening input text against the second substitution. 892 // 893 // For example, say we have a rule that looks like this: 894 // first << middle >> last; 895 // and input text that looks like this: 896 // first one middle two last 897 // First we use stripPrefix() to match "first " in both places and 898 // strip it off the front, leaving 899 // one middle two last 900 // Then we use matchToDelimiter() to match " middle " and try to 901 // match "one" against a substitution. If it's successful, we now 902 // have 903 // two last 904 // We use matchToDelimiter() a second time to match " last" and 905 // try to match "two" against a substitution. If "two" matches 906 // the substitution, we have a successful parse. 907 // 908 // Since it's possible in many cases to find multiple instances 909 // of each of these pieces of rule text in the input string, 910 // we need to try all the possible combinations of these 911 // locations. This prevents us from prematurely declaring a mismatch, 912 // and makes sure we match as much input text as we can. 913 int highWaterMark = 0; 914 double result = 0; 915 int start = 0; 916 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); 917 918 UnicodeString temp; 919 do { 920 // our partial parse result starts out as this rule's base 921 // value. If it finds a successful match, matchToDelimiter() 922 // will compose this in some way with what it gets back from 923 // the substitution, giving us a new partial parse result 924 pp.setIndex(0); 925 926 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); 927 double partialResult = matchToDelimiter(workText, start, tempBaseValue, 928 temp, pp, sub1, 929 upperBound); 930 931 // if we got a successful match (or were trying to match a 932 // null substitution), pp is now pointing at the first unmatched 933 // character. Take note of that, and try matchToDelimiter() 934 // on the input text again 935 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { 936 start = pp.getIndex(); 937 938 UnicodeString workText2; 939 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); 940 ParsePosition pp2; 941 942 // the second matchToDelimiter() will compose our previous 943 // partial result with whatever it gets back from its 944 // substitution if there's a successful match, giving us 945 // a real result 946 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); 947 partialResult = matchToDelimiter(workText2, 0, partialResult, 948 temp, pp2, sub2, 949 upperBound); 950 951 // if we got a successful match on this second 952 // matchToDelimiter() call, update the high-water mark 953 // and result (if necessary) 954 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { 955 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { 956 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); 957 result = partialResult; 958 } 959 } 960 // commented out because ParsePosition doesn't have error index in 1.1.x 961 // restored for ICU4C port 962 else { 963 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); 964 if (temp> parsePosition.getErrorIndex()) { 965 parsePosition.setErrorIndex(temp); 966 } 967 } 968 } 969 // commented out because ParsePosition doesn't have error index in 1.1.x 970 // restored for ICU4C port 971 else { 972 int32_t temp = sub1->getPos() + pp.getErrorIndex(); 973 if (temp > parsePosition.getErrorIndex()) { 974 parsePosition.setErrorIndex(temp); 975 } 976 } 977 // keep trying to match things until the outer matchToDelimiter() 978 // call fails to make a match (each time, it picks up where it 979 // left off the previous time) 980 } while (sub1->getPos() != sub2->getPos() 981 && pp.getIndex() > 0 982 && pp.getIndex() < workText.length() 983 && pp.getIndex() != start); 984 985 // update the caller's ParsePosition with our high-water mark 986 // (i.e., it now points at the first character this function 987 // didn't match-- the ParsePosition is therefore unchanged if 988 // we didn't match anything) 989 parsePosition.setIndex(highWaterMark); 990 // commented out because ParsePosition doesn't have error index in 1.1.x 991 // restored for ICU4C port 992 if (highWaterMark > 0) { 993 parsePosition.setErrorIndex(0); 994 } 995 996 // this is a hack for one unusual condition: Normally, whether this 997 // rule belong to a fraction rule set or not is handled by its 998 // substitutions. But if that rule HAS NO substitutions, then 999 // we have to account for it here. By definition, if the matching 1000 // rule in a fraction rule set has no substitutions, its numerator 1001 // is 1, and so the result is the reciprocal of its base value. 1002 if (isFractionRule && 1003 highWaterMark > 0 && 1004 sub1->isNullSubstitution()) { 1005 result = 1 / result; 1006 } 1007 1008 resVal.setDouble(result); 1009 return TRUE; // ??? do we need to worry if it is a long or a double? 1010 } 1011 1012 /** 1013 * This function is used by parse() to match the text being parsed 1014 * against a possible prefix string. This function 1015 * matches characters from the beginning of the string being parsed 1016 * to characters from the prospective prefix. If they match, pp is 1017 * updated to the first character not matched, and the result is 1018 * the unparsed part of the string. If they don't match, the whole 1019 * string is returned, and pp is left unchanged. 1020 * @param text The string being parsed 1021 * @param prefix The text to match against 1022 * @param pp On entry, ignored and assumed to be 0. On exit, points 1023 * to the first unmatched character (assuming the whole prefix matched), 1024 * or is unchanged (if the whole prefix didn't match). 1025 * @return If things match, this is the unparsed part of "text"; 1026 * if they didn't match, this is "text". 1027 */ 1028 void 1029 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const 1030 { 1031 // if the prefix text is empty, dump out without doing anything 1032 if (prefix.length() != 0) { 1033 UErrorCode status = U_ZERO_ERROR; 1034 // use prefixLength() to match the beginning of 1035 // "text" against "prefix". This function returns the 1036 // number of characters from "text" that matched (or 0 if 1037 // we didn't match the whole prefix) 1038 int32_t pfl = prefixLength(text, prefix, status); 1039 if (U_FAILURE(status)) { // Memory allocation error. 1040 return; 1041 } 1042 if (pfl != 0) { 1043 // if we got a successful match, update the parse position 1044 // and strip the prefix off of "text" 1045 pp.setIndex(pp.getIndex() + pfl); 1046 text.remove(0, pfl); 1047 } 1048 } 1049 } 1050 1051 /** 1052 * Used by parse() to match a substitution and any following text. 1053 * "text" is searched for instances of "delimiter". For each instance 1054 * of delimiter, the intervening text is tested to see whether it 1055 * matches the substitution. The longest match wins. 1056 * @param text The string being parsed 1057 * @param startPos The position in "text" where we should start looking 1058 * for "delimiter". 1059 * @param baseValue A partial parse result (often the rule's base value), 1060 * which is combined with the result from matching the substitution 1061 * @param delimiter The string to search "text" for. 1062 * @param pp Ignored and presumed to be 0 on entry. If there's a match, 1063 * on exit this will point to the first unmatched character. 1064 * @param sub If we find "delimiter" in "text", this substitution is used 1065 * to match the text between the beginning of the string and the 1066 * position of "delimiter." (If "delimiter" is the empty string, then 1067 * this function just matches against this substitution and updates 1068 * everything accordingly.) 1069 * @param upperBound When matching the substitution, it will only 1070 * consider rules with base values lower than this value. 1071 * @return If there's a match, this is the result of composing 1072 * baseValue with the result of matching the substitution. Otherwise, 1073 * this is new Long(0). It's never null. If the result is an integer, 1074 * this will be an instance of Long; otherwise, it's an instance of 1075 * Double. 1076 * 1077 * !!! note {dlf} in point of fact, in the java code the caller always converts 1078 * the result to a double, so we might as well return one. 1079 */ 1080 double 1081 NFRule::matchToDelimiter(const UnicodeString& text, 1082 int32_t startPos, 1083 double _baseValue, 1084 const UnicodeString& delimiter, 1085 ParsePosition& pp, 1086 const NFSubstitution* sub, 1087 double upperBound) const 1088 { 1089 UErrorCode status = U_ZERO_ERROR; 1090 // if "delimiter" contains real (i.e., non-ignorable) text, search 1091 // it for "delimiter" beginning at "start". If that succeeds, then 1092 // use "sub"'s doParse() method to match the text before the 1093 // instance of "delimiter" we just found. 1094 if (!allIgnorable(delimiter, status)) { 1095 if (U_FAILURE(status)) { //Memory allocation error. 1096 return 0; 1097 } 1098 ParsePosition tempPP; 1099 Formattable result; 1100 1101 // use findText() to search for "delimiter". It returns a two- 1102 // element array: element 0 is the position of the match, and 1103 // element 1 is the number of characters that matched 1104 // "delimiter". 1105 int32_t dLen; 1106 int32_t dPos = findText(text, delimiter, startPos, &dLen); 1107 1108 // if findText() succeeded, isolate the text preceding the 1109 // match, and use "sub" to match that text 1110 while (dPos >= 0) { 1111 UnicodeString subText; 1112 subText.setTo(text, 0, dPos); 1113 if (subText.length() > 0) { 1114 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, 1115 #if UCONFIG_NO_COLLATION 1116 FALSE, 1117 #else 1118 formatter->isLenient(), 1119 #endif 1120 result); 1121 1122 // if the substitution could match all the text up to 1123 // where we found "delimiter", then this function has 1124 // a successful match. Bump the caller's parse position 1125 // to point to the first character after the text 1126 // that matches "delimiter", and return the result 1127 // we got from parsing the substitution. 1128 if (success && tempPP.getIndex() == dPos) { 1129 pp.setIndex(dPos + dLen); 1130 return result.getDouble(); 1131 } 1132 // commented out because ParsePosition doesn't have error index in 1.1.x 1133 // restored for ICU4C port 1134 else { 1135 if (tempPP.getErrorIndex() > 0) { 1136 pp.setErrorIndex(tempPP.getErrorIndex()); 1137 } else { 1138 pp.setErrorIndex(tempPP.getIndex()); 1139 } 1140 } 1141 } 1142 1143 // if we didn't match the substitution, search for another 1144 // copy of "delimiter" in "text" and repeat the loop if 1145 // we find it 1146 tempPP.setIndex(0); 1147 dPos = findText(text, delimiter, dPos + dLen, &dLen); 1148 } 1149 // if we make it here, this was an unsuccessful match, and we 1150 // leave pp unchanged and return 0 1151 pp.setIndex(0); 1152 return 0; 1153 1154 // if "delimiter" is empty, or consists only of ignorable characters 1155 // (i.e., is semantically empty), thwe we obviously can't search 1156 // for "delimiter". Instead, just use "sub" to parse as much of 1157 // "text" as possible. 1158 } else { 1159 ParsePosition tempPP; 1160 Formattable result; 1161 1162 // try to match the whole string against the substitution 1163 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, 1164 #if UCONFIG_NO_COLLATION 1165 FALSE, 1166 #else 1167 formatter->isLenient(), 1168 #endif 1169 result); 1170 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { 1171 // if there's a successful match (or it's a null 1172 // substitution), update pp to point to the first 1173 // character we didn't match, and pass the result from 1174 // sub.doParse() on through to the caller 1175 pp.setIndex(tempPP.getIndex()); 1176 return result.getDouble(); 1177 } 1178 // commented out because ParsePosition doesn't have error index in 1.1.x 1179 // restored for ICU4C port 1180 else { 1181 pp.setErrorIndex(tempPP.getErrorIndex()); 1182 } 1183 1184 // and if we get to here, then nothing matched, so we return 1185 // 0 and leave pp alone 1186 return 0; 1187 } 1188 } 1189 1190 /** 1191 * Used by stripPrefix() to match characters. If lenient parse mode 1192 * is off, this just calls startsWith(). If lenient parse mode is on, 1193 * this function uses CollationElementIterators to match characters in 1194 * the strings (only primary-order differences are significant in 1195 * determining whether there's a match). 1196 * @param str The string being tested 1197 * @param prefix The text we're hoping to see at the beginning 1198 * of "str" 1199 * @return If "prefix" is found at the beginning of "str", this 1200 * is the number of characters in "str" that were matched (this 1201 * isn't necessarily the same as the length of "prefix" when matching 1202 * text with a collator). If there's no match, this is 0. 1203 */ 1204 int32_t 1205 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const 1206 { 1207 // if we're looking for an empty prefix, it obviously matches 1208 // zero characters. Just go ahead and return 0. 1209 if (prefix.length() == 0) { 1210 return 0; 1211 } 1212 1213 #if !UCONFIG_NO_COLLATION 1214 // go through all this grief if we're in lenient-parse mode 1215 if (formatter->isLenient()) { 1216 // get the formatter's collator and use it to create two 1217 // collation element iterators, one over the target string 1218 // and another over the prefix (right now, we'll throw an 1219 // exception if the collator we get back from the formatter 1220 // isn't a RuleBasedCollator, because RuleBasedCollator defines 1221 // the CollationElementIterator protocol. Hopefully, this 1222 // will change someday.) 1223 const RuleBasedCollator* collator = formatter->getCollator(); 1224 if (collator == NULL) { 1225 status = U_MEMORY_ALLOCATION_ERROR; 1226 return 0; 1227 } 1228 LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str)); 1229 LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix)); 1230 // Check for memory allocation error. 1231 if (strIter.isNull() || prefixIter.isNull()) { 1232 status = U_MEMORY_ALLOCATION_ERROR; 1233 return 0; 1234 } 1235 1236 UErrorCode err = U_ZERO_ERROR; 1237 1238 // The original code was problematic. Consider this match: 1239 // prefix = "fifty-" 1240 // string = " fifty-7" 1241 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 1242 // in the string. Unfortunately, we were getting a match, and then computing where 1243 // the match terminated by rematching the string. The rematch code was using as an 1244 // initial guess the substring of string between 0 and prefix.length. Because of 1245 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving 1246 // the position before the hyphen in the string. Recursing down, we then parsed the 1247 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). 1248 // This was not pretty, especially since the string "fifty-7" parsed just fine. 1249 // 1250 // We have newer APIs now, so we can use calls on the iterator to determine what we 1251 // matched up to. If we terminate because we hit the last element in the string, 1252 // our match terminates at this length. If we terminate because we hit the last element 1253 // in the target, our match terminates at one before the element iterator position. 1254 1255 // match collation elements between the strings 1256 int32_t oStr = strIter->next(err); 1257 int32_t oPrefix = prefixIter->next(err); 1258 1259 while (oPrefix != CollationElementIterator::NULLORDER) { 1260 // skip over ignorable characters in the target string 1261 while (CollationElementIterator::primaryOrder(oStr) == 0 1262 && oStr != CollationElementIterator::NULLORDER) { 1263 oStr = strIter->next(err); 1264 } 1265 1266 // skip over ignorable characters in the prefix 1267 while (CollationElementIterator::primaryOrder(oPrefix) == 0 1268 && oPrefix != CollationElementIterator::NULLORDER) { 1269 oPrefix = prefixIter->next(err); 1270 } 1271 1272 // dlf: move this above following test, if we consume the 1273 // entire target, aren't we ok even if the source was also 1274 // entirely consumed? 1275 1276 // if skipping over ignorables brought to the end of 1277 // the prefix, we DID match: drop out of the loop 1278 if (oPrefix == CollationElementIterator::NULLORDER) { 1279 break; 1280 } 1281 1282 // if skipping over ignorables brought us to the end 1283 // of the target string, we didn't match and return 0 1284 if (oStr == CollationElementIterator::NULLORDER) { 1285 return 0; 1286 } 1287 1288 // match collation elements from the two strings 1289 // (considering only primary differences). If we 1290 // get a mismatch, dump out and return 0 1291 if (CollationElementIterator::primaryOrder(oStr) 1292 != CollationElementIterator::primaryOrder(oPrefix)) { 1293 return 0; 1294 1295 // otherwise, advance to the next character in each string 1296 // and loop (we drop out of the loop when we exhaust 1297 // collation elements in the prefix) 1298 } else { 1299 oStr = strIter->next(err); 1300 oPrefix = prefixIter->next(err); 1301 } 1302 } 1303 1304 int32_t result = strIter->getOffset(); 1305 if (oStr != CollationElementIterator::NULLORDER) { 1306 --result; // back over character that we don't want to consume; 1307 } 1308 1309 #ifdef RBNF_DEBUG 1310 fprintf(stderr, "prefix length: %d\n", result); 1311 #endif 1312 return result; 1313 #if 0 1314 //---------------------------------------------------------------- 1315 // JDK 1.2-specific API call 1316 // return strIter.getOffset(); 1317 //---------------------------------------------------------------- 1318 // JDK 1.1 HACK (take out for 1.2-specific code) 1319 1320 // if we make it to here, we have a successful match. Now we 1321 // have to find out HOW MANY characters from the target string 1322 // matched the prefix (there isn't necessarily a one-to-one 1323 // mapping between collation elements and characters). 1324 // In JDK 1.2, there's a simple getOffset() call we can use. 1325 // In JDK 1.1, on the other hand, we have to go through some 1326 // ugly contortions. First, use the collator to compare the 1327 // same number of characters from the prefix and target string. 1328 // If they're equal, we're done. 1329 collator->setStrength(Collator::PRIMARY); 1330 if (str.length() >= prefix.length()) { 1331 UnicodeString temp; 1332 temp.setTo(str, 0, prefix.length()); 1333 if (collator->equals(temp, prefix)) { 1334 #ifdef RBNF_DEBUG 1335 fprintf(stderr, "returning: %d\n", prefix.length()); 1336 #endif 1337 return prefix.length(); 1338 } 1339 } 1340 1341 // if they're not equal, then we have to compare successively 1342 // larger and larger substrings of the target string until we 1343 // get to one that matches the prefix. At that point, we know 1344 // how many characters matched the prefix, and we can return. 1345 int32_t p = 1; 1346 while (p <= str.length()) { 1347 UnicodeString temp; 1348 temp.setTo(str, 0, p); 1349 if (collator->equals(temp, prefix)) { 1350 return p; 1351 } else { 1352 ++p; 1353 } 1354 } 1355 1356 // SHOULD NEVER GET HERE!!! 1357 return 0; 1358 //---------------------------------------------------------------- 1359 #endif 1360 1361 // If lenient parsing is turned off, forget all that crap above. 1362 // Just use String.startsWith() and be done with it. 1363 } else 1364 #endif 1365 { 1366 if (str.startsWith(prefix)) { 1367 return prefix.length(); 1368 } else { 1369 return 0; 1370 } 1371 } 1372 } 1373 1374 /** 1375 * Searches a string for another string. If lenient parsing is off, 1376 * this just calls indexOf(). If lenient parsing is on, this function 1377 * uses CollationElementIterator to match characters, and only 1378 * primary-order differences are significant in determining whether 1379 * there's a match. 1380 * @param str The string to search 1381 * @param key The string to search "str" for 1382 * @param startingAt The index into "str" where the search is to 1383 * begin 1384 * @return A two-element array of ints. Element 0 is the position 1385 * of the match, or -1 if there was no match. Element 1 is the 1386 * number of characters in "str" that matched (which isn't necessarily 1387 * the same as the length of "key") 1388 */ 1389 int32_t 1390 NFRule::findText(const UnicodeString& str, 1391 const UnicodeString& key, 1392 int32_t startingAt, 1393 int32_t* length) const 1394 { 1395 if (rulePatternFormat) { 1396 Formattable result; 1397 FieldPosition position(UNUM_INTEGER_FIELD); 1398 position.setBeginIndex(startingAt); 1399 rulePatternFormat->parseType(str, this, result, position); 1400 int start = position.getBeginIndex(); 1401 if (start >= 0) { 1402 int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 1403 int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2; 1404 int32_t matchLen = position.getEndIndex() - start; 1405 UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart)); 1406 UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix)); 1407 if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0 1408 && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0) 1409 { 1410 *length = matchLen + prefix.length() + suffix.length(); 1411 return start - prefix.length(); 1412 } 1413 } 1414 *length = 0; 1415 return -1; 1416 } 1417 if (!formatter->isLenient()) { 1418 // if lenient parsing is turned off, this is easy: just call 1419 // String.indexOf() and we're done 1420 *length = key.length(); 1421 return str.indexOf(key, startingAt); 1422 } 1423 else { 1424 // but if lenient parsing is turned ON, we've got some work 1425 // ahead of us 1426 return findTextLenient(str, key, startingAt, length); 1427 } 1428 } 1429 1430 int32_t 1431 NFRule::findTextLenient(const UnicodeString& str, 1432 const UnicodeString& key, 1433 int32_t startingAt, 1434 int32_t* length) const 1435 { 1436 //---------------------------------------------------------------- 1437 // JDK 1.1 HACK (take out of 1.2-specific code) 1438 1439 // in JDK 1.2, CollationElementIterator provides us with an 1440 // API to map between character offsets and collation elements 1441 // and we can do this by marching through the string comparing 1442 // collation elements. We can't do that in JDK 1.1. Insted, 1443 // we have to go through this horrible slow mess: 1444 int32_t p = startingAt; 1445 int32_t keyLen = 0; 1446 1447 // basically just isolate smaller and smaller substrings of 1448 // the target string (each running to the end of the string, 1449 // and with the first one running from startingAt to the end) 1450 // and then use prefixLength() to see if the search key is at 1451 // the beginning of each substring. This is excruciatingly 1452 // slow, but it will locate the key and tell use how long the 1453 // matching text was. 1454 UnicodeString temp; 1455 UErrorCode status = U_ZERO_ERROR; 1456 while (p < str.length() && keyLen == 0) { 1457 temp.setTo(str, p, str.length() - p); 1458 keyLen = prefixLength(temp, key, status); 1459 if (U_FAILURE(status)) { 1460 break; 1461 } 1462 if (keyLen != 0) { 1463 *length = keyLen; 1464 return p; 1465 } 1466 ++p; 1467 } 1468 // if we make it to here, we didn't find it. Return -1 for the 1469 // location. The length should be ignored, but set it to 0, 1470 // which should be "safe" 1471 *length = 0; 1472 return -1; 1473 } 1474 1475 /** 1476 * Checks to see whether a string consists entirely of ignorable 1477 * characters. 1478 * @param str The string to test. 1479 * @return true if the string is empty of consists entirely of 1480 * characters that the number formatter's collator says are 1481 * ignorable at the primary-order level. false otherwise. 1482 */ 1483 UBool 1484 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const 1485 { 1486 // if the string is empty, we can just return true 1487 if (str.length() == 0) { 1488 return TRUE; 1489 } 1490 1491 #if !UCONFIG_NO_COLLATION 1492 // if lenient parsing is turned on, walk through the string with 1493 // a collation element iterator and make sure each collation 1494 // element is 0 (ignorable) at the primary level 1495 if (formatter->isLenient()) { 1496 const RuleBasedCollator* collator = formatter->getCollator(); 1497 if (collator == NULL) { 1498 status = U_MEMORY_ALLOCATION_ERROR; 1499 return FALSE; 1500 } 1501 LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str)); 1502 1503 // Memory allocation error check. 1504 if (iter.isNull()) { 1505 status = U_MEMORY_ALLOCATION_ERROR; 1506 return FALSE; 1507 } 1508 1509 UErrorCode err = U_ZERO_ERROR; 1510 int32_t o = iter->next(err); 1511 while (o != CollationElementIterator::NULLORDER 1512 && CollationElementIterator::primaryOrder(o) == 0) { 1513 o = iter->next(err); 1514 } 1515 1516 return o == CollationElementIterator::NULLORDER; 1517 } 1518 #endif 1519 1520 // if lenient parsing is turned off, there is no such thing as 1521 // an ignorable character: return true only if the string is empty 1522 return FALSE; 1523 } 1524 1525 U_NAMESPACE_END 1526 1527 /* U_HAVE_RBNF */ 1528 #endif 1529