1 /* 2 ****************************************************************************** 3 * Copyright (C) 1997-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 * file name: nfrule.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 10/11/2001 Doug Ported from ICU4J 14 */ 15 16 #include "nfrule.h" 17 18 #if U_HAVE_RBNF 19 20 #include "unicode/rbnf.h" 21 #include "unicode/tblcoll.h" 22 #include "unicode/coleitr.h" 23 #include "unicode/uchar.h" 24 #include "nfrs.h" 25 #include "nfrlist.h" 26 #include "nfsubs.h" 27 28 #include "util.h" 29 30 U_NAMESPACE_BEGIN 31 32 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) 33 : baseValue((int32_t)0) 34 , radix(0) 35 , exponent(0) 36 , ruleText() 37 , sub1(NULL) 38 , sub2(NULL) 39 , formatter(_rbnf) 40 { 41 } 42 43 NFRule::~NFRule() 44 { 45 delete sub1; 46 delete sub2; 47 } 48 49 static const UChar gLeftBracket = 0x005b; 50 static const UChar gRightBracket = 0x005d; 51 static const UChar gColon = 0x003a; 52 static const UChar gZero = 0x0030; 53 static const UChar gNine = 0x0039; 54 static const UChar gSpace = 0x0020; 55 static const UChar gSlash = 0x002f; 56 static const UChar gGreaterThan = 0x003e; 57 static const UChar gLessThan = 0x003c; 58 static const UChar gComma = 0x002c; 59 static const UChar gDot = 0x002e; 60 static const UChar gTick = 0x0027; 61 //static const UChar gMinus = 0x002d; 62 static const UChar gSemicolon = 0x003b; 63 64 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ 65 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ 66 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ 67 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ 68 69 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ 70 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ 71 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ 72 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ 73 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ 74 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ 75 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ 76 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ 77 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ 78 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ 79 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ 80 static const UChar gEmptyString[] = {0}; /* "" */ 81 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ 82 83 static const UChar * const tokenStrings[] = { 84 gLessLess, gLessPercent, gLessHash, gLessZero, 85 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, 86 gEqualPercent, gEqualHash, gEqualZero, NULL 87 }; 88 89 void 90 NFRule::makeRules(UnicodeString& description, 91 const NFRuleSet *ruleSet, 92 const NFRule *predecessor, 93 const RuleBasedNumberFormat *rbnf, 94 NFRuleList& rules, 95 UErrorCode& status) 96 { 97 // we know we're making at least one rule, so go ahead and 98 // new it up and initialize its basevalue and divisor 99 // (this also strips the rule descriptor, if any, off the 100 // descripton string) 101 NFRule* rule1 = new NFRule(rbnf); 102 /* test for NULL */ 103 if (rule1 == 0) { 104 status = U_MEMORY_ALLOCATION_ERROR; 105 return; 106 } 107 rule1->parseRuleDescriptor(description, status); 108 109 // check the description to see whether there's text enclosed 110 // in brackets 111 int32_t brack1 = description.indexOf(gLeftBracket); 112 int32_t brack2 = description.indexOf(gRightBracket); 113 114 // if the description doesn't contain a matched pair of brackets, 115 // or if it's of a type that doesn't recognize bracketed text, 116 // then leave the description alone, initialize the rule's 117 // rule text and substitutions, and return that rule 118 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 119 || rule1->getType() == kProperFractionRule 120 || rule1->getType() == kNegativeNumberRule) { 121 rule1->ruleText = description; 122 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); 123 rules.add(rule1); 124 } else { 125 // if the description does contain a matched pair of brackets, 126 // then it's really shorthand for two rules (with one exception) 127 NFRule* rule2 = NULL; 128 UnicodeString sbuf; 129 130 // we'll actually only split the rule into two rules if its 131 // base value is an even multiple of its divisor (or it's one 132 // of the special rules) 133 if ((rule1->baseValue > 0 134 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) 135 || rule1->getType() == kImproperFractionRule 136 || rule1->getType() == kMasterRule) { 137 138 // if it passes that test, new up the second rule. If the 139 // rule set both rules will belong to is a fraction rule 140 // set, they both have the same base value; otherwise, 141 // increment the original rule's base value ("rule1" actually 142 // goes SECOND in the rule set's rule list) 143 rule2 = new NFRule(rbnf); 144 /* test for NULL */ 145 if (rule2 == 0) { 146 status = U_MEMORY_ALLOCATION_ERROR; 147 return; 148 } 149 if (rule1->baseValue >= 0) { 150 rule2->baseValue = rule1->baseValue; 151 if (!ruleSet->isFractionRuleSet()) { 152 ++rule1->baseValue; 153 } 154 } 155 156 // if the description began with "x.x" and contains bracketed 157 // text, it describes both the improper fraction rule and 158 // the proper fraction rule 159 else if (rule1->getType() == kImproperFractionRule) { 160 rule2->setType(kProperFractionRule); 161 } 162 163 // if the description began with "x.0" and contains bracketed 164 // text, it describes both the master rule and the 165 // improper fraction rule 166 else if (rule1->getType() == kMasterRule) { 167 rule2->baseValue = rule1->baseValue; 168 rule1->setType(kImproperFractionRule); 169 } 170 171 // both rules have the same radix and exponent (i.e., the 172 // same divisor) 173 rule2->radix = rule1->radix; 174 rule2->exponent = rule1->exponent; 175 176 // rule2's rule text omits the stuff in brackets: initalize 177 // its rule text and substitutions accordingly 178 sbuf.append(description, 0, brack1); 179 if (brack2 + 1 < description.length()) { 180 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 181 } 182 rule2->ruleText.setTo(sbuf); 183 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); 184 } 185 186 // rule1's text includes the text in the brackets but omits 187 // the brackets themselves: initialize _its_ rule text and 188 // substitutions accordingly 189 sbuf.setTo(description, 0, brack1); 190 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); 191 if (brack2 + 1 < description.length()) { 192 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 193 } 194 rule1->ruleText.setTo(sbuf); 195 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); 196 197 // if we only have one rule, return it; if we have two, return 198 // a two-element array containing them (notice that rule2 goes 199 // BEFORE rule1 in the list: in all cases, rule2 OMITS the 200 // material in the brackets and rule1 INCLUDES the material 201 // in the brackets) 202 if (rule2 != NULL) { 203 rules.add(rule2); 204 } 205 rules.add(rule1); 206 } 207 } 208 209 /** 210 * This function parses the rule's rule descriptor (i.e., the base 211 * value and/or other tokens that precede the rule's rule text 212 * in the description) and sets the rule's base value, radix, and 213 * exponent according to the descriptor. (If the description doesn't 214 * include a rule descriptor, then this function sets everything to 215 * default values and the rule set sets the rule's real base value). 216 * @param description The rule's description 217 * @return If "description" included a rule descriptor, this is 218 * "description" with the descriptor and any trailing whitespace 219 * stripped off. Otherwise; it's "descriptor" unchangd. 220 */ 221 void 222 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) 223 { 224 // the description consists of a rule descriptor and a rule body, 225 // separated by a colon. The rule descriptor is optional. If 226 // it's omitted, just set the base value to 0. 227 int32_t p = description.indexOf(gColon); 228 if (p == -1) { 229 setBaseValue((int32_t)0, status); 230 } else { 231 // copy the descriptor out into its own string and strip it, 232 // along with any trailing whitespace, out of the original 233 // description 234 UnicodeString descriptor; 235 descriptor.setTo(description, 0, p); 236 237 ++p; 238 while (p < description.length() && uprv_isRuleWhiteSpace(description.charAt(p))) { 239 ++p; 240 } 241 description.removeBetween(0, p); 242 243 // check first to see if the rule descriptor matches the token 244 // for one of the special rules. If it does, set the base 245 // value to the correct identfier value 246 if (descriptor == gMinusX) { 247 setType(kNegativeNumberRule); 248 } 249 else if (descriptor == gXDotX) { 250 setType(kImproperFractionRule); 251 } 252 else if (descriptor == gZeroDotX) { 253 setType(kProperFractionRule); 254 } 255 else if (descriptor == gXDotZero) { 256 setType(kMasterRule); 257 } 258 259 // if the rule descriptor begins with a digit, it's a descriptor 260 // for a normal rule 261 // since we don't have Long.parseLong, and this isn't much work anyway, 262 // just build up the value as we encounter the digits. 263 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { 264 int64_t val = 0; 265 p = 0; 266 UChar c = gSpace; 267 268 // begin parsing the descriptor: copy digits 269 // into "tempValue", skip periods, commas, and spaces, 270 // stop on a slash or > sign (or at the end of the string), 271 // and throw an exception on any other character 272 int64_t ll_10 = 10; 273 while (p < descriptor.length()) { 274 c = descriptor.charAt(p); 275 if (c >= gZero && c <= gNine) { 276 val = val * ll_10 + (int32_t)(c - gZero); 277 } 278 else if (c == gSlash || c == gGreaterThan) { 279 break; 280 } 281 else if (uprv_isRuleWhiteSpace(c) || c == gComma || c == gDot) { 282 } 283 else { 284 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 285 status = U_PARSE_ERROR; 286 return; 287 } 288 ++p; 289 } 290 291 // we have the base value, so set it 292 setBaseValue(val, status); 293 294 // if we stopped the previous loop on a slash, we're 295 // now parsing the rule's radix. Again, accumulate digits 296 // in tempValue, skip punctuation, stop on a > mark, and 297 // throw an exception on anything else 298 if (c == gSlash) { 299 val = 0; 300 ++p; 301 int64_t ll_10 = 10; 302 while (p < descriptor.length()) { 303 c = descriptor.charAt(p); 304 if (c >= gZero && c <= gNine) { 305 val = val * ll_10 + (int32_t)(c - gZero); 306 } 307 else if (c == gGreaterThan) { 308 break; 309 } 310 else if (uprv_isRuleWhiteSpace(c) || c == gComma || c == gDot) { 311 } 312 else { 313 // throw new IllegalArgumentException("Illegal character is rule descriptor"); 314 status = U_PARSE_ERROR; 315 return; 316 } 317 ++p; 318 } 319 320 // tempValue now contain's the rule's radix. Set it 321 // accordingly, and recalculate the rule's exponent 322 radix = (int32_t)val; 323 if (radix == 0) { 324 // throw new IllegalArgumentException("Rule can't have radix of 0"); 325 status = U_PARSE_ERROR; 326 } 327 328 exponent = expectedExponent(); 329 } 330 331 // if we stopped the previous loop on a > sign, then continue 332 // for as long as we still see > signs. For each one, 333 // decrement the exponent (unless the exponent is already 0). 334 // If we see another character before reaching the end of 335 // the descriptor, that's also a syntax error. 336 if (c == gGreaterThan) { 337 while (p < descriptor.length()) { 338 c = descriptor.charAt(p); 339 if (c == gGreaterThan && exponent > 0) { 340 --exponent; 341 } else { 342 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 343 status = U_PARSE_ERROR; 344 return; 345 } 346 ++p; 347 } 348 } 349 } 350 } 351 352 // finally, if the rule body begins with an apostrophe, strip it off 353 // (this is generally used to put whitespace at the beginning of 354 // a rule's rule text) 355 if (description.length() > 0 && description.charAt(0) == gTick) { 356 description.removeBetween(0, 1); 357 } 358 359 // return the description with all the stuff we've just waded through 360 // stripped off the front. It now contains just the rule body. 361 // return description; 362 } 363 364 /** 365 * Searches the rule's rule text for the substitution tokens, 366 * creates the substitutions, and removes the substitution tokens 367 * from the rule's rule text. 368 * @param owner The rule set containing this rule 369 * @param predecessor The rule preseding this one in "owners" rule list 370 * @param ownersOwner The RuleBasedFormat that owns this rule 371 */ 372 void 373 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, 374 const NFRule* predecessor, 375 const RuleBasedNumberFormat* rbnf, 376 UErrorCode& status) 377 { 378 if (U_SUCCESS(status)) { 379 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); 380 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); 381 } 382 } 383 384 /** 385 * Searches the rule's rule text for the first substitution token, 386 * creates a substitution based on it, and removes the token from 387 * the rule's rule text. 388 * @param owner The rule set containing this rule 389 * @param predecessor The rule preceding this one in the rule set's 390 * rule list 391 * @param ownersOwner The RuleBasedNumberFormat that owns this rule 392 * @return The newly-created substitution. This is never null; if 393 * the rule text doesn't contain any substitution tokens, this will 394 * be a NullSubstitution. 395 */ 396 NFSubstitution * 397 NFRule::extractSubstitution(const NFRuleSet* ruleSet, 398 const NFRule* predecessor, 399 const RuleBasedNumberFormat* rbnf, 400 UErrorCode& status) 401 { 402 NFSubstitution* result = NULL; 403 404 // search the rule's rule text for the first two characters of 405 // a substitution token 406 int32_t subStart = indexOfAny(tokenStrings); 407 int32_t subEnd = subStart; 408 409 // if we didn't find one, create a null substitution positioned 410 // at the end of the rule text 411 if (subStart == -1) { 412 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 413 ruleSet, rbnf, gEmptyString, status); 414 } 415 416 // special-case the ">>>" token, since searching for the > at the 417 // end will actually find the > in the middle 418 if (ruleText.indexOf(gGreaterGreaterGreater) == subStart) { 419 subEnd = subStart + 2; 420 421 // otherwise the substitution token ends with the same character 422 // it began with 423 } else { 424 UChar c = ruleText.charAt(subStart); 425 subEnd = ruleText.indexOf(c, subStart + 1); 426 // special case for '<%foo<<' 427 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { 428 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle 429 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack 430 // to get around this. Having the duplicate at the front would cause problems with 431 // rules like "<<%" to format, say, percents... 432 ++subEnd; 433 } 434 } 435 436 // if we don't find the end of the token (i.e., if we're on a single, 437 // unmatched token character), create a null substitution positioned 438 // at the end of the rule 439 if (subEnd == -1) { 440 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 441 ruleSet, rbnf, gEmptyString, status); 442 } 443 444 // if we get here, we have a real substitution token (or at least 445 // some text bounded by substitution token characters). Use 446 // makeSubstitution() to create the right kind of substitution 447 UnicodeString subToken; 448 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); 449 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, 450 rbnf, subToken, status); 451 452 // remove the substitution from the rule text 453 ruleText.removeBetween(subStart, subEnd+1); 454 455 return result; 456 } 457 458 /** 459 * Sets the rule's base value, and causes the radix and exponent 460 * to be recalculated. This is used during construction when we 461 * don't know the rule's base value until after it's been 462 * constructed. It should be used at any other time. 463 * @param The new base value for the rule. 464 */ 465 void 466 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) 467 { 468 // set the base value 469 baseValue = newBaseValue; 470 471 // if this isn't a special rule, recalculate the radix and exponent 472 // (the radix always defaults to 10; if it's supposed to be something 473 // else, it's cleaned up by the caller and the exponent is 474 // recalculated again-- the only function that does this is 475 // NFRule.parseRuleDescriptor() ) 476 if (baseValue >= 1) { 477 radix = 10; 478 exponent = expectedExponent(); 479 480 // this function gets called on a fully-constructed rule whose 481 // description didn't specify a base value. This means it 482 // has substitutions, and some substitutions hold on to copies 483 // of the rule's divisor. Fix their copies of the divisor. 484 if (sub1 != NULL) { 485 sub1->setDivisor(radix, exponent, status); 486 } 487 if (sub2 != NULL) { 488 sub2->setDivisor(radix, exponent, status); 489 } 490 491 // if this is a special rule, its radix and exponent are basically 492 // ignored. Set them to "safe" default values 493 } else { 494 radix = 10; 495 exponent = 0; 496 } 497 } 498 499 /** 500 * This calculates the rule's exponent based on its radix and base 501 * value. This will be the highest power the radix can be raised to 502 * and still produce a result less than or equal to the base value. 503 */ 504 int16_t 505 NFRule::expectedExponent() const 506 { 507 // since the log of 0, or the log base 0 of something, causes an 508 // error, declare the exponent in these cases to be 0 (we also 509 // deal with the special-rule identifiers here) 510 if (radix == 0 || baseValue < 1) { 511 return 0; 512 } 513 514 // we get rounding error in some cases-- for example, log 1000 / log 10 515 // gives us 1.9999999996 instead of 2. The extra logic here is to take 516 // that into account 517 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); 518 int64_t temp = util64_pow(radix, tempResult + 1); 519 if (temp <= baseValue) { 520 tempResult += 1; 521 } 522 return tempResult; 523 } 524 525 /** 526 * Searches the rule's rule text for any of the specified strings. 527 * @param strings An array of strings to search the rule's rule 528 * text for 529 * @return The index of the first match in the rule's rule text 530 * (i.e., the first substring in the rule's rule text that matches 531 * _any_ of the strings in "strings"). If none of the strings in 532 * "strings" is found in the rule's rule text, returns -1. 533 */ 534 int32_t 535 NFRule::indexOfAny(const UChar* const strings[]) const 536 { 537 int result = -1; 538 for (int i = 0; strings[i]; i++) { 539 int32_t pos = ruleText.indexOf(*strings[i]); 540 if (pos != -1 && (result == -1 || pos < result)) { 541 result = pos; 542 } 543 } 544 return result; 545 } 546 547 //----------------------------------------------------------------------- 548 // boilerplate 549 //----------------------------------------------------------------------- 550 551 /** 552 * Tests two rules for equality. 553 * @param that The rule to compare this one against 554 * @return True is the two rules are functionally equivalent 555 */ 556 UBool 557 NFRule::operator==(const NFRule& rhs) const 558 { 559 return baseValue == rhs.baseValue 560 && radix == rhs.radix 561 && exponent == rhs.exponent 562 && ruleText == rhs.ruleText 563 && *sub1 == *rhs.sub1 564 && *sub2 == *rhs.sub2; 565 } 566 567 /** 568 * Returns a textual representation of the rule. This won't 569 * necessarily be the same as the description that this rule 570 * was created with, but it will produce the same result. 571 * @return A textual description of the rule 572 */ 573 static void util_append64(UnicodeString& result, int64_t n) 574 { 575 UChar buffer[256]; 576 int32_t len = util64_tou(n, buffer, sizeof(buffer)); 577 UnicodeString temp(buffer, len); 578 result.append(temp); 579 } 580 581 void 582 NFRule::_appendRuleText(UnicodeString& result) const 583 { 584 switch (getType()) { 585 case kNegativeNumberRule: result.append(gMinusX); break; 586 case kImproperFractionRule: result.append(gXDotX); break; 587 case kProperFractionRule: result.append(gZeroDotX); break; 588 case kMasterRule: result.append(gXDotZero); break; 589 default: 590 // for a normal rule, write out its base value, and if the radix is 591 // something other than 10, write out the radix (with the preceding 592 // slash, of course). Then calculate the expected exponent and if 593 // if isn't the same as the actual exponent, write an appropriate 594 // number of > signs. Finally, terminate the whole thing with 595 // a colon. 596 util_append64(result, baseValue); 597 if (radix != 10) { 598 result.append(gSlash); 599 util_append64(result, radix); 600 } 601 int numCarets = expectedExponent() - exponent; 602 for (int i = 0; i < numCarets; i++) { 603 result.append(gGreaterThan); 604 } 605 break; 606 } 607 result.append(gColon); 608 result.append(gSpace); 609 610 // if the rule text begins with a space, write an apostrophe 611 // (whitespace after the rule descriptor is ignored; the 612 // apostrophe is used to make the whitespace significant) 613 if (ruleText.startsWith(gSpace) && sub1->getPos() != 0) { 614 result.append(gTick); 615 } 616 617 // now, write the rule's rule text, inserting appropriate 618 // substitution tokens in the appropriate places 619 UnicodeString ruleTextCopy; 620 ruleTextCopy.setTo(ruleText); 621 622 UnicodeString temp; 623 sub2->toString(temp); 624 ruleTextCopy.insert(sub2->getPos(), temp); 625 sub1->toString(temp); 626 ruleTextCopy.insert(sub1->getPos(), temp); 627 628 result.append(ruleTextCopy); 629 630 // and finally, top the whole thing off with a semicolon and 631 // return the result 632 result.append(gSemicolon); 633 } 634 635 //----------------------------------------------------------------------- 636 // formatting 637 //----------------------------------------------------------------------- 638 639 /** 640 * Formats the number, and inserts the resulting text into 641 * toInsertInto. 642 * @param number The number being formatted 643 * @param toInsertInto The string where the resultant text should 644 * be inserted 645 * @param pos The position in toInsertInto where the resultant text 646 * should be inserted 647 */ 648 void 649 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const 650 { 651 // first, insert the rule's rule text into toInsertInto at the 652 // specified position, then insert the results of the substitutions 653 // into the right places in toInsertInto (notice we do the 654 // substitutions in reverse order so that the offsets don't get 655 // messed up) 656 toInsertInto.insert(pos, ruleText); 657 sub2->doSubstitution(number, toInsertInto, pos); 658 sub1->doSubstitution(number, toInsertInto, pos); 659 } 660 661 /** 662 * Formats the number, and inserts the resulting text into 663 * toInsertInto. 664 * @param number The number being formatted 665 * @param toInsertInto The string where the resultant text should 666 * be inserted 667 * @param pos The position in toInsertInto where the resultant text 668 * should be inserted 669 */ 670 void 671 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const 672 { 673 // first, insert the rule's rule text into toInsertInto at the 674 // specified position, then insert the results of the substitutions 675 // into the right places in toInsertInto 676 // [again, we have two copies of this routine that do the same thing 677 // so that we don't sacrifice precision in a long by casting it 678 // to a double] 679 toInsertInto.insert(pos, ruleText); 680 sub2->doSubstitution(number, toInsertInto, pos); 681 sub1->doSubstitution(number, toInsertInto, pos); 682 } 683 684 /** 685 * Used by the owning rule set to determine whether to invoke the 686 * rollback rule (i.e., whether this rule or the one that precedes 687 * it in the rule set's list should be used to format the number) 688 * @param The number being formatted 689 * @return True if the rule set should use the rule that precedes 690 * this one in its list; false if it should use this rule 691 */ 692 UBool 693 NFRule::shouldRollBack(double number) const 694 { 695 // we roll back if the rule contains a modulus substitution, 696 // the number being formatted is an even multiple of the rule's 697 // divisor, and the rule's base value is NOT an even multiple 698 // of its divisor 699 // In other words, if the original description had 700 // 100: << hundred[ >>]; 701 // that expands into 702 // 100: << hundred; 703 // 101: << hundred >>; 704 // internally. But when we're formatting 200, if we use the rule 705 // at 101, which would normally apply, we get "two hundred zero". 706 // To prevent this, we roll back and use the rule at 100 instead. 707 // This is the logic that makes this happen: the rule at 101 has 708 // a modulus substitution, its base value isn't an even multiple 709 // of 100, and the value we're trying to format _is_ an even 710 // multiple of 100. This is called the "rollback rule." 711 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { 712 int64_t re = util64_pow(radix, exponent); 713 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; 714 } 715 return FALSE; 716 } 717 718 //----------------------------------------------------------------------- 719 // parsing 720 //----------------------------------------------------------------------- 721 722 /** 723 * Attempts to parse the string with this rule. 724 * @param text The string being parsed 725 * @param parsePosition On entry, the value is ignored and assumed to 726 * be 0. On exit, this has been updated with the position of the first 727 * character not consumed by matching the text against this rule 728 * (if this rule doesn't match the text at all, the parse position 729 * if left unchanged (presumably at 0) and the function returns 730 * new Long(0)). 731 * @param isFractionRule True if this rule is contained within a 732 * fraction rule set. This is only used if the rule has no 733 * substitutions. 734 * @return If this rule matched the text, this is the rule's base value 735 * combined appropriately with the results of parsing the substitutions. 736 * If nothing matched, this is new Long(0) and the parse position is 737 * left unchanged. The result will be an instance of Long if the 738 * result is an integer and Double otherwise. The result is never null. 739 */ 740 #ifdef RBNF_DEBUG 741 #include <stdio.h> 742 743 static void dumpUS(FILE* f, const UnicodeString& us) { 744 int len = us.length(); 745 char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1]; 746 if (buf != NULL) { 747 us.extract(0, len, buf); 748 buf[len] = 0; 749 fprintf(f, "%s", buf); 750 uprv_free(buf); //delete[] buf; 751 } 752 } 753 #endif 754 755 UBool 756 NFRule::doParse(const UnicodeString& text, 757 ParsePosition& parsePosition, 758 UBool isFractionRule, 759 double upperBound, 760 Formattable& resVal) const 761 { 762 // internally we operate on a copy of the string being parsed 763 // (because we're going to change it) and use our own ParsePosition 764 ParsePosition pp; 765 UnicodeString workText(text); 766 767 // check to see whether the text before the first substitution 768 // matches the text at the beginning of the string being 769 // parsed. If it does, strip that off the front of workText; 770 // otherwise, dump out with a mismatch 771 UnicodeString prefix; 772 prefix.setTo(ruleText, 0, sub1->getPos()); 773 774 #ifdef RBNF_DEBUG 775 fprintf(stderr, "doParse %x ", this); 776 { 777 UnicodeString rt; 778 _appendRuleText(rt); 779 dumpUS(stderr, rt); 780 } 781 782 fprintf(stderr, " text: '", this); 783 dumpUS(stderr, text); 784 fprintf(stderr, "' prefix: '"); 785 dumpUS(stderr, prefix); 786 #endif 787 stripPrefix(workText, prefix, pp); 788 int32_t prefixLength = text.length() - workText.length(); 789 790 #ifdef RBNF_DEBUG 791 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); 792 #endif 793 794 if (pp.getIndex() == 0 && sub1->getPos() != 0) { 795 // commented out because ParsePosition doesn't have error index in 1.1.x 796 // restored for ICU4C port 797 parsePosition.setErrorIndex(pp.getErrorIndex()); 798 resVal.setLong(0); 799 return TRUE; 800 } 801 802 // this is the fun part. The basic guts of the rule-matching 803 // logic is matchToDelimiter(), which is called twice. The first 804 // time it searches the input string for the rule text BETWEEN 805 // the substitutions and tries to match the intervening text 806 // in the input string with the first substitution. If that 807 // succeeds, it then calls it again, this time to look for the 808 // rule text after the second substitution and to match the 809 // intervening input text against the second substitution. 810 // 811 // For example, say we have a rule that looks like this: 812 // first << middle >> last; 813 // and input text that looks like this: 814 // first one middle two last 815 // First we use stripPrefix() to match "first " in both places and 816 // strip it off the front, leaving 817 // one middle two last 818 // Then we use matchToDelimiter() to match " middle " and try to 819 // match "one" against a substitution. If it's successful, we now 820 // have 821 // two last 822 // We use matchToDelimiter() a second time to match " last" and 823 // try to match "two" against a substitution. If "two" matches 824 // the substitution, we have a successful parse. 825 // 826 // Since it's possible in many cases to find multiple instances 827 // of each of these pieces of rule text in the input string, 828 // we need to try all the possible combinations of these 829 // locations. This prevents us from prematurely declaring a mismatch, 830 // and makes sure we match as much input text as we can. 831 int highWaterMark = 0; 832 double result = 0; 833 int start = 0; 834 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); 835 836 UnicodeString temp; 837 do { 838 // our partial parse result starts out as this rule's base 839 // value. If it finds a successful match, matchToDelimiter() 840 // will compose this in some way with what it gets back from 841 // the substitution, giving us a new partial parse result 842 pp.setIndex(0); 843 844 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); 845 double partialResult = matchToDelimiter(workText, start, tempBaseValue, 846 temp, pp, sub1, 847 upperBound); 848 849 // if we got a successful match (or were trying to match a 850 // null substitution), pp is now pointing at the first unmatched 851 // character. Take note of that, and try matchToDelimiter() 852 // on the input text again 853 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { 854 start = pp.getIndex(); 855 856 UnicodeString workText2; 857 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); 858 ParsePosition pp2; 859 860 // the second matchToDelimiter() will compose our previous 861 // partial result with whatever it gets back from its 862 // substitution if there's a successful match, giving us 863 // a real result 864 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); 865 partialResult = matchToDelimiter(workText2, 0, partialResult, 866 temp, pp2, sub2, 867 upperBound); 868 869 // if we got a successful match on this second 870 // matchToDelimiter() call, update the high-water mark 871 // and result (if necessary) 872 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { 873 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { 874 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); 875 result = partialResult; 876 } 877 } 878 // commented out because ParsePosition doesn't have error index in 1.1.x 879 // restored for ICU4C port 880 else { 881 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); 882 if (temp> parsePosition.getErrorIndex()) { 883 parsePosition.setErrorIndex(temp); 884 } 885 } 886 } 887 // commented out because ParsePosition doesn't have error index in 1.1.x 888 // restored for ICU4C port 889 else { 890 int32_t temp = sub1->getPos() + pp.getErrorIndex(); 891 if (temp > parsePosition.getErrorIndex()) { 892 parsePosition.setErrorIndex(temp); 893 } 894 } 895 // keep trying to match things until the outer matchToDelimiter() 896 // call fails to make a match (each time, it picks up where it 897 // left off the previous time) 898 } while (sub1->getPos() != sub2->getPos() 899 && pp.getIndex() > 0 900 && pp.getIndex() < workText.length() 901 && pp.getIndex() != start); 902 903 // update the caller's ParsePosition with our high-water mark 904 // (i.e., it now points at the first character this function 905 // didn't match-- the ParsePosition is therefore unchanged if 906 // we didn't match anything) 907 parsePosition.setIndex(highWaterMark); 908 // commented out because ParsePosition doesn't have error index in 1.1.x 909 // restored for ICU4C port 910 if (highWaterMark > 0) { 911 parsePosition.setErrorIndex(0); 912 } 913 914 // this is a hack for one unusual condition: Normally, whether this 915 // rule belong to a fraction rule set or not is handled by its 916 // substitutions. But if that rule HAS NO substitutions, then 917 // we have to account for it here. By definition, if the matching 918 // rule in a fraction rule set has no substitutions, its numerator 919 // is 1, and so the result is the reciprocal of its base value. 920 if (isFractionRule && 921 highWaterMark > 0 && 922 sub1->isNullSubstitution()) { 923 result = 1 / result; 924 } 925 926 resVal.setDouble(result); 927 return TRUE; // ??? do we need to worry if it is a long or a double? 928 } 929 930 /** 931 * This function is used by parse() to match the text being parsed 932 * against a possible prefix string. This function 933 * matches characters from the beginning of the string being parsed 934 * to characters from the prospective prefix. If they match, pp is 935 * updated to the first character not matched, and the result is 936 * the unparsed part of the string. If they don't match, the whole 937 * string is returned, and pp is left unchanged. 938 * @param text The string being parsed 939 * @param prefix The text to match against 940 * @param pp On entry, ignored and assumed to be 0. On exit, points 941 * to the first unmatched character (assuming the whole prefix matched), 942 * or is unchanged (if the whole prefix didn't match). 943 * @return If things match, this is the unparsed part of "text"; 944 * if they didn't match, this is "text". 945 */ 946 void 947 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const 948 { 949 // if the prefix text is empty, dump out without doing anything 950 if (prefix.length() != 0) { 951 UErrorCode status = U_ZERO_ERROR; 952 // use prefixLength() to match the beginning of 953 // "text" against "prefix". This function returns the 954 // number of characters from "text" that matched (or 0 if 955 // we didn't match the whole prefix) 956 int32_t pfl = prefixLength(text, prefix, status); 957 if (U_FAILURE(status)) { // Memory allocation error. 958 return; 959 } 960 if (pfl != 0) { 961 // if we got a successful match, update the parse position 962 // and strip the prefix off of "text" 963 pp.setIndex(pp.getIndex() + pfl); 964 text.remove(0, pfl); 965 } 966 } 967 } 968 969 /** 970 * Used by parse() to match a substitution and any following text. 971 * "text" is searched for instances of "delimiter". For each instance 972 * of delimiter, the intervening text is tested to see whether it 973 * matches the substitution. The longest match wins. 974 * @param text The string being parsed 975 * @param startPos The position in "text" where we should start looking 976 * for "delimiter". 977 * @param baseValue A partial parse result (often the rule's base value), 978 * which is combined with the result from matching the substitution 979 * @param delimiter The string to search "text" for. 980 * @param pp Ignored and presumed to be 0 on entry. If there's a match, 981 * on exit this will point to the first unmatched character. 982 * @param sub If we find "delimiter" in "text", this substitution is used 983 * to match the text between the beginning of the string and the 984 * position of "delimiter." (If "delimiter" is the empty string, then 985 * this function just matches against this substitution and updates 986 * everything accordingly.) 987 * @param upperBound When matching the substitution, it will only 988 * consider rules with base values lower than this value. 989 * @return If there's a match, this is the result of composing 990 * baseValue with the result of matching the substitution. Otherwise, 991 * this is new Long(0). It's never null. If the result is an integer, 992 * this will be an instance of Long; otherwise, it's an instance of 993 * Double. 994 * 995 * !!! note {dlf} in point of fact, in the java code the caller always converts 996 * the result to a double, so we might as well return one. 997 */ 998 double 999 NFRule::matchToDelimiter(const UnicodeString& text, 1000 int32_t startPos, 1001 double _baseValue, 1002 const UnicodeString& delimiter, 1003 ParsePosition& pp, 1004 const NFSubstitution* sub, 1005 double upperBound) const 1006 { 1007 UErrorCode status = U_ZERO_ERROR; 1008 // if "delimiter" contains real (i.e., non-ignorable) text, search 1009 // it for "delimiter" beginning at "start". If that succeeds, then 1010 // use "sub"'s doParse() method to match the text before the 1011 // instance of "delimiter" we just found. 1012 if (!allIgnorable(delimiter, status)) { 1013 if (U_FAILURE(status)) { //Memory allocation error. 1014 return 0; 1015 } 1016 ParsePosition tempPP; 1017 Formattable result; 1018 1019 // use findText() to search for "delimiter". It returns a two- 1020 // element array: element 0 is the position of the match, and 1021 // element 1 is the number of characters that matched 1022 // "delimiter". 1023 int32_t dLen; 1024 int32_t dPos = findText(text, delimiter, startPos, &dLen); 1025 1026 // if findText() succeeded, isolate the text preceding the 1027 // match, and use "sub" to match that text 1028 while (dPos >= 0) { 1029 UnicodeString subText; 1030 subText.setTo(text, 0, dPos); 1031 if (subText.length() > 0) { 1032 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, 1033 #if UCONFIG_NO_COLLATION 1034 FALSE, 1035 #else 1036 formatter->isLenient(), 1037 #endif 1038 result); 1039 1040 // if the substitution could match all the text up to 1041 // where we found "delimiter", then this function has 1042 // a successful match. Bump the caller's parse position 1043 // to point to the first character after the text 1044 // that matches "delimiter", and return the result 1045 // we got from parsing the substitution. 1046 if (success && tempPP.getIndex() == dPos) { 1047 pp.setIndex(dPos + dLen); 1048 return result.getDouble(); 1049 } 1050 // commented out because ParsePosition doesn't have error index in 1.1.x 1051 // restored for ICU4C port 1052 else { 1053 if (tempPP.getErrorIndex() > 0) { 1054 pp.setErrorIndex(tempPP.getErrorIndex()); 1055 } else { 1056 pp.setErrorIndex(tempPP.getIndex()); 1057 } 1058 } 1059 } 1060 1061 // if we didn't match the substitution, search for another 1062 // copy of "delimiter" in "text" and repeat the loop if 1063 // we find it 1064 tempPP.setIndex(0); 1065 dPos = findText(text, delimiter, dPos + dLen, &dLen); 1066 } 1067 // if we make it here, this was an unsuccessful match, and we 1068 // leave pp unchanged and return 0 1069 pp.setIndex(0); 1070 return 0; 1071 1072 // if "delimiter" is empty, or consists only of ignorable characters 1073 // (i.e., is semantically empty), thwe we obviously can't search 1074 // for "delimiter". Instead, just use "sub" to parse as much of 1075 // "text" as possible. 1076 } else { 1077 ParsePosition tempPP; 1078 Formattable result; 1079 1080 // try to match the whole string against the substitution 1081 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, 1082 #if UCONFIG_NO_COLLATION 1083 FALSE, 1084 #else 1085 formatter->isLenient(), 1086 #endif 1087 result); 1088 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { 1089 // if there's a successful match (or it's a null 1090 // substitution), update pp to point to the first 1091 // character we didn't match, and pass the result from 1092 // sub.doParse() on through to the caller 1093 pp.setIndex(tempPP.getIndex()); 1094 return result.getDouble(); 1095 } 1096 // commented out because ParsePosition doesn't have error index in 1.1.x 1097 // restored for ICU4C port 1098 else { 1099 pp.setErrorIndex(tempPP.getErrorIndex()); 1100 } 1101 1102 // and if we get to here, then nothing matched, so we return 1103 // 0 and leave pp alone 1104 return 0; 1105 } 1106 } 1107 1108 /** 1109 * Used by stripPrefix() to match characters. If lenient parse mode 1110 * is off, this just calls startsWith(). If lenient parse mode is on, 1111 * this function uses CollationElementIterators to match characters in 1112 * the strings (only primary-order differences are significant in 1113 * determining whether there's a match). 1114 * @param str The string being tested 1115 * @param prefix The text we're hoping to see at the beginning 1116 * of "str" 1117 * @return If "prefix" is found at the beginning of "str", this 1118 * is the number of characters in "str" that were matched (this 1119 * isn't necessarily the same as the length of "prefix" when matching 1120 * text with a collator). If there's no match, this is 0. 1121 */ 1122 int32_t 1123 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const 1124 { 1125 // if we're looking for an empty prefix, it obviously matches 1126 // zero characters. Just go ahead and return 0. 1127 if (prefix.length() == 0) { 1128 return 0; 1129 } 1130 1131 #if !UCONFIG_NO_COLLATION 1132 // go through all this grief if we're in lenient-parse mode 1133 if (formatter->isLenient()) { 1134 // get the formatter's collator and use it to create two 1135 // collation element iterators, one over the target string 1136 // and another over the prefix (right now, we'll throw an 1137 // exception if the collator we get back from the formatter 1138 // isn't a RuleBasedCollator, because RuleBasedCollator defines 1139 // the CollationElementIterator protocol. Hopefully, this 1140 // will change someday.) 1141 RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator(); 1142 CollationElementIterator* strIter = collator->createCollationElementIterator(str); 1143 CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); 1144 // Check for memory allocation error. 1145 if (collator == NULL || strIter == NULL || prefixIter == NULL) { 1146 delete collator; 1147 delete strIter; 1148 delete prefixIter; 1149 status = U_MEMORY_ALLOCATION_ERROR; 1150 return 0; 1151 } 1152 1153 UErrorCode err = U_ZERO_ERROR; 1154 1155 // The original code was problematic. Consider this match: 1156 // prefix = "fifty-" 1157 // string = " fifty-7" 1158 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 1159 // in the string. Unfortunately, we were getting a match, and then computing where 1160 // the match terminated by rematching the string. The rematch code was using as an 1161 // initial guess the substring of string between 0 and prefix.length. Because of 1162 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving 1163 // the position before the hyphen in the string. Recursing down, we then parsed the 1164 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). 1165 // This was not pretty, especially since the string "fifty-7" parsed just fine. 1166 // 1167 // We have newer APIs now, so we can use calls on the iterator to determine what we 1168 // matched up to. If we terminate because we hit the last element in the string, 1169 // our match terminates at this length. If we terminate because we hit the last element 1170 // in the target, our match terminates at one before the element iterator position. 1171 1172 // match collation elements between the strings 1173 int32_t oStr = strIter->next(err); 1174 int32_t oPrefix = prefixIter->next(err); 1175 1176 while (oPrefix != CollationElementIterator::NULLORDER) { 1177 // skip over ignorable characters in the target string 1178 while (CollationElementIterator::primaryOrder(oStr) == 0 1179 && oStr != CollationElementIterator::NULLORDER) { 1180 oStr = strIter->next(err); 1181 } 1182 1183 // skip over ignorable characters in the prefix 1184 while (CollationElementIterator::primaryOrder(oPrefix) == 0 1185 && oPrefix != CollationElementIterator::NULLORDER) { 1186 oPrefix = prefixIter->next(err); 1187 } 1188 1189 // dlf: move this above following test, if we consume the 1190 // entire target, aren't we ok even if the source was also 1191 // entirely consumed? 1192 1193 // if skipping over ignorables brought to the end of 1194 // the prefix, we DID match: drop out of the loop 1195 if (oPrefix == CollationElementIterator::NULLORDER) { 1196 break; 1197 } 1198 1199 // if skipping over ignorables brought us to the end 1200 // of the target string, we didn't match and return 0 1201 if (oStr == CollationElementIterator::NULLORDER) { 1202 delete prefixIter; 1203 delete strIter; 1204 return 0; 1205 } 1206 1207 // match collation elements from the two strings 1208 // (considering only primary differences). If we 1209 // get a mismatch, dump out and return 0 1210 if (CollationElementIterator::primaryOrder(oStr) 1211 != CollationElementIterator::primaryOrder(oPrefix)) { 1212 delete prefixIter; 1213 delete strIter; 1214 return 0; 1215 1216 // otherwise, advance to the next character in each string 1217 // and loop (we drop out of the loop when we exhaust 1218 // collation elements in the prefix) 1219 } else { 1220 oStr = strIter->next(err); 1221 oPrefix = prefixIter->next(err); 1222 } 1223 } 1224 1225 int32_t result = strIter->getOffset(); 1226 if (oStr != CollationElementIterator::NULLORDER) { 1227 --result; // back over character that we don't want to consume; 1228 } 1229 1230 #ifdef RBNF_DEBUG 1231 fprintf(stderr, "prefix length: %d\n", result); 1232 #endif 1233 delete prefixIter; 1234 delete strIter; 1235 1236 return result; 1237 #if 0 1238 //---------------------------------------------------------------- 1239 // JDK 1.2-specific API call 1240 // return strIter.getOffset(); 1241 //---------------------------------------------------------------- 1242 // JDK 1.1 HACK (take out for 1.2-specific code) 1243 1244 // if we make it to here, we have a successful match. Now we 1245 // have to find out HOW MANY characters from the target string 1246 // matched the prefix (there isn't necessarily a one-to-one 1247 // mapping between collation elements and characters). 1248 // In JDK 1.2, there's a simple getOffset() call we can use. 1249 // In JDK 1.1, on the other hand, we have to go through some 1250 // ugly contortions. First, use the collator to compare the 1251 // same number of characters from the prefix and target string. 1252 // If they're equal, we're done. 1253 collator->setStrength(Collator::PRIMARY); 1254 if (str.length() >= prefix.length()) { 1255 UnicodeString temp; 1256 temp.setTo(str, 0, prefix.length()); 1257 if (collator->equals(temp, prefix)) { 1258 #ifdef RBNF_DEBUG 1259 fprintf(stderr, "returning: %d\n", prefix.length()); 1260 #endif 1261 return prefix.length(); 1262 } 1263 } 1264 1265 // if they're not equal, then we have to compare successively 1266 // larger and larger substrings of the target string until we 1267 // get to one that matches the prefix. At that point, we know 1268 // how many characters matched the prefix, and we can return. 1269 int32_t p = 1; 1270 while (p <= str.length()) { 1271 UnicodeString temp; 1272 temp.setTo(str, 0, p); 1273 if (collator->equals(temp, prefix)) { 1274 return p; 1275 } else { 1276 ++p; 1277 } 1278 } 1279 1280 // SHOULD NEVER GET HERE!!! 1281 return 0; 1282 //---------------------------------------------------------------- 1283 #endif 1284 1285 // If lenient parsing is turned off, forget all that crap above. 1286 // Just use String.startsWith() and be done with it. 1287 } else 1288 #endif 1289 { 1290 if (str.startsWith(prefix)) { 1291 return prefix.length(); 1292 } else { 1293 return 0; 1294 } 1295 } 1296 } 1297 1298 /** 1299 * Searches a string for another string. If lenient parsing is off, 1300 * this just calls indexOf(). If lenient parsing is on, this function 1301 * uses CollationElementIterator to match characters, and only 1302 * primary-order differences are significant in determining whether 1303 * there's a match. 1304 * @param str The string to search 1305 * @param key The string to search "str" for 1306 * @param startingAt The index into "str" where the search is to 1307 * begin 1308 * @return A two-element array of ints. Element 0 is the position 1309 * of the match, or -1 if there was no match. Element 1 is the 1310 * number of characters in "str" that matched (which isn't necessarily 1311 * the same as the length of "key") 1312 */ 1313 int32_t 1314 NFRule::findText(const UnicodeString& str, 1315 const UnicodeString& key, 1316 int32_t startingAt, 1317 int32_t* length) const 1318 { 1319 #if !UCONFIG_NO_COLLATION 1320 // if lenient parsing is turned off, this is easy: just call 1321 // String.indexOf() and we're done 1322 if (!formatter->isLenient()) { 1323 *length = key.length(); 1324 return str.indexOf(key, startingAt); 1325 1326 // but if lenient parsing is turned ON, we've got some work 1327 // ahead of us 1328 } else 1329 #endif 1330 { 1331 //---------------------------------------------------------------- 1332 // JDK 1.1 HACK (take out of 1.2-specific code) 1333 1334 // in JDK 1.2, CollationElementIterator provides us with an 1335 // API to map between character offsets and collation elements 1336 // and we can do this by marching through the string comparing 1337 // collation elements. We can't do that in JDK 1.1. Insted, 1338 // we have to go through this horrible slow mess: 1339 int32_t p = startingAt; 1340 int32_t keyLen = 0; 1341 1342 // basically just isolate smaller and smaller substrings of 1343 // the target string (each running to the end of the string, 1344 // and with the first one running from startingAt to the end) 1345 // and then use prefixLength() to see if the search key is at 1346 // the beginning of each substring. This is excruciatingly 1347 // slow, but it will locate the key and tell use how long the 1348 // matching text was. 1349 UnicodeString temp; 1350 UErrorCode status = U_ZERO_ERROR; 1351 while (p < str.length() && keyLen == 0) { 1352 temp.setTo(str, p, str.length() - p); 1353 keyLen = prefixLength(temp, key, status); 1354 if (U_FAILURE(status)) { 1355 break; 1356 } 1357 if (keyLen != 0) { 1358 *length = keyLen; 1359 return p; 1360 } 1361 ++p; 1362 } 1363 // if we make it to here, we didn't find it. Return -1 for the 1364 // location. The length should be ignored, but set it to 0, 1365 // which should be "safe" 1366 *length = 0; 1367 return -1; 1368 1369 //---------------------------------------------------------------- 1370 // JDK 1.2 version of this routine 1371 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); 1372 // 1373 //CollationElementIterator strIter = collator.getCollationElementIterator(str); 1374 //CollationElementIterator keyIter = collator.getCollationElementIterator(key); 1375 // 1376 //int keyStart = -1; 1377 // 1378 //str.setOffset(startingAt); 1379 // 1380 //int oStr = strIter.next(); 1381 //int oKey = keyIter.next(); 1382 //while (oKey != CollationElementIterator.NULLORDER) { 1383 // while (oStr != CollationElementIterator.NULLORDER && 1384 // CollationElementIterator.primaryOrder(oStr) == 0) 1385 // oStr = strIter.next(); 1386 // 1387 // while (oKey != CollationElementIterator.NULLORDER && 1388 // CollationElementIterator.primaryOrder(oKey) == 0) 1389 // oKey = keyIter.next(); 1390 // 1391 // if (oStr == CollationElementIterator.NULLORDER) { 1392 // return new int[] { -1, 0 }; 1393 // } 1394 // 1395 // if (oKey == CollationElementIterator.NULLORDER) { 1396 // break; 1397 // } 1398 // 1399 // if (CollationElementIterator.primaryOrder(oStr) == 1400 // CollationElementIterator.primaryOrder(oKey)) { 1401 // keyStart = strIter.getOffset(); 1402 // oStr = strIter.next(); 1403 // oKey = keyIter.next(); 1404 // } else { 1405 // if (keyStart != -1) { 1406 // keyStart = -1; 1407 // keyIter.reset(); 1408 // } else { 1409 // oStr = strIter.next(); 1410 // } 1411 // } 1412 //} 1413 // 1414 //if (oKey == CollationElementIterator.NULLORDER) { 1415 // return new int[] { keyStart, strIter.getOffset() - keyStart }; 1416 //} else { 1417 // return new int[] { -1, 0 }; 1418 //} 1419 } 1420 } 1421 1422 /** 1423 * Checks to see whether a string consists entirely of ignorable 1424 * characters. 1425 * @param str The string to test. 1426 * @return true if the string is empty of consists entirely of 1427 * characters that the number formatter's collator says are 1428 * ignorable at the primary-order level. false otherwise. 1429 */ 1430 UBool 1431 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const 1432 { 1433 // if the string is empty, we can just return true 1434 if (str.length() == 0) { 1435 return TRUE; 1436 } 1437 1438 #if !UCONFIG_NO_COLLATION 1439 // if lenient parsing is turned on, walk through the string with 1440 // a collation element iterator and make sure each collation 1441 // element is 0 (ignorable) at the primary level 1442 if (formatter->isLenient()) { 1443 RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator()); 1444 CollationElementIterator* iter = collator->createCollationElementIterator(str); 1445 1446 // Memory allocation error check. 1447 if (collator == NULL || iter == NULL) { 1448 delete collator; 1449 delete iter; 1450 status = U_MEMORY_ALLOCATION_ERROR; 1451 return FALSE; 1452 } 1453 1454 UErrorCode err = U_ZERO_ERROR; 1455 int32_t o = iter->next(err); 1456 while (o != CollationElementIterator::NULLORDER 1457 && CollationElementIterator::primaryOrder(o) == 0) { 1458 o = iter->next(err); 1459 } 1460 1461 delete iter; 1462 return o == CollationElementIterator::NULLORDER; 1463 } 1464 #endif 1465 1466 // if lenient parsing is turned off, there is no such thing as 1467 // an ignorable character: return true only if the string is empty 1468 return FALSE; 1469 } 1470 1471 U_NAMESPACE_END 1472 1473 /* U_HAVE_RBNF */ 1474 #endif 1475 1476 1477