Home | History | Annotate | Download | only in i18n
      1 /*
      2 ******************************************************************************
      3 *   Copyright (C) 1997-2015, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 ******************************************************************************
      6 *   file name:  nfrule.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 10/11/2001  Doug      Ported from ICU4J
     14 */
     15 
     16 #include "nfrule.h"
     17 
     18 #if U_HAVE_RBNF
     19 
     20 #include "unicode/localpointer.h"
     21 #include "unicode/rbnf.h"
     22 #include "unicode/tblcoll.h"
     23 #include "unicode/plurfmt.h"
     24 #include "unicode/upluralrules.h"
     25 #include "unicode/coleitr.h"
     26 #include "unicode/uchar.h"
     27 #include "nfrs.h"
     28 #include "nfrlist.h"
     29 #include "nfsubs.h"
     30 #include "patternprops.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status)
     35   : baseValue((int32_t)0)
     36   , radix(10)
     37   , exponent(0)
     38   , decimalPoint(0)
     39   , ruleText(_ruleText)
     40   , sub1(NULL)
     41   , sub2(NULL)
     42   , formatter(_rbnf)
     43   , rulePatternFormat(NULL)
     44 {
     45     if (!ruleText.isEmpty()) {
     46         parseRuleDescriptor(ruleText, status);
     47     }
     48 }
     49 
     50 NFRule::~NFRule()
     51 {
     52     if (sub1 != sub2) {
     53         delete sub2;
     54         sub2 = NULL;
     55     }
     56     delete sub1;
     57     sub1 = NULL;
     58     delete rulePatternFormat;
     59     rulePatternFormat = NULL;
     60 }
     61 
// Single UTF-16 code units referenced while parsing and printing rule
// descriptions.  They are spelled as numeric code points rather than as
// character literals.
static const UChar gLeftBracket = 0x005b;   // '['
static const UChar gRightBracket = 0x005d;  // ']'
static const UChar gColon = 0x003a;         // ':'
static const UChar gZero = 0x0030;          // '0'
static const UChar gNine = 0x0039;          // '9'
static const UChar gSpace = 0x0020;         // ' '
static const UChar gSlash = 0x002f;         // '/'
static const UChar gGreaterThan = 0x003e;   // '>'
static const UChar gLessThan = 0x003c;      // '<'
static const UChar gComma = 0x002c;         // ','
static const UChar gDot = 0x002e;           // '.'
static const UChar gTick = 0x0027;          // '\''
//static const UChar gMinus = 0x002d;
static const UChar gSemicolon = 0x003b;     // ';'
static const UChar gX = 0x0078;             // 'x'

// NUL-terminated descriptor tokens for the special (non-numeric) rules.
static const UChar gMinusX[] =                  {0x2D, 0x78, 0};    /* "-x" */
static const UChar gInf[] =                     {0x49, 0x6E, 0x66, 0}; /* "Inf" */
static const UChar gNaN[] =                     {0x4E, 0x61, 0x4E, 0}; /* "NaN" */

// Delimiters searched for by extractSubstitutions() when it looks for a
// plural-format section in the rule text.
static const UChar gDollarOpenParenthesis[] =   {0x24, 0x28, 0}; /* "$(" */
static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */

// NUL-terminated two-character strings that can begin a substitution token.
static const UChar gLessLess[] =                {0x3C, 0x3C, 0};    /* "<<" */
static const UChar gLessPercent[] =             {0x3C, 0x25, 0};    /* "<%" */
static const UChar gLessHash[] =                {0x3C, 0x23, 0};    /* "<#" */
static const UChar gLessZero[] =                {0x3C, 0x30, 0};    /* "<0" */
static const UChar gGreaterGreater[] =          {0x3E, 0x3E, 0};    /* ">>" */
static const UChar gGreaterPercent[] =          {0x3E, 0x25, 0};    /* ">%" */
static const UChar gGreaterHash[] =             {0x3E, 0x23, 0};    /* ">#" */
static const UChar gGreaterZero[] =             {0x3E, 0x30, 0};    /* ">0" */
static const UChar gEqualPercent[] =            {0x3D, 0x25, 0};    /* "=%" */
static const UChar gEqualHash[] =               {0x3D, 0x23, 0};    /* "=#" */
static const UChar gEqualZero[] =               {0x3D, 0x30, 0};    /* "=0" */
// Complete three-character token; extractSubstitution() special-cases it.
static const UChar gGreaterGreaterGreater[] =   {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */

// NULL-terminated list of the token prefixes above; iterated by
// NFRule::indexOfAnyRulePrefix() until the NULL sentinel is reached.
static const UChar * const RULE_PREFIXES[] = {
    gLessLess, gLessPercent, gLessHash, gLessZero,
    gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero,
    gEqualPercent, gEqualHash, gEqualZero, NULL
};
    103 
    104 void
    105 NFRule::makeRules(UnicodeString& description,
    106                   NFRuleSet *owner,
    107                   const NFRule *predecessor,
    108                   const RuleBasedNumberFormat *rbnf,
    109                   NFRuleList& rules,
    110                   UErrorCode& status)
    111 {
    112     // we know we're making at least one rule, so go ahead and
    113     // new it up and initialize its basevalue and divisor
    114     // (this also strips the rule descriptor, if any, off the
    115     // descripton string)
    116     NFRule* rule1 = new NFRule(rbnf, description, status);
    117     /* test for NULL */
    118     if (rule1 == 0) {
    119         status = U_MEMORY_ALLOCATION_ERROR;
    120         return;
    121     }
    122     description = rule1->ruleText;
    123 
    124     // check the description to see whether there's text enclosed
    125     // in brackets
    126     int32_t brack1 = description.indexOf(gLeftBracket);
    127     int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket);
    128 
    129     // if the description doesn't contain a matched pair of brackets,
    130     // or if it's of a type that doesn't recognize bracketed text,
    131     // then leave the description alone, initialize the rule's
    132     // rule text and substitutions, and return that rule
    133     if (brack2 < 0 || brack1 > brack2
    134         || rule1->getType() == kProperFractionRule
    135         || rule1->getType() == kNegativeNumberRule
    136         || rule1->getType() == kInfinityRule
    137         || rule1->getType() == kNaNRule)
    138     {
    139         rule1->extractSubstitutions(owner, description, predecessor, status);
    140     }
    141     else {
    142         // if the description does contain a matched pair of brackets,
    143         // then it's really shorthand for two rules (with one exception)
    144         NFRule* rule2 = NULL;
    145         UnicodeString sbuf;
    146 
    147         // we'll actually only split the rule into two rules if its
    148         // base value is an even multiple of its divisor (or it's one
    149         // of the special rules)
    150         if ((rule1->baseValue > 0
    151             && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
    152             || rule1->getType() == kImproperFractionRule
    153             || rule1->getType() == kMasterRule) {
    154 
    155             // if it passes that test, new up the second rule.  If the
    156             // rule set both rules will belong to is a fraction rule
    157             // set, they both have the same base value; otherwise,
    158             // increment the original rule's base value ("rule1" actually
    159             // goes SECOND in the rule set's rule list)
    160             rule2 = new NFRule(rbnf, UnicodeString(), status);
    161             /* test for NULL */
    162             if (rule2 == 0) {
    163                 status = U_MEMORY_ALLOCATION_ERROR;
    164                 return;
    165             }
    166             if (rule1->baseValue >= 0) {
    167                 rule2->baseValue = rule1->baseValue;
    168                 if (!owner->isFractionRuleSet()) {
    169                     ++rule1->baseValue;
    170                 }
    171             }
    172 
    173             // if the description began with "x.x" and contains bracketed
    174             // text, it describes both the improper fraction rule and
    175             // the proper fraction rule
    176             else if (rule1->getType() == kImproperFractionRule) {
    177                 rule2->setType(kProperFractionRule);
    178             }
    179 
    180             // if the description began with "x.0" and contains bracketed
    181             // text, it describes both the master rule and the
    182             // improper fraction rule
    183             else if (rule1->getType() == kMasterRule) {
    184                 rule2->baseValue = rule1->baseValue;
    185                 rule1->setType(kImproperFractionRule);
    186             }
    187 
    188             // both rules have the same radix and exponent (i.e., the
    189             // same divisor)
    190             rule2->radix = rule1->radix;
    191             rule2->exponent = rule1->exponent;
    192 
    193             // rule2's rule text omits the stuff in brackets: initalize
    194             // its rule text and substitutions accordingly
    195             sbuf.append(description, 0, brack1);
    196             if (brack2 + 1 < description.length()) {
    197                 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
    198             }
    199             rule2->extractSubstitutions(owner, sbuf, predecessor, status);
    200         }
    201 
    202         // rule1's text includes the text in the brackets but omits
    203         // the brackets themselves: initialize _its_ rule text and
    204         // substitutions accordingly
    205         sbuf.setTo(description, 0, brack1);
    206         sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
    207         if (brack2 + 1 < description.length()) {
    208             sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
    209         }
    210         rule1->extractSubstitutions(owner, sbuf, predecessor, status);
    211 
    212         // if we only have one rule, return it; if we have two, return
    213         // a two-element array containing them (notice that rule2 goes
    214         // BEFORE rule1 in the list: in all cases, rule2 OMITS the
    215         // material in the brackets and rule1 INCLUDES the material
    216         // in the brackets)
    217         if (rule2 != NULL) {
    218             if (rule2->baseValue >= kNoBase) {
    219                 rules.add(rule2);
    220             }
    221             else {
    222                 owner->setNonNumericalRule(rule2);
    223             }
    224         }
    225     }
    226     if (rule1->baseValue >= kNoBase) {
    227         rules.add(rule1);
    228     }
    229     else {
    230         owner->setNonNumericalRule(rule1);
    231     }
    232 }
    233 
    234 /**
    235  * This function parses the rule's rule descriptor (i.e., the base
    236  * value and/or other tokens that precede the rule's rule text
    237  * in the description) and sets the rule's base value, radix, and
    238  * exponent according to the descriptor.  (If the description doesn't
    239  * include a rule descriptor, then this function sets everything to
    240  * default values and the rule set sets the rule's real base value).
    241  * @param description The rule's description
    242  * @return If "description" included a rule descriptor, this is
    243  * "description" with the descriptor and any trailing whitespace
    244  * stripped off.  Otherwise; it's "descriptor" unchangd.
    245  */
    246 void
    247 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
    248 {
    249     // the description consists of a rule descriptor and a rule body,
    250     // separated by a colon.  The rule descriptor is optional.  If
    251     // it's omitted, just set the base value to 0.
    252     int32_t p = description.indexOf(gColon);
    253     if (p != -1) {
    254         // copy the descriptor out into its own string and strip it,
    255         // along with any trailing whitespace, out of the original
    256         // description
    257         UnicodeString descriptor;
    258         descriptor.setTo(description, 0, p);
    259 
    260         ++p;
    261         while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
    262             ++p;
    263         }
    264         description.removeBetween(0, p);
    265 
    266         // check first to see if the rule descriptor matches the token
    267         // for one of the special rules.  If it does, set the base
    268         // value to the correct identifier value
    269         int descriptorLength = descriptor.length();
    270         UChar firstChar = descriptor.charAt(0);
    271         UChar lastChar = descriptor.charAt(descriptorLength - 1);
    272         if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) {
    273             // if the rule descriptor begins with a digit, it's a descriptor
    274             // for a normal rule
    275             // since we don't have Long.parseLong, and this isn't much work anyway,
    276             // just build up the value as we encounter the digits.
    277             int64_t val = 0;
    278             p = 0;
    279             UChar c = gSpace;
    280 
    281             // begin parsing the descriptor: copy digits
    282             // into "tempValue", skip periods, commas, and spaces,
    283             // stop on a slash or > sign (or at the end of the string),
    284             // and throw an exception on any other character
    285             int64_t ll_10 = 10;
    286             while (p < descriptorLength) {
    287                 c = descriptor.charAt(p);
    288                 if (c >= gZero && c <= gNine) {
    289                     val = val * ll_10 + (int32_t)(c - gZero);
    290                 }
    291                 else if (c == gSlash || c == gGreaterThan) {
    292                     break;
    293                 }
    294                 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
    295                 }
    296                 else {
    297                     // throw new IllegalArgumentException("Illegal character in rule descriptor");
    298                     status = U_PARSE_ERROR;
    299                     return;
    300                 }
    301                 ++p;
    302             }
    303 
    304             // we have the base value, so set it
    305             setBaseValue(val, status);
    306 
    307             // if we stopped the previous loop on a slash, we're
    308             // now parsing the rule's radix.  Again, accumulate digits
    309             // in tempValue, skip punctuation, stop on a > mark, and
    310             // throw an exception on anything else
    311             if (c == gSlash) {
    312                 val = 0;
    313                 ++p;
    314                 int64_t ll_10 = 10;
    315                 while (p < descriptorLength) {
    316                     c = descriptor.charAt(p);
    317                     if (c >= gZero && c <= gNine) {
    318                         val = val * ll_10 + (int32_t)(c - gZero);
    319                     }
    320                     else if (c == gGreaterThan) {
    321                         break;
    322                     }
    323                     else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
    324                     }
    325                     else {
    326                         // throw new IllegalArgumentException("Illegal character is rule descriptor");
    327                         status = U_PARSE_ERROR;
    328                         return;
    329                     }
    330                     ++p;
    331                 }
    332 
    333                 // tempValue now contain's the rule's radix.  Set it
    334                 // accordingly, and recalculate the rule's exponent
    335                 radix = (int32_t)val;
    336                 if (radix == 0) {
    337                     // throw new IllegalArgumentException("Rule can't have radix of 0");
    338                     status = U_PARSE_ERROR;
    339                 }
    340 
    341                 exponent = expectedExponent();
    342             }
    343 
    344             // if we stopped the previous loop on a > sign, then continue
    345             // for as long as we still see > signs.  For each one,
    346             // decrement the exponent (unless the exponent is already 0).
    347             // If we see another character before reaching the end of
    348             // the descriptor, that's also a syntax error.
    349             if (c == gGreaterThan) {
    350                 while (p < descriptor.length()) {
    351                     c = descriptor.charAt(p);
    352                     if (c == gGreaterThan && exponent > 0) {
    353                         --exponent;
    354                     } else {
    355                         // throw new IllegalArgumentException("Illegal character in rule descriptor");
    356                         status = U_PARSE_ERROR;
    357                         return;
    358                     }
    359                     ++p;
    360                 }
    361             }
    362         }
    363         else if (0 == descriptor.compare(gMinusX, 2)) {
    364             setType(kNegativeNumberRule);
    365         }
    366         else if (descriptorLength == 3) {
    367             if (firstChar == gZero && lastChar == gX) {
    368                 setBaseValue(kProperFractionRule, status);
    369                 decimalPoint = descriptor.charAt(1);
    370             }
    371             else if (firstChar == gX && lastChar == gX) {
    372                 setBaseValue(kImproperFractionRule, status);
    373                 decimalPoint = descriptor.charAt(1);
    374             }
    375             else if (firstChar == gX && lastChar == gZero) {
    376                 setBaseValue(kMasterRule, status);
    377                 decimalPoint = descriptor.charAt(1);
    378             }
    379             else if (descriptor.compare(gNaN, 3) == 0) {
    380                 setBaseValue(kNaNRule, status);
    381             }
    382             else if (descriptor.compare(gInf, 3) == 0) {
    383                 setBaseValue(kInfinityRule, status);
    384             }
    385         }
    386     }
    387     // else use the default base value for now.
    388 
    389     // finally, if the rule body begins with an apostrophe, strip it off
    390     // (this is generally used to put whitespace at the beginning of
    391     // a rule's rule text)
    392     if (description.length() > 0 && description.charAt(0) == gTick) {
    393         description.removeBetween(0, 1);
    394     }
    395 
    396     // return the description with all the stuff we've just waded through
    397     // stripped off the front.  It now contains just the rule body.
    398     // return description;
    399 }
    400 
    401 /**
    402 * Searches the rule's rule text for the substitution tokens,
    403 * creates the substitutions, and removes the substitution tokens
    404 * from the rule's rule text.
    405 * @param owner The rule set containing this rule
    406 * @param predecessor The rule preseding this one in "owners" rule list
    407 * @param ownersOwner The RuleBasedFormat that owns this rule
    408 */
    409 void
    410 NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
    411                              const UnicodeString &ruleText,
    412                              const NFRule* predecessor,
    413                              UErrorCode& status)
    414 {
    415     if (U_FAILURE(status)) {
    416         return;
    417     }
    418     this->ruleText = ruleText;
    419     sub1 = extractSubstitution(ruleSet, predecessor, status);
    420     if (sub1 == NULL) {
    421         // Small optimization. There is no need to create a redundant NullSubstitution.
    422         sub2 = NULL;
    423     }
    424     else {
    425         sub2 = extractSubstitution(ruleSet, predecessor, status);
    426     }
    427     int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
    428     int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1);
    429     if (pluralRuleEnd >= 0) {
    430         int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart);
    431         if (endType < 0) {
    432             status = U_PARSE_ERROR;
    433             return;
    434         }
    435         UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2));
    436         UPluralType pluralType;
    437         if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) {
    438             pluralType = UPLURAL_TYPE_CARDINAL;
    439         }
    440         else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) {
    441             pluralType = UPLURAL_TYPE_ORDINAL;
    442         }
    443         else {
    444             status = U_ILLEGAL_ARGUMENT_ERROR;
    445             return;
    446         }
    447         rulePatternFormat = formatter->createPluralFormat(pluralType,
    448                 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status);
    449     }
    450 }
    451 
    452 /**
    453 * Searches the rule's rule text for the first substitution token,
    454 * creates a substitution based on it, and removes the token from
    455 * the rule's rule text.
    456 * @param owner The rule set containing this rule
    457 * @param predecessor The rule preceding this one in the rule set's
    458 * rule list
    459 * @param ownersOwner The RuleBasedNumberFormat that owns this rule
    460 * @return The newly-created substitution.  This is never null; if
    461 * the rule text doesn't contain any substitution tokens, this will
    462 * be a NullSubstitution.
    463 */
    464 NFSubstitution *
    465 NFRule::extractSubstitution(const NFRuleSet* ruleSet,
    466                             const NFRule* predecessor,
    467                             UErrorCode& status)
    468 {
    469     NFSubstitution* result = NULL;
    470 
    471     // search the rule's rule text for the first two characters of
    472     // a substitution token
    473     int32_t subStart = indexOfAnyRulePrefix();
    474     int32_t subEnd = subStart;
    475 
    476     // if we didn't find one, create a null substitution positioned
    477     // at the end of the rule text
    478     if (subStart == -1) {
    479         return NULL;
    480     }
    481 
    482     // special-case the ">>>" token, since searching for the > at the
    483     // end will actually find the > in the middle
    484     if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
    485         subEnd = subStart + 2;
    486 
    487         // otherwise the substitution token ends with the same character
    488         // it began with
    489     } else {
    490         UChar c = ruleText.charAt(subStart);
    491         subEnd = ruleText.indexOf(c, subStart + 1);
    492         // special case for '<%foo<<'
    493         if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) {
    494             // ordinals use "=#,##0==%abbrev=" as their rule.  Notice that the '==' in the middle
    495             // occurs because of the juxtaposition of two different rules.  The check for '<' is a hack
    496             // to get around this.  Having the duplicate at the front would cause problems with
    497             // rules like "<<%" to format, say, percents...
    498             ++subEnd;
    499         }
    500    }
    501 
    502     // if we don't find the end of the token (i.e., if we're on a single,
    503     // unmatched token character), create a null substitution positioned
    504     // at the end of the rule
    505     if (subEnd == -1) {
    506         return NULL;
    507     }
    508 
    509     // if we get here, we have a real substitution token (or at least
    510     // some text bounded by substitution token characters).  Use
    511     // makeSubstitution() to create the right kind of substitution
    512     UnicodeString subToken;
    513     subToken.setTo(ruleText, subStart, subEnd + 1 - subStart);
    514     result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
    515         this->formatter, subToken, status);
    516 
    517     // remove the substitution from the rule text
    518     ruleText.removeBetween(subStart, subEnd+1);
    519 
    520     return result;
    521 }
    522 
    523 /**
    524  * Sets the rule's base value, and causes the radix and exponent
    525  * to be recalculated.  This is used during construction when we
    526  * don't know the rule's base value until after it's been
    527  * constructed.  It should be used at any other time.
    528  * @param The new base value for the rule.
    529  */
    530 void
    531 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
    532 {
    533     // set the base value
    534     baseValue = newBaseValue;
    535     radix = 10;
    536 
    537     // if this isn't a special rule, recalculate the radix and exponent
    538     // (the radix always defaults to 10; if it's supposed to be something
    539     // else, it's cleaned up by the caller and the exponent is
    540     // recalculated again-- the only function that does this is
    541     // NFRule.parseRuleDescriptor() )
    542     if (baseValue >= 1) {
    543         exponent = expectedExponent();
    544 
    545         // this function gets called on a fully-constructed rule whose
    546         // description didn't specify a base value.  This means it
    547         // has substitutions, and some substitutions hold on to copies
    548         // of the rule's divisor.  Fix their copies of the divisor.
    549         if (sub1 != NULL) {
    550             sub1->setDivisor(radix, exponent, status);
    551         }
    552         if (sub2 != NULL) {
    553             sub2->setDivisor(radix, exponent, status);
    554         }
    555 
    556         // if this is a special rule, its radix and exponent are basically
    557         // ignored.  Set them to "safe" default values
    558     } else {
    559         exponent = 0;
    560     }
    561 }
    562 
    563 /**
    564 * This calculates the rule's exponent based on its radix and base
    565 * value.  This will be the highest power the radix can be raised to
    566 * and still produce a result less than or equal to the base value.
    567 */
    568 int16_t
    569 NFRule::expectedExponent() const
    570 {
    571     // since the log of 0, or the log base 0 of something, causes an
    572     // error, declare the exponent in these cases to be 0 (we also
    573     // deal with the special-rule identifiers here)
    574     if (radix == 0 || baseValue < 1) {
    575         return 0;
    576     }
    577 
    578     // we get rounding error in some cases-- for example, log 1000 / log 10
    579     // gives us 1.9999999996 instead of 2.  The extra logic here is to take
    580     // that into account
    581     int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix));
    582     int64_t temp = util64_pow(radix, tempResult + 1);
    583     if (temp <= baseValue) {
    584         tempResult += 1;
    585     }
    586     return tempResult;
    587 }
    588 
    589 /**
    590  * Searches the rule's rule text for any of the specified strings.
    591  * @return The index of the first match in the rule's rule text
    592  * (i.e., the first substring in the rule's rule text that matches
    593  * _any_ of the strings in "strings").  If none of the strings in
    594  * "strings" is found in the rule's rule text, returns -1.
    595  */
    596 int32_t
    597 NFRule::indexOfAnyRulePrefix() const
    598 {
    599     int result = -1;
    600     for (int i = 0; RULE_PREFIXES[i]; i++) {
    601         int32_t pos = ruleText.indexOf(*RULE_PREFIXES[i]);
    602         if (pos != -1 && (result == -1 || pos < result)) {
    603             result = pos;
    604         }
    605     }
    606     return result;
    607 }
    608 
    609 //-----------------------------------------------------------------------
    610 // boilerplate
    611 //-----------------------------------------------------------------------
    612 
    613 static UBool
    614 util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2)
    615 {
    616     if (sub1) {
    617         if (sub2) {
    618             return *sub1 == *sub2;
    619         }
    620     } else if (!sub2) {
    621         return TRUE;
    622     }
    623     return FALSE;
    624 }
    625 
    626 /**
    627 * Tests two rules for equality.
    628 * @param that The rule to compare this one against
    629 * @return True is the two rules are functionally equivalent
    630 */
    631 UBool
    632 NFRule::operator==(const NFRule& rhs) const
    633 {
    634     return baseValue == rhs.baseValue
    635         && radix == rhs.radix
    636         && exponent == rhs.exponent
    637         && ruleText == rhs.ruleText
    638         && util_equalSubstitutions(sub1, rhs.sub1)
    639         && util_equalSubstitutions(sub2, rhs.sub2);
    640 }
    641 
/**
* NFRule::_appendRuleText(), defined after the util_append64() helper
* below, appends a textual representation of the rule to its argument.
* This won't necessarily be the same as the description that this rule
* was created with, but it will produce the same result.
*/
    648 static void util_append64(UnicodeString& result, int64_t n)
    649 {
    650     UChar buffer[256];
    651     int32_t len = util64_tou(n, buffer, sizeof(buffer));
    652     UnicodeString temp(buffer, len);
    653     result.append(temp);
    654 }
    655 
    656 void
    657 NFRule::_appendRuleText(UnicodeString& result) const
    658 {
    659     switch (getType()) {
    660     case kNegativeNumberRule: result.append(gMinusX, 2); break;
    661     case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
    662     case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
    663     case kMasterRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break;
    664     case kInfinityRule: result.append(gInf, 3); break;
    665     case kNaNRule: result.append(gNaN, 3); break;
    666     default:
    667         // for a normal rule, write out its base value, and if the radix is
    668         // something other than 10, write out the radix (with the preceding
    669         // slash, of course).  Then calculate the expected exponent and if
    670         // if isn't the same as the actual exponent, write an appropriate
    671         // number of > signs.  Finally, terminate the whole thing with
    672         // a colon.
    673         util_append64(result, baseValue);
    674         if (radix != 10) {
    675             result.append(gSlash);
    676             util_append64(result, radix);
    677         }
    678         int numCarets = expectedExponent() - exponent;
    679         for (int i = 0; i < numCarets; i++) {
    680             result.append(gGreaterThan);
    681         }
    682         break;
    683     }
    684     result.append(gColon);
    685     result.append(gSpace);
    686 
    687     // if the rule text begins with a space, write an apostrophe
    688     // (whitespace after the rule descriptor is ignored; the
    689     // apostrophe is used to make the whitespace significant)
    690     if (ruleText.charAt(0) == gSpace && (sub1 == NULL || sub1->getPos() != 0)) {
    691         result.append(gTick);
    692     }
    693 
    694     // now, write the rule's rule text, inserting appropriate
    695     // substitution tokens in the appropriate places
    696     UnicodeString ruleTextCopy;
    697     ruleTextCopy.setTo(ruleText);
    698 
    699     UnicodeString temp;
    700     if (sub2 != NULL) {
    701         sub2->toString(temp);
    702         ruleTextCopy.insert(sub2->getPos(), temp);
    703     }
    704     if (sub1 != NULL) {
    705         sub1->toString(temp);
    706         ruleTextCopy.insert(sub1->getPos(), temp);
    707     }
    708 
    709     result.append(ruleTextCopy);
    710 
    711     // and finally, top the whole thing off with a semicolon and
    712     // return the result
    713     result.append(gSemicolon);
    714 }
    715 
    716 //-----------------------------------------------------------------------
    717 // formatting
    718 //-----------------------------------------------------------------------
    719 
    720 /**
    721 * Formats the number, and inserts the resulting text into
    722 * toInsertInto.
    723 * @param number The number being formatted
    724 * @param toInsertInto The string where the resultant text should
    725 * be inserted
    726 * @param pos The position in toInsertInto where the resultant text
    727 * should be inserted
    728 */
    729 void
    730 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
    731 {
    732     // first, insert the rule's rule text into toInsertInto at the
    733     // specified position, then insert the results of the substitutions
    734     // into the right places in toInsertInto (notice we do the
    735     // substitutions in reverse order so that the offsets don't get
    736     // messed up)
    737     int32_t pluralRuleStart = ruleText.length();
    738     int32_t lengthOffset = 0;
    739     if (!rulePatternFormat) {
    740         toInsertInto.insert(pos, ruleText);
    741     }
    742     else {
    743         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
    744         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
    745         int initialLength = toInsertInto.length();
    746         if (pluralRuleEnd < ruleText.length() - 1) {
    747             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
    748         }
    749         toInsertInto.insert(pos,
    750             rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status));
    751         if (pluralRuleStart > 0) {
    752             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
    753         }
    754         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
    755     }
    756 
    757     if (sub2 != NULL) {
    758         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    759     }
    760     if (sub1 != NULL) {
    761         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    762     }
    763 }
    764 
    765 /**
    766 * Formats the number, and inserts the resulting text into
    767 * toInsertInto.
    768 * @param number The number being formatted
    769 * @param toInsertInto The string where the resultant text should
    770 * be inserted
    771 * @param pos The position in toInsertInto where the resultant text
    772 * should be inserted
    773 */
    774 void
    775 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
    776 {
    777     // first, insert the rule's rule text into toInsertInto at the
    778     // specified position, then insert the results of the substitutions
    779     // into the right places in toInsertInto
    780     // [again, we have two copies of this routine that do the same thing
    781     // so that we don't sacrifice precision in a long by casting it
    782     // to a double]
    783     int32_t pluralRuleStart = ruleText.length();
    784     int32_t lengthOffset = 0;
    785     if (!rulePatternFormat) {
    786         toInsertInto.insert(pos, ruleText);
    787     }
    788     else {
    789         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
    790         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
    791         int initialLength = toInsertInto.length();
    792         if (pluralRuleEnd < ruleText.length() - 1) {
    793             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
    794         }
    795         double pluralVal = number;
    796         if (0 <= pluralVal && pluralVal < 1) {
    797             // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
    798             // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
    799             pluralVal = uprv_round(pluralVal * uprv_pow(radix, exponent));
    800         }
    801         else {
    802             pluralVal = pluralVal / uprv_pow(radix, exponent);
    803         }
    804         toInsertInto.insert(pos, rulePatternFormat->format((int32_t)(pluralVal), status));
    805         if (pluralRuleStart > 0) {
    806             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
    807         }
    808         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
    809     }
    810 
    811     if (sub2 != NULL) {
    812         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    813     }
    814     if (sub1 != NULL) {
    815         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    816     }
    817 }
    818 
    819 /**
    820 * Used by the owning rule set to determine whether to invoke the
    821 * rollback rule (i.e., whether this rule or the one that precedes
    822 * it in the rule set's list should be used to format the number)
    823 * @param The number being formatted
    824 * @return True if the rule set should use the rule that precedes
    825 * this one in its list; false if it should use this rule
    826 */
    827 UBool
    828 NFRule::shouldRollBack(double number) const
    829 {
    830     // we roll back if the rule contains a modulus substitution,
    831     // the number being formatted is an even multiple of the rule's
    832     // divisor, and the rule's base value is NOT an even multiple
    833     // of its divisor
    834     // In other words, if the original description had
    835     //    100: << hundred[ >>];
    836     // that expands into
    837     //    100: << hundred;
    838     //    101: << hundred >>;
    839     // internally.  But when we're formatting 200, if we use the rule
    840     // at 101, which would normally apply, we get "two hundred zero".
    841     // To prevent this, we roll back and use the rule at 100 instead.
    842     // This is the logic that makes this happen: the rule at 101 has
    843     // a modulus substitution, its base value isn't an even multiple
    844     // of 100, and the value we're trying to format _is_ an even
    845     // multiple of 100.  This is called the "rollback rule."
    846     if ((sub1 != NULL && sub1->isModulusSubstitution()) || (sub2 != NULL && sub2->isModulusSubstitution())) {
    847         int64_t re = util64_pow(radix, exponent);
    848         return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0;
    849     }
    850     return FALSE;
    851 }
    852 
    853 //-----------------------------------------------------------------------
    854 // parsing
    855 //-----------------------------------------------------------------------
    856 
/**
* Attempts to parse the string with this rule.
* @param text The string being parsed
* @param parsePosition On entry, the value is ignored and assumed to
* be 0. On exit, this has been updated with the position of the first
* character not consumed by matching the text against this rule
* (if this rule doesn't match the text at all, the parse position
* is left unchanged (presumably at 0) and the result is 0).
* @param isFractionRule True if this rule is contained within a
* fraction rule set.  This is only used if the rule has no
* substitutions.
* @param upperBound Handed on to the substitutions when they parse
* their part of the text.
* @param resVal Receives the result.  If this rule matched the text,
* this is the rule's base value combined appropriately with the results
* of parsing the substitutions.  If nothing matched, this is 0 and the
* parse position is left unchanged.  The result is stored as a double,
* except that a prefix mismatch stores a long 0.
* @return Always TRUE; whether anything matched is reported through
* parsePosition and resVal.
*/
    875 #ifdef RBNF_DEBUG
    876 #include <stdio.h>
    877 
    878 static void dumpUS(FILE* f, const UnicodeString& us) {
    879   int len = us.length();
    880   char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1];
    881   if (buf != NULL) {
    882 	  us.extract(0, len, buf);
    883 	  buf[len] = 0;
    884 	  fprintf(f, "%s", buf);
    885 	  uprv_free(buf); //delete[] buf;
    886   }
    887 }
    888 #endif
// Matches "text" against this rule: first the literal text in front of
// the first substitution, then each substitution followed by the literal
// text after it.  Every exit path returns TRUE; the outcome is reported
// through parsePosition (index and error index) and resVal.
UBool
NFRule::doParse(const UnicodeString& text,
                ParsePosition& parsePosition,
                UBool isFractionRule,
                double upperBound,
                Formattable& resVal) const
{
    // internally we operate on a copy of the string being parsed
    // (because we're going to change it) and use our own ParsePosition
    ParsePosition pp;
    UnicodeString workText(text);

    // a missing substitution is treated as sitting at the very end of
    // the rule text
    int32_t sub1Pos = sub1 != NULL ? sub1->getPos() : ruleText.length();
    int32_t sub2Pos = sub2 != NULL ? sub2->getPos() : ruleText.length();

    // check to see whether the text before the first substitution
    // matches the text at the beginning of the string being
    // parsed.  If it does, strip that off the front of workText;
    // otherwise, dump out with a mismatch
    UnicodeString prefix;
    prefix.setTo(ruleText, 0, sub1Pos);

#ifdef RBNF_DEBUG
    fprintf(stderr, "doParse %p ", this);
    {
        UnicodeString rt;
        _appendRuleText(rt);
        dumpUS(stderr, rt);
    }

    fprintf(stderr, " text: '");
    dumpUS(stderr, text);
    fprintf(stderr, "' prefix: '");
    dumpUS(stderr, prefix);
#endif
    stripPrefix(workText, prefix, pp);
    // number of characters of "text" that stripPrefix() removed
    int32_t prefixLength = text.length() - workText.length();

#ifdef RBNF_DEBUG
    fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
#endif

    // pp is still at 0 if stripPrefix() matched nothing; that is a
    // mismatch whenever the rule actually has text before the first
    // substitution (sub1Pos != 0)
    if (pp.getIndex() == 0 && sub1Pos != 0) {
        // report the mismatch to the caller through the error index
        // (this was disabled in the Java original and restored for the
        // ICU4C port)
        parsePosition.setErrorIndex(pp.getErrorIndex());
        resVal.setLong(0);
        return TRUE;
    }
    // NOTE(review): baseValue is compared against rule-type constants
    // here -- presumably special rules keep their type in baseValue;
    // confirm against getType().
    if (baseValue == kInfinityRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getInfinity());
        return TRUE;
    }
    if (baseValue == kNaNRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getNaN());
        return TRUE;
    }

    // this is the fun part.  The basic guts of the rule-matching
    // logic is matchToDelimiter(), which is called twice.  The first
    // time it searches the input string for the rule text BETWEEN
    // the substitutions and tries to match the intervening text
    // in the input string with the first substitution.  If that
    // succeeds, it then calls it again, this time to look for the
    // rule text after the second substitution and to match the
    // intervening input text against the second substitution.
    //
    // For example, say we have a rule that looks like this:
    //    first << middle >> last;
    // and input text that looks like this:
    //    first one middle two last
    // First we use stripPrefix() to match "first " in both places and
    // strip it off the front, leaving
    //    one middle two last
    // Then we use matchToDelimiter() to match " middle " and try to
    // match "one" against a substitution.  If it's successful, we now
    // have
    //    two last
    // We use matchToDelimiter() a second time to match " last" and
    // try to match "two" against a substitution.  If "two" matches
    // the substitution, we have a successful parse.
    //
    // Since it's possible in many cases to find multiple instances
    // of each of these pieces of rule text in the input string,
    // we need to try all the possible combinations of these
    // locations.  This prevents us from prematurely declaring a mismatch,
    // and makes sure we match as much input text as we can.
    int highWaterMark = 0;   // furthest position in "text" matched so far
    double result = 0;       // parse result that goes with highWaterMark
    int start = 0;           // where the first matchToDelimiter() call starts searching
    // a base value of zero or less contributes 0 to the result
    double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);

    UnicodeString temp;
    do {
        // our partial parse result starts out as this rule's base
        // value.  If it finds a successful match, matchToDelimiter()
        // will compose this in some way with what it gets back from
        // the substitution, giving us a new partial parse result
        pp.setIndex(0);

        // the delimiter is the rule text between the two substitutions
        temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos);
        double partialResult = matchToDelimiter(workText, start, tempBaseValue,
            temp, pp, sub1,
            upperBound);

        // if we got a successful match (or were trying to match a
        // null substitution), pp is now pointing at the first unmatched
        // character.  Take note of that, and try matchToDelimiter()
        // on the input text again
        if (pp.getIndex() != 0 || sub1 == NULL) {
            start = pp.getIndex();

            // the second pass works on whatever the first pass left over
            UnicodeString workText2;
            workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
            ParsePosition pp2;

            // the second matchToDelimiter() will compose our previous
            // partial result with whatever it gets back from its
            // substitution if there's a successful match, giving us
            // a real result
            // (the delimiter is now the rule text after the second
            // substitution)
            temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos);
            partialResult = matchToDelimiter(workText2, 0, partialResult,
                temp, pp2, sub2,
                upperBound);

            // if we got a successful match on this second
            // matchToDelimiter() call, update the high-water mark
            // and result (if necessary)
            if (pp2.getIndex() != 0 || sub2 == NULL) {
                if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
                    highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
                    result = partialResult;
                }
            }
            else {
                // second substitution failed: keep the largest error
                // index seen so far in the caller's ParsePosition
                // (note that this int32_t "temp" shadows the
                // UnicodeString "temp" declared above)
                int32_t temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
                if (temp> parsePosition.getErrorIndex()) {
                    parsePosition.setErrorIndex(temp);
                }
            }
        }
        else {
            // first substitution failed: keep the largest error index
            // seen so far in the caller's ParsePosition (again, "temp"
            // here shadows the outer UnicodeString)
            int32_t temp = sub1Pos + pp.getErrorIndex();
            if (temp > parsePosition.getErrorIndex()) {
                parsePosition.setErrorIndex(temp);
            }
        }
        // keep trying to match things until the outer matchToDelimiter()
        // call fails to make a match (each time, it picks up where it
        // left off the previous time)
        // NOTE(review): "start" is assigned pp.getIndex() whenever the
        // first match succeeds, and pp is 0 whenever it fails, so as
        // written the last condition appears to end the loop after a
        // single pass -- confirm that this is intended.
    } while (sub1Pos != sub2Pos
        && pp.getIndex() > 0
        && pp.getIndex() < workText.length()
        && pp.getIndex() != start);

    // update the caller's ParsePosition with our high-water mark
    // (i.e., it now points at the first character this function
    // didn't match-- the ParsePosition is therefore unchanged if
    // we didn't match anything)
    parsePosition.setIndex(highWaterMark);
    // a successful match clears any error index recorded along the way
    if (highWaterMark > 0) {
        parsePosition.setErrorIndex(0);
    }

    // this is a hack for one unusual condition: Normally, whether this
    // rule belong to a fraction rule set or not is handled by its
    // substitutions.  But if that rule HAS NO substitutions, then
    // we have to account for it here.  By definition, if the matching
    // rule in a fraction rule set has no substitutions, its numerator
    // is 1, and so the result is the reciprocal of its base value.
    if (isFractionRule && highWaterMark > 0 && sub1 == NULL) {
        result = 1 / result;
    }

    resVal.setDouble(result);
    return TRUE; // ??? do we need to worry if it is a long or a double?
}
   1076 
   1077 /**
   1078 * This function is used by parse() to match the text being parsed
   1079 * against a possible prefix string.  This function
   1080 * matches characters from the beginning of the string being parsed
   1081 * to characters from the prospective prefix.  If they match, pp is
   1082 * updated to the first character not matched, and the result is
   1083 * the unparsed part of the string.  If they don't match, the whole
   1084 * string is returned, and pp is left unchanged.
   1085 * @param text The string being parsed
   1086 * @param prefix The text to match against
   1087 * @param pp On entry, ignored and assumed to be 0.  On exit, points
   1088 * to the first unmatched character (assuming the whole prefix matched),
   1089 * or is unchanged (if the whole prefix didn't match).
   1090 * @return If things match, this is the unparsed part of "text";
   1091 * if they didn't match, this is "text".
   1092 */
   1093 void
   1094 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
   1095 {
   1096     // if the prefix text is empty, dump out without doing anything
   1097     if (prefix.length() != 0) {
   1098     	UErrorCode status = U_ZERO_ERROR;
   1099         // use prefixLength() to match the beginning of
   1100         // "text" against "prefix".  This function returns the
   1101         // number of characters from "text" that matched (or 0 if
   1102         // we didn't match the whole prefix)
   1103         int32_t pfl = prefixLength(text, prefix, status);
   1104         if (U_FAILURE(status)) { // Memory allocation error.
   1105         	return;
   1106         }
   1107         if (pfl != 0) {
   1108             // if we got a successful match, update the parse position
   1109             // and strip the prefix off of "text"
   1110             pp.setIndex(pp.getIndex() + pfl);
   1111             text.remove(0, pfl);
   1112         }
   1113     }
   1114 }
   1115 
   1116 /**
   1117 * Used by parse() to match a substitution and any following text.
   1118 * "text" is searched for instances of "delimiter".  For each instance
   1119 * of delimiter, the intervening text is tested to see whether it
   1120 * matches the substitution.  The longest match wins.
   1121 * @param text The string being parsed
   1122 * @param startPos The position in "text" where we should start looking
   1123 * for "delimiter".
   1124 * @param baseValue A partial parse result (often the rule's base value),
   1125 * which is combined with the result from matching the substitution
   1126 * @param delimiter The string to search "text" for.
   1127 * @param pp Ignored and presumed to be 0 on entry.  If there's a match,
   1128 * on exit this will point to the first unmatched character.
   1129 * @param sub If we find "delimiter" in "text", this substitution is used
   1130 * to match the text between the beginning of the string and the
   1131 * position of "delimiter."  (If "delimiter" is the empty string, then
   1132 * this function just matches against this substitution and updates
   1133 * everything accordingly.)
   1134 * @param upperBound When matching the substitution, it will only
   1135 * consider rules with base values lower than this value.
   1136 * @return If there's a match, this is the result of composing
   1137 * baseValue with the result of matching the substitution.  Otherwise,
   1138 * this is new Long(0).  It's never null.  If the result is an integer,
   1139 * this will be an instance of Long; otherwise, it's an instance of
   1140 * Double.
   1141 *
   1142 * !!! note {dlf} in point of fact, in the java code the caller always converts
   1143 * the result to a double, so we might as well return one.
   1144 */
   1145 double
   1146 NFRule::matchToDelimiter(const UnicodeString& text,
   1147                          int32_t startPos,
   1148                          double _baseValue,
   1149                          const UnicodeString& delimiter,
   1150                          ParsePosition& pp,
   1151                          const NFSubstitution* sub,
   1152                          double upperBound) const
   1153 {
   1154 	UErrorCode status = U_ZERO_ERROR;
   1155     // if "delimiter" contains real (i.e., non-ignorable) text, search
   1156     // it for "delimiter" beginning at "start".  If that succeeds, then
   1157     // use "sub"'s doParse() method to match the text before the
   1158     // instance of "delimiter" we just found.
   1159     if (!allIgnorable(delimiter, status)) {
   1160     	if (U_FAILURE(status)) { //Memory allocation error.
   1161     		return 0;
   1162     	}
   1163         ParsePosition tempPP;
   1164         Formattable result;
   1165 
   1166         // use findText() to search for "delimiter".  It returns a two-
   1167         // element array: element 0 is the position of the match, and
   1168         // element 1 is the number of characters that matched
   1169         // "delimiter".
   1170         int32_t dLen;
   1171         int32_t dPos = findText(text, delimiter, startPos, &dLen);
   1172 
   1173         // if findText() succeeded, isolate the text preceding the
   1174         // match, and use "sub" to match that text
   1175         while (dPos >= 0) {
   1176             UnicodeString subText;
   1177             subText.setTo(text, 0, dPos);
   1178             if (subText.length() > 0) {
   1179                 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
   1180 #if UCONFIG_NO_COLLATION
   1181                     FALSE,
   1182 #else
   1183                     formatter->isLenient(),
   1184 #endif
   1185                     result);
   1186 
   1187                 // if the substitution could match all the text up to
   1188                 // where we found "delimiter", then this function has
   1189                 // a successful match.  Bump the caller's parse position
   1190                 // to point to the first character after the text
   1191                 // that matches "delimiter", and return the result
   1192                 // we got from parsing the substitution.
   1193                 if (success && tempPP.getIndex() == dPos) {
   1194                     pp.setIndex(dPos + dLen);
   1195                     return result.getDouble();
   1196                 }
   1197                 else {
   1198                     // commented out because ParsePosition doesn't have error index in 1.1.x
   1199                     // restored for ICU4C port
   1200                     if (tempPP.getErrorIndex() > 0) {
   1201                         pp.setErrorIndex(tempPP.getErrorIndex());
   1202                     } else {
   1203                         pp.setErrorIndex(tempPP.getIndex());
   1204                     }
   1205                 }
   1206             }
   1207 
   1208             // if we didn't match the substitution, search for another
   1209             // copy of "delimiter" in "text" and repeat the loop if
   1210             // we find it
   1211             tempPP.setIndex(0);
   1212             dPos = findText(text, delimiter, dPos + dLen, &dLen);
   1213         }
   1214         // if we make it here, this was an unsuccessful match, and we
   1215         // leave pp unchanged and return 0
   1216         pp.setIndex(0);
   1217         return 0;
   1218 
   1219         // if "delimiter" is empty, or consists only of ignorable characters
   1220         // (i.e., is semantically empty), thwe we obviously can't search
   1221         // for "delimiter".  Instead, just use "sub" to parse as much of
   1222         // "text" as possible.
   1223     }
   1224     else if (sub == NULL) {
   1225         return _baseValue;
   1226     }
   1227     else {
   1228         ParsePosition tempPP;
   1229         Formattable result;
   1230 
   1231         // try to match the whole string against the substitution
   1232         UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
   1233 #if UCONFIG_NO_COLLATION
   1234             FALSE,
   1235 #else
   1236             formatter->isLenient(),
   1237 #endif
   1238             result);
   1239         if (success && (tempPP.getIndex() != 0)) {
   1240             // if there's a successful match (or it's a null
   1241             // substitution), update pp to point to the first
   1242             // character we didn't match, and pass the result from
   1243             // sub.doParse() on through to the caller
   1244             pp.setIndex(tempPP.getIndex());
   1245             return result.getDouble();
   1246         }
   1247         else {
   1248             // commented out because ParsePosition doesn't have error index in 1.1.x
   1249             // restored for ICU4C port
   1250             pp.setErrorIndex(tempPP.getErrorIndex());
   1251         }
   1252 
   1253         // and if we get to here, then nothing matched, so we return
   1254         // 0 and leave pp alone
   1255         return 0;
   1256     }
   1257 }
   1258 
   1259 /**
   1260 * Used by stripPrefix() to match characters.  If lenient parse mode
   1261 * is off, this just calls startsWith().  If lenient parse mode is on,
   1262 * this function uses CollationElementIterators to match characters in
   1263 * the strings (only primary-order differences are significant in
   1264 * determining whether there's a match).
   1265 * @param str The string being tested
   1266 * @param prefix The text we're hoping to see at the beginning
   1267 * of "str"
   1268 * @return If "prefix" is found at the beginning of "str", this
   1269 * is the number of characters in "str" that were matched (this
   1270 * isn't necessarily the same as the length of "prefix" when matching
   1271 * text with a collator).  If there's no match, this is 0.
   1272 */
   1273 int32_t
   1274 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
   1275 {
   1276     // if we're looking for an empty prefix, it obviously matches
   1277     // zero characters.  Just go ahead and return 0.
   1278     if (prefix.length() == 0) {
   1279         return 0;
   1280     }
   1281 
   1282 #if !UCONFIG_NO_COLLATION
   1283     // go through all this grief if we're in lenient-parse mode
   1284     if (formatter->isLenient()) {
   1285         // get the formatter's collator and use it to create two
   1286         // collation element iterators, one over the target string
   1287         // and another over the prefix (right now, we'll throw an
   1288         // exception if the collator we get back from the formatter
   1289         // isn't a RuleBasedCollator, because RuleBasedCollator defines
   1290         // the CollationElementIterator protocol.  Hopefully, this
   1291         // will change someday.)
   1292         const RuleBasedCollator* collator = formatter->getCollator();
   1293         if (collator == NULL) {
   1294             status = U_MEMORY_ALLOCATION_ERROR;
   1295             return 0;
   1296         }
   1297         LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
   1298         LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
   1299         // Check for memory allocation error.
   1300         if (strIter.isNull() || prefixIter.isNull()) {
   1301             status = U_MEMORY_ALLOCATION_ERROR;
   1302             return 0;
   1303         }
   1304 
   1305         UErrorCode err = U_ZERO_ERROR;
   1306 
   1307         // The original code was problematic.  Consider this match:
   1308         // prefix = "fifty-"
   1309         // string = " fifty-7"
   1310         // The intent is to match string up to the '7', by matching 'fifty-' at position 1
   1311         // in the string.  Unfortunately, we were getting a match, and then computing where
   1312         // the match terminated by rematching the string.  The rematch code was using as an
   1313         // initial guess the substring of string between 0 and prefix.length.  Because of
   1314         // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
   1315         // the position before the hyphen in the string.  Recursing down, we then parsed the
   1316         // remaining string '-7' as numeric.  The resulting number turned out as 43 (50 - 7).
   1317         // This was not pretty, especially since the string "fifty-7" parsed just fine.
   1318         //
   1319         // We have newer APIs now, so we can use calls on the iterator to determine what we
   1320         // matched up to.  If we terminate because we hit the last element in the string,
   1321         // our match terminates at this length.  If we terminate because we hit the last element
   1322         // in the target, our match terminates at one before the element iterator position.
   1323 
   1324         // match collation elements between the strings
   1325         int32_t oStr = strIter->next(err);
   1326         int32_t oPrefix = prefixIter->next(err);
   1327 
   1328         while (oPrefix != CollationElementIterator::NULLORDER) {
   1329             // skip over ignorable characters in the target string
   1330             while (CollationElementIterator::primaryOrder(oStr) == 0
   1331                 && oStr != CollationElementIterator::NULLORDER) {
   1332                 oStr = strIter->next(err);
   1333             }
   1334 
   1335             // skip over ignorable characters in the prefix
   1336             while (CollationElementIterator::primaryOrder(oPrefix) == 0
   1337                 && oPrefix != CollationElementIterator::NULLORDER) {
   1338                 oPrefix = prefixIter->next(err);
   1339             }
   1340 
   1341             // dlf: move this above following test, if we consume the
   1342             // entire target, aren't we ok even if the source was also
   1343             // entirely consumed?
   1344 
   1345             // if skipping over ignorables brought to the end of
   1346             // the prefix, we DID match: drop out of the loop
   1347             if (oPrefix == CollationElementIterator::NULLORDER) {
   1348                 break;
   1349             }
   1350 
   1351             // if skipping over ignorables brought us to the end
   1352             // of the target string, we didn't match and return 0
   1353             if (oStr == CollationElementIterator::NULLORDER) {
   1354                 return 0;
   1355             }
   1356 
   1357             // match collation elements from the two strings
   1358             // (considering only primary differences).  If we
   1359             // get a mismatch, dump out and return 0
   1360             if (CollationElementIterator::primaryOrder(oStr)
   1361                 != CollationElementIterator::primaryOrder(oPrefix)) {
   1362                 return 0;
   1363 
   1364                 // otherwise, advance to the next character in each string
   1365                 // and loop (we drop out of the loop when we exhaust
   1366                 // collation elements in the prefix)
   1367             } else {
   1368                 oStr = strIter->next(err);
   1369                 oPrefix = prefixIter->next(err);
   1370             }
   1371         }
   1372 
   1373         int32_t result = strIter->getOffset();
   1374         if (oStr != CollationElementIterator::NULLORDER) {
   1375             --result; // back over character that we don't want to consume;
   1376         }
   1377 
   1378 #ifdef RBNF_DEBUG
   1379         fprintf(stderr, "prefix length: %d\n", result);
   1380 #endif
   1381         return result;
   1382 #if 0
   1383         //----------------------------------------------------------------
   1384         // JDK 1.2-specific API call
   1385         // return strIter.getOffset();
   1386         //----------------------------------------------------------------
   1387         // JDK 1.1 HACK (take out for 1.2-specific code)
   1388 
   1389         // if we make it to here, we have a successful match.  Now we
   1390         // have to find out HOW MANY characters from the target string
   1391         // matched the prefix (there isn't necessarily a one-to-one
   1392         // mapping between collation elements and characters).
   1393         // In JDK 1.2, there's a simple getOffset() call we can use.
   1394         // In JDK 1.1, on the other hand, we have to go through some
   1395         // ugly contortions.  First, use the collator to compare the
   1396         // same number of characters from the prefix and target string.
   1397         // If they're equal, we're done.
   1398         collator->setStrength(Collator::PRIMARY);
   1399         if (str.length() >= prefix.length()) {
   1400             UnicodeString temp;
   1401             temp.setTo(str, 0, prefix.length());
   1402             if (collator->equals(temp, prefix)) {
   1403 #ifdef RBNF_DEBUG
   1404                 fprintf(stderr, "returning: %d\n", prefix.length());
   1405 #endif
   1406                 return prefix.length();
   1407             }
   1408         }
   1409 
   1410         // if they're not equal, then we have to compare successively
   1411         // larger and larger substrings of the target string until we
   1412         // get to one that matches the prefix.  At that point, we know
   1413         // how many characters matched the prefix, and we can return.
   1414         int32_t p = 1;
   1415         while (p <= str.length()) {
   1416             UnicodeString temp;
   1417             temp.setTo(str, 0, p);
   1418             if (collator->equals(temp, prefix)) {
   1419                 return p;
   1420             } else {
   1421                 ++p;
   1422             }
   1423         }
   1424 
   1425         // SHOULD NEVER GET HERE!!!
   1426         return 0;
   1427         //----------------------------------------------------------------
   1428 #endif
   1429 
   1430         // If lenient parsing is turned off, forget all that crap above.
   1431         // Just use String.startsWith() and be done with it.
   1432   } else
   1433 #endif
   1434   {
   1435       if (str.startsWith(prefix)) {
   1436           return prefix.length();
   1437       } else {
   1438           return 0;
   1439       }
   1440   }
   1441 }
   1442 
   1443 /**
   1444 * Searches a string for another string.  If lenient parsing is off,
   1445 * this just calls indexOf().  If lenient parsing is on, this function
   1446 * uses CollationElementIterator to match characters, and only
   1447 * primary-order differences are significant in determining whether
   1448 * there's a match.
   1449 * @param str The string to search
   1450 * @param key The string to search "str" for
   1451 * @param startingAt The index into "str" where the search is to
   1452 * begin
   1453 * @return A two-element array of ints.  Element 0 is the position
   1454 * of the match, or -1 if there was no match.  Element 1 is the
   1455 * number of characters in "str" that matched (which isn't necessarily
   1456 * the same as the length of "key")
   1457 */
   1458 int32_t
   1459 NFRule::findText(const UnicodeString& str,
   1460                  const UnicodeString& key,
   1461                  int32_t startingAt,
   1462                  int32_t* length) const
   1463 {
   1464     if (rulePatternFormat) {
   1465         Formattable result;
   1466         FieldPosition position(UNUM_INTEGER_FIELD);
   1467         position.setBeginIndex(startingAt);
   1468         rulePatternFormat->parseType(str, this, result, position);
   1469         int start = position.getBeginIndex();
   1470         if (start >= 0) {
   1471             int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
   1472             int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2;
   1473             int32_t matchLen = position.getEndIndex() - start;
   1474             UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart));
   1475             UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix));
   1476             if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0
   1477                     && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0)
   1478             {
   1479                 *length = matchLen + prefix.length() + suffix.length();
   1480                 return start - prefix.length();
   1481             }
   1482         }
   1483         *length = 0;
   1484         return -1;
   1485     }
   1486     if (!formatter->isLenient()) {
   1487         // if lenient parsing is turned off, this is easy: just call
   1488         // String.indexOf() and we're done
   1489         *length = key.length();
   1490         return str.indexOf(key, startingAt);
   1491     }
   1492     else {
   1493         // but if lenient parsing is turned ON, we've got some work
   1494         // ahead of us
   1495         return findTextLenient(str, key, startingAt, length);
   1496     }
   1497 }
   1498 
   1499 int32_t
   1500 NFRule::findTextLenient(const UnicodeString& str,
   1501                  const UnicodeString& key,
   1502                  int32_t startingAt,
   1503                  int32_t* length) const
   1504 {
   1505     //----------------------------------------------------------------
   1506     // JDK 1.1 HACK (take out of 1.2-specific code)
   1507 
   1508     // in JDK 1.2, CollationElementIterator provides us with an
   1509     // API to map between character offsets and collation elements
   1510     // and we can do this by marching through the string comparing
   1511     // collation elements.  We can't do that in JDK 1.1.  Insted,
   1512     // we have to go through this horrible slow mess:
   1513     int32_t p = startingAt;
   1514     int32_t keyLen = 0;
   1515 
   1516     // basically just isolate smaller and smaller substrings of
   1517     // the target string (each running to the end of the string,
   1518     // and with the first one running from startingAt to the end)
   1519     // and then use prefixLength() to see if the search key is at
   1520     // the beginning of each substring.  This is excruciatingly
   1521     // slow, but it will locate the key and tell use how long the
   1522     // matching text was.
   1523     UnicodeString temp;
   1524     UErrorCode status = U_ZERO_ERROR;
   1525     while (p < str.length() && keyLen == 0) {
   1526         temp.setTo(str, p, str.length() - p);
   1527         keyLen = prefixLength(temp, key, status);
   1528         if (U_FAILURE(status)) {
   1529             break;
   1530         }
   1531         if (keyLen != 0) {
   1532             *length = keyLen;
   1533             return p;
   1534         }
   1535         ++p;
   1536     }
   1537     // if we make it to here, we didn't find it.  Return -1 for the
   1538     // location.  The length should be ignored, but set it to 0,
   1539     // which should be "safe"
   1540     *length = 0;
   1541     return -1;
   1542 }
   1543 
   1544 /**
   1545 * Checks to see whether a string consists entirely of ignorable
   1546 * characters.
   1547 * @param str The string to test.
   1548 * @return true if the string is empty of consists entirely of
   1549 * characters that the number formatter's collator says are
   1550 * ignorable at the primary-order level.  false otherwise.
   1551 */
   1552 UBool
   1553 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
   1554 {
   1555     // if the string is empty, we can just return true
   1556     if (str.length() == 0) {
   1557         return TRUE;
   1558     }
   1559 
   1560 #if !UCONFIG_NO_COLLATION
   1561     // if lenient parsing is turned on, walk through the string with
   1562     // a collation element iterator and make sure each collation
   1563     // element is 0 (ignorable) at the primary level
   1564     if (formatter->isLenient()) {
   1565         const RuleBasedCollator* collator = formatter->getCollator();
   1566         if (collator == NULL) {
   1567             status = U_MEMORY_ALLOCATION_ERROR;
   1568             return FALSE;
   1569         }
   1570         LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str));
   1571 
   1572         // Memory allocation error check.
   1573         if (iter.isNull()) {
   1574             status = U_MEMORY_ALLOCATION_ERROR;
   1575             return FALSE;
   1576         }
   1577 
   1578         UErrorCode err = U_ZERO_ERROR;
   1579         int32_t o = iter->next(err);
   1580         while (o != CollationElementIterator::NULLORDER
   1581             && CollationElementIterator::primaryOrder(o) == 0) {
   1582             o = iter->next(err);
   1583         }
   1584 
   1585         return o == CollationElementIterator::NULLORDER;
   1586     }
   1587 #endif
   1588 
   1589     // if lenient parsing is turned off, there is no such thing as
   1590     // an ignorable character: return true only if the string is empty
   1591     return FALSE;
   1592 }
   1593 
   1594 void
   1595 NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) {
   1596     if (sub1 != NULL) {
   1597         sub1->setDecimalFormatSymbols(newSymbols, status);
   1598     }
   1599     if (sub2 != NULL) {
   1600         sub2->setDecimalFormatSymbols(newSymbols, status);
   1601     }
   1602 }
   1603 
   1604 U_NAMESPACE_END
   1605 
   1606 /* U_HAVE_RBNF */
   1607 #endif
   1608