Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2001-2008, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   11/19/2001  aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "util.h"
     12 #include "unicode/unimatch.h"
     13 #include "unicode/uniset.h"
     14 
// Define UChar constants using hex for EBCDIC compatibility
// (the values are written as numeric code points rather than as
// character literals).

// Introduces every escape sequence written by this file.
static const UChar BACKSLASH  = 0x005C; /*\*/
// Marker for the eight-digit escape form written by escapeUnprintable().
static const UChar UPPER_U    = 0x0055; /*U*/
// Marker for the four-digit escape form written by escapeUnprintable().
static const UChar LOWER_U    = 0x0075; /*u*/
// Quote delimiter used by appendToRule().
static const UChar APOSTROPHE = 0x0027; // '\''
// Space character; appendToRule() avoids emitting two of these in a row.
static const UChar SPACE      = 0x0020; // ' '

// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
// Digit table indexed by digit value (0..35); the entries are the decimal
// code points of the characters listed above.  appendNumber() indexes it
// with values below its radix (at most 36) and escapeUnprintable() with
// hexadecimal nibbles (0..15).
static const UChar DIGITS[] = {
    48,49,50,51,52,53,54,55,56,57,
    65,66,67,68,69,70,71,72,73,74,
    75,76,77,78,79,80,81,82,83,84,
    85,86,87,88,89,90
};
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
     34                                      int32_t radix, int32_t minDigits) {
     35     if (radix < 2 || radix > 36) {
     36         // Bogus radix
     37         return result.append((UChar)63/*?*/);
     38     }
     39     // Handle negatives
     40     if (n < 0) {
     41         n = -n;
     42         result.append((UChar)45/*-*/);
     43     }
     44     // First determine the number of digits
     45     int32_t nn = n;
     46     int32_t r = 1;
     47     while (nn >= radix) {
     48         nn /= radix;
     49         r *= radix;
     50         --minDigits;
     51     }
     52     // Now generate the digits
     53     while (--minDigits > 0) {
     54         result.append(DIGITS[0]);
     55     }
     56     while (r > 0) {
     57         int32_t digit = n / r;
     58         result.append(DIGITS[digit]);
     59         n -= digit * r;
     60         r /= radix;
     61     }
     62     return result;
     63 }
     64 
     65 /**
     66  * Return true if the character is NOT printable ASCII.
     67  */
     68 UBool ICU_Utility::isUnprintable(UChar32 c) {
     69     return !(c >= 0x20 && c <= 0x7E);
     70 }
     71 
     72 /**
     73  * Escape unprintable characters using \uxxxx notation for U+0000 to
     74  * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
     75  * printable ASCII, then do nothing and return FALSE.  Otherwise,
     76  * append the escaped notation and return TRUE.
     77  */
     78 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
     79     if (isUnprintable(c)) {
     80         result.append(BACKSLASH);
     81         if (c & ~0xFFFF) {
     82             result.append(UPPER_U);
     83             result.append(DIGITS[0xF&(c>>28)]);
     84             result.append(DIGITS[0xF&(c>>24)]);
     85             result.append(DIGITS[0xF&(c>>20)]);
     86             result.append(DIGITS[0xF&(c>>16)]);
     87         } else {
     88             result.append(LOWER_U);
     89         }
     90         result.append(DIGITS[0xF&(c>>12)]);
     91         result.append(DIGITS[0xF&(c>>8)]);
     92         result.append(DIGITS[0xF&(c>>4)]);
     93         result.append(DIGITS[0xF&c]);
     94         return TRUE;
     95     }
     96     return FALSE;
     97 }
     98 
     99 /**
    100  * Returns the index of a character, ignoring quoted text.
    101  * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
    102  * found by a search for 'h'.
    103  */
    104 // FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
    105 /*
    106 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
    107                                int32_t start, int32_t limit,
    108                                UChar charToFind) {
    109     for (int32_t i=start; i<limit; ++i) {
    110         UChar c = text.charAt(i);
    111         if (c == BACKSLASH) {
    112             ++i;
    113         } else if (c == APOSTROPHE) {
    114             while (++i < limit
    115                    && text.charAt(i) != APOSTROPHE) {}
    116         } else if (c == charToFind) {
    117             return i;
    118         }
    119     }
    120     return -1;
    121 }
    122 */
    123 
    124 /**
    125  * Skip over a sequence of zero or more white space characters at pos.
    126  * @param advance if true, advance pos to the first non-white-space
    127  * character at or after pos, or str.length(), if there is none.
    128  * Otherwise leave pos unchanged.
    129  * @return the index of the first non-white-space character at or
    130  * after pos, or str.length(), if there is none.
    131  */
    132 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
    133                                     UBool advance) {
    134     int32_t p = pos;
    135     while (p < str.length()) {
    136         UChar32 c = str.char32At(p);
    137         if (!uprv_isRuleWhiteSpace(c)) {
    138             break;
    139         }
    140         p += UTF_CHAR_LENGTH(c);
    141     }
    142     if (advance) {
    143         pos = p;
    144     }
    145     return p;
    146 }
    147 
    148 /**
    149  * Skip over whitespace in a Replaceable.  Whitespace is defined by
    150  * uprv_isRuleWhiteSpace().  Skipping may be done in the forward or
    151  * reverse direction.  In either case, the leftmost index will be
    152  * inclusive, and the rightmost index will be exclusive.  That is,
    153  * given a range defined as [start, limit), the call
    154  * skipWhitespace(text, start, limit) will advance start past leading
    155  * whitespace, whereas the call skipWhitespace(text, limit, start),
    156  * will back up limit past trailing whitespace.
    157  * @param text the text to be analyzed
    158  * @param pos either the start or limit of a range of 'text', to skip
    159  * leading or trailing whitespace, respectively
    160  * @param stop either the limit or start of a range of 'text', to skip
    161  * leading or trailing whitespace, respectively
    162  * @return the new start or limit, depending on what was passed in to
    163  * 'pos'
    164  */
    165 //?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
    166 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
    167 //?                                    int32_t pos, int32_t stop) {
    168 //?    UChar32 c;
    169 //?    UBool isForward = (stop >= pos);
    170 //?
    171 //?    if (!isForward) {
    172 //?        --pos; // pos is a limit, so back up by one
    173 //?    }
    174 //?
    175 //?    while (pos != stop &&
    176 //?           uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
    177 //?        if (isForward) {
    178 //?            pos += UTF_CHAR_LENGTH(c);
    179 //?        } else {
    180 //?            pos -= UTF_CHAR_LENGTH(c);
    181 //?        }
    182 //?    }
    183 //?
    184 //?    if (!isForward) {
    185 //?        ++pos; // make pos back into a limit
    186 //?    }
    187 //?
    188 //?    return pos;
    189 //?}
    190 
    191 /**
    192  * Parse a single non-whitespace character 'ch', optionally
    193  * preceded by whitespace.
    194  * @param id the string to be parsed
    195  * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
    196  * offset of the first character to be parsed.  On output, pos[0]
    197  * is the index after the last parsed character.  If the parse
    198  * fails, pos[0] will be unchanged.
    199  * @param ch the non-whitespace character to be parsed.
    200  * @return true if 'ch' is seen preceded by zero or more
    201  * whitespace characters.
    202  */
    203 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
    204     int32_t start = pos;
    205     skipWhitespace(id, pos, TRUE);
    206     if (pos == id.length() ||
    207         id.charAt(pos) != ch) {
    208         pos = start;
    209         return FALSE;
    210     }
    211     ++pos;
    212     return TRUE;
    213 }
    214 
    215 /**
    216  * Parse a pattern string within the given Replaceable and a parsing
    217  * pattern.  Characters are matched literally and case-sensitively
    218  * except for the following special characters:
    219  *
    220  * ~  zero or more uprv_isRuleWhiteSpace chars
    221  *
    222  * If end of pattern is reached with all matches along the way,
    223  * pos is advanced to the first unparsed index and returned.
    224  * Otherwise -1 is returned.
    225  * @param pat pattern that controls parsing
    226  * @param text text to be parsed, starting at index
    227  * @param index offset to first character to parse
    228  * @param limit offset after last character to parse
    229  * @return index after last parsed character, or -1 on parse failure.
    230  */
    231 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
    232                                   const Replaceable& text,
    233                                   int32_t index,
    234                                   int32_t limit) {
    235     int32_t ipat = 0;
    236 
    237     // empty pattern matches immediately
    238     if (ipat == pat.length()) {
    239         return index;
    240     }
    241 
    242     UChar32 cpat = pat.char32At(ipat);
    243 
    244     while (index < limit) {
    245         UChar32 c = text.char32At(index);
    246 
    247         // parse \s*
    248         if (cpat == 126 /*~*/) {
    249             if (uprv_isRuleWhiteSpace(c)) {
    250                 index += UTF_CHAR_LENGTH(c);
    251                 continue;
    252             } else {
    253                 if (++ipat == pat.length()) {
    254                     return index; // success; c unparsed
    255                 }
    256                 // fall thru; process c again with next cpat
    257             }
    258         }
    259 
    260         // parse literal
    261         else if (c == cpat) {
    262             index += UTF_CHAR_LENGTH(c);
    263             ipat += UTF_CHAR_LENGTH(cpat);
    264             if (ipat == pat.length()) {
    265                 return index; // success; c parsed
    266             }
    267             // fall thru; get next cpat
    268         }
    269 
    270         // match failure of literal
    271         else {
    272             return -1;
    273         }
    274 
    275         cpat = pat.char32At(ipat);
    276     }
    277 
    278     return -1; // text ended before end of pat
    279 }
    280 
    281 /**
    282  * Append a character to a rule that is being built up.  To flush
    283  * the quoteBuf to rule, make one final call with isLiteral == TRUE.
    284  * If there is no final character, pass in (UChar32)-1 as c.
    285  * @param rule the string to append the character to
    286  * @param c the character to append, or (UChar32)-1 if none.
    287  * @param isLiteral if true, then the given character should not be
    288  * quoted or escaped.  Usually this means it is a syntactic element
    289  * such as > or $
    290  * @param escapeUnprintable if true, then unprintable characters
    291  * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
    292  * appear outside of quotes.
    293  * @param quoteBuf a buffer which is used to build up quoted
    294  * substrings.  The caller should initially supply an empty buffer,
    295  * and thereafter should not modify the buffer.  The buffer should be
    296  * cleared out by, at the end, calling this method with a literal
    297  * character.
    298  */
    299 void ICU_Utility::appendToRule(UnicodeString& rule,
    300                                UChar32 c,
    301                                UBool isLiteral,
    302                                UBool escapeUnprintable,
    303                                UnicodeString& quoteBuf) {
    304     // If we are escaping unprintables, then escape them outside
    305     // quotes.  \u and \U are not recognized within quotes.  The same
    306     // logic applies to literals, but literals are never escaped.
    307     if (isLiteral ||
    308         (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
    309         if (quoteBuf.length() > 0) {
    310             // We prefer backslash APOSTROPHE to double APOSTROPHE
    311             // (more readable, less similar to ") so if there are
    312             // double APOSTROPHEs at the ends, we pull them outside
    313             // of the quote.
    314 
    315             // If the first thing in the quoteBuf is APOSTROPHE
    316             // (doubled) then pull it out.
    317             while (quoteBuf.length() >= 2 &&
    318                    quoteBuf.charAt(0) == APOSTROPHE &&
    319                    quoteBuf.charAt(1) == APOSTROPHE) {
    320                 rule.append(BACKSLASH).append(APOSTROPHE);
    321                 quoteBuf.remove(0, 2);
    322             }
    323             // If the last thing in the quoteBuf is APOSTROPHE
    324             // (doubled) then remove and count it and add it after.
    325             int32_t trailingCount = 0;
    326             while (quoteBuf.length() >= 2 &&
    327                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
    328                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
    329                 quoteBuf.truncate(quoteBuf.length()-2);
    330                 ++trailingCount;
    331             }
    332             if (quoteBuf.length() > 0) {
    333                 rule.append(APOSTROPHE);
    334                 rule.append(quoteBuf);
    335                 rule.append(APOSTROPHE);
    336                 quoteBuf.truncate(0);
    337             }
    338             while (trailingCount-- > 0) {
    339                 rule.append(BACKSLASH).append(APOSTROPHE);
    340             }
    341         }
    342         if (c != (UChar32)-1) {
    343             /* Since spaces are ignored during parsing, they are
    344              * emitted only for readability.  We emit one here
    345              * only if there isn't already one at the end of the
    346              * rule.
    347              */
    348             if (c == SPACE) {
    349                 int32_t len = rule.length();
    350                 if (len > 0 && rule.charAt(len-1) != c) {
    351                     rule.append(c);
    352                 }
    353             } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
    354                 rule.append(c);
    355             }
    356         }
    357     }
    358 
    359     // Escape ' and '\' and don't begin a quote just for them
    360     else if (quoteBuf.length() == 0 &&
    361              (c == APOSTROPHE || c == BACKSLASH)) {
    362         rule.append(BACKSLASH);
    363         rule.append(c);
    364     }
    365 
    366     // Specials (printable ascii that isn't [0-9a-zA-Z]) and
    367     // whitespace need quoting.  Also append stuff to quotes if we are
    368     // building up a quoted substring already.
    369     else if (quoteBuf.length() > 0 ||
    370              (c >= 0x0021 && c <= 0x007E &&
    371               !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
    372                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
    373                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
    374              uprv_isRuleWhiteSpace(c)) {
    375         quoteBuf.append(c);
    376         // Double ' within a quote
    377         if (c == APOSTROPHE) {
    378             quoteBuf.append(c);
    379         }
    380     }
    381 
    382     // Otherwise just append
    383     else {
    384         rule.append(c);
    385     }
    386 }
    387 
    388 void ICU_Utility::appendToRule(UnicodeString& rule,
    389                                const UnicodeString& text,
    390                                UBool isLiteral,
    391                                UBool escapeUnprintable,
    392                                UnicodeString& quoteBuf) {
    393     for (int32_t i=0; i<text.length(); ++i) {
    394         appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
    395     }
    396 }
    397 
    398 /**
    399  * Given a matcher reference, which may be null, append its
    400  * pattern as a literal to the given rule.
    401  */
    402 void ICU_Utility::appendToRule(UnicodeString& rule,
    403                                const UnicodeMatcher* matcher,
    404                                UBool escapeUnprintable,
    405                                UnicodeString& quoteBuf) {
    406     if (matcher != NULL) {
    407         UnicodeString pat;
    408         appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
    409                      TRUE, escapeUnprintable, quoteBuf);
    410     }
    411 }
    412 
    413 U_NAMESPACE_END
    414 
    415 U_CAPI UBool U_EXPORT2
    416 uprv_isRuleWhiteSpace(UChar32 c) {
    417     /* "white space" in the sense of ICU rule parsers
    418        This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
    419        See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
    420        U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
    421        Equivalent to test for Pattern_White_Space Unicode property.
    422     */
    423     return (c >= 0x0009 && c <= 0x2029 &&
    424             (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
    425              c == 0x200E || c == 0x200F || c >= 0x2028));
    426 }
    427 
    428 U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2
    429 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    430     if(U_FAILURE(*ec)) {
    431         return NULL;
    432     }
    433     // create a set with the Pattern_White_Space characters,
    434     // without a pattern for fewer code dependencies
    435     U_NAMESPACE_QUALIFIER UnicodeSet *set=new U_NAMESPACE_QUALIFIER UnicodeSet(9, 0xd);
    436     // Check for new failure.
    437     if (set == NULL) {
    438         *ec = U_MEMORY_ALLOCATION_ERROR;
    439         return NULL;
    440     }
    441     set->UnicodeSet::add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
    442     return set;
    443 }
    444 
    445 //eof
    446