Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2001-2012, International Business Machines Corporation
      4 *   and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   07/23/01    aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "strmatch.h"
     16 #include "rbt_data.h"
     17 #include "util.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/utf16.h"
     20 
     21 U_NAMESPACE_BEGIN
     22 
// NOTE(review): presumably expands to the ICU class-ID (RTTI) member
// definitions for StringMatcher -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
     24 
     25 StringMatcher::StringMatcher(const UnicodeString& theString,
     26                              int32_t start,
     27                              int32_t limit,
     28                              int32_t segmentNum,
     29                              const TransliterationRuleData& theData) :
     30     data(&theData),
     31     segmentNumber(segmentNum),
     32     matchStart(-1),
     33     matchLimit(-1)
     34 {
     35     theString.extractBetween(start, limit, pattern);
     36 }
     37 
     38 StringMatcher::StringMatcher(const StringMatcher& o) :
     39     UnicodeFunctor(o),
     40     UnicodeMatcher(o),
     41     UnicodeReplacer(o),
     42     pattern(o.pattern),
     43     data(o.data),
     44     segmentNumber(o.segmentNumber),
     45     matchStart(o.matchStart),
     46     matchLimit(o.matchLimit)
     47 {
     48 }
     49 
     50 /**
     51  * Destructor
     52  */
     53 StringMatcher::~StringMatcher() {
     54 }
     55 
     56 /**
     57  * Implement UnicodeFunctor
     58  */
     59 UnicodeFunctor* StringMatcher::clone() const {
     60     return new StringMatcher(*this);
     61 }
     62 
     63 /**
     64  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     65  * and return the pointer.
     66  */
     67 UnicodeMatcher* StringMatcher::toMatcher() const {
     68   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     69   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
     70 
     71   return nonconst_base;
     72 }
     73 
     74 /**
     75  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     76  * and return the pointer.
     77  */
     78 UnicodeReplacer* StringMatcher::toReplacer() const {
     79   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     80   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
     81 
     82   return nonconst_base;
     83 }
     84 
     85 /**
     86  * Implement UnicodeMatcher
     87  */
     88 UMatchDegree StringMatcher::matches(const Replaceable& text,
     89                                     int32_t& offset,
     90                                     int32_t limit,
     91                                     UBool incremental) {
     92     int32_t i;
     93     int32_t cursor = offset;
     94     if (limit < cursor) {
     95         // Match in the reverse direction
     96         for (i=pattern.length()-1; i>=0; --i) {
     97             UChar keyChar = pattern.charAt(i);
     98             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
     99             if (subm == 0) {
    100                 if (cursor > limit &&
    101                     keyChar == text.charAt(cursor)) {
    102                     --cursor;
    103                 } else {
    104                     return U_MISMATCH;
    105                 }
    106             } else {
    107                 UMatchDegree m =
    108                     subm->matches(text, cursor, limit, incremental);
    109                 if (m != U_MATCH) {
    110                     return m;
    111                 }
    112             }
    113         }
    114         // Record the match position, but adjust for a normal
    115         // forward start, limit, and only if a prior match does not
    116         // exist -- we want the rightmost match.
    117         if (matchStart < 0) {
    118             matchStart = cursor+1;
    119             matchLimit = offset+1;
    120         }
    121     } else {
    122         for (i=0; i<pattern.length(); ++i) {
    123             if (incremental && cursor == limit) {
    124                 // We've reached the context limit without a mismatch and
    125                 // without completing our match.
    126                 return U_PARTIAL_MATCH;
    127             }
    128             UChar keyChar = pattern.charAt(i);
    129             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    130             if (subm == 0) {
    131                 // Don't need the cursor < limit check if
    132                 // incremental is TRUE (because it's done above); do need
    133                 // it otherwise.
    134                 if (cursor < limit &&
    135                     keyChar == text.charAt(cursor)) {
    136                     ++cursor;
    137                 } else {
    138                     return U_MISMATCH;
    139                 }
    140             } else {
    141                 UMatchDegree m =
    142                     subm->matches(text, cursor, limit, incremental);
    143                 if (m != U_MATCH) {
    144                     return m;
    145                 }
    146             }
    147         }
    148         // Record the match position
    149         matchStart = offset;
    150         matchLimit = cursor;
    151     }
    152 
    153     offset = cursor;
    154     return U_MATCH;
    155 }
    156 
    157 /**
    158  * Implement UnicodeMatcher
    159  */
    160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
    161                                         UBool escapeUnprintable) const
    162 {
    163     result.truncate(0);
    164     UnicodeString str, quoteBuf;
    165     if (segmentNumber > 0) {
    166         result.append((UChar)40); /*(*/
    167     }
    168     for (int32_t i=0; i<pattern.length(); ++i) {
    169         UChar keyChar = pattern.charAt(i);
    170         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
    171         if (m == 0) {
    172             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
    173         } else {
    174             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
    175                          TRUE, escapeUnprintable, quoteBuf);
    176         }
    177     }
    178     if (segmentNumber > 0) {
    179         result.append((UChar)41); /*)*/
    180     }
    181     // Flush quoteBuf out to result
    182     ICU_Utility::appendToRule(result, -1,
    183                               TRUE, escapeUnprintable, quoteBuf);
    184     return result;
    185 }
    186 
    187 /**
    188  * Implement UnicodeMatcher
    189  */
    190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    191     if (pattern.length() == 0) {
    192         return TRUE;
    193     }
    194     UChar32 c = pattern.char32At(0);
    195     const UnicodeMatcher *m = data->lookupMatcher(c);
    196     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
    197 }
    198 
    199 /**
    200  * Implement UnicodeMatcher
    201  */
    202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    203     UChar32 ch;
    204     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
    205         ch = pattern.char32At(i);
    206         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
    207         if (matcher == NULL) {
    208             toUnionTo.add(ch);
    209         } else {
    210             matcher->addMatchSetTo(toUnionTo);
    211         }
    212     }
    213 }
    214 
    215 /**
    216  * UnicodeReplacer API
    217  */
    218 int32_t StringMatcher::replace(Replaceable& text,
    219                                int32_t start,
    220                                int32_t limit,
    221                                int32_t& /*cursor*/) {
    222 
    223     int32_t outLen = 0;
    224 
    225     // Copy segment with out-of-band data
    226     int32_t dest = limit;
    227     // If there was no match, that means that a quantifier
    228     // matched zero-length.  E.g., x (a)* y matched "xy".
    229     if (matchStart >= 0) {
    230         if (matchStart != matchLimit) {
    231             text.copy(matchStart, matchLimit, dest);
    232             outLen = matchLimit - matchStart;
    233         }
    234     }
    235 
    236     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
    237 
    238     return outLen;
    239 }
    240 
    241 /**
    242  * UnicodeReplacer API
    243  */
    244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
    245                                                 UBool /*escapeUnprintable*/) const {
    246     // assert(segmentNumber > 0);
    247     rule.truncate(0);
    248     rule.append((UChar)0x0024 /*$*/);
    249     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    250     return rule;
    251 }
    252 
    253 /**
    254  * Remove any match info.  This must be called before performing a
    255  * set of matches with this segment.
    256  */
    257  void StringMatcher::resetMatch() {
    258     matchStart = matchLimit = -1;
    259 }
    260 
    261 /**
    262  * Union the set of all characters that may output by this object
    263  * into the given set.
    264  * @param toUnionTo the set into which to union the output characters
    265  */
    266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    267     // The output of this replacer varies; it is the source text between
    268     // matchStart and matchLimit.  Since this varies depending on the
    269     // input text, we can't compute it here.  We can either do nothing
    270     // or we can add ALL characters to the set.  It's probably more useful
    271     // to do nothing.
    272 }
    273 
    274 /**
    275  * Implement UnicodeFunctor
    276  */
    277 void StringMatcher::setData(const TransliterationRuleData* d) {
    278     data = d;
    279     int32_t i = 0;
    280     while (i<pattern.length()) {
    281         UChar32 c = pattern.char32At(i);
    282         UnicodeFunctor* f = data->lookup(c);
    283         if (f != NULL) {
    284             f->setData(data);
    285         }
    286         i += U16_LENGTH(c);
    287     }
    288 }
    289 
    290 U_NAMESPACE_END
    291 
    292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    293 
    294 //eof
    295