Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2001-2004, International Business Machines Corporation
      4 *   and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   07/23/01    aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "strmatch.h"
     16 #include "rbt_data.h"
     17 #include "util.h"
     18 #include "unicode/uniset.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 static const UChar EMPTY[] = { 0 }; // empty string: ""
     23 
     24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
     25 
     26 StringMatcher::StringMatcher(const UnicodeString& theString,
     27                              int32_t start,
     28                              int32_t limit,
     29                              int32_t segmentNum,
     30                              const TransliterationRuleData& theData) :
     31     data(&theData),
     32     segmentNumber(segmentNum),
     33     matchStart(-1),
     34     matchLimit(-1)
     35 {
     36     theString.extractBetween(start, limit, pattern);
     37 }
     38 
     39 StringMatcher::StringMatcher(const StringMatcher& o) :
     40     UnicodeFunctor(o),
     41     UnicodeMatcher(o),
     42     UnicodeReplacer(o),
     43     pattern(o.pattern),
     44     data(o.data),
     45     segmentNumber(o.segmentNumber),
     46     matchStart(o.matchStart),
     47     matchLimit(o.matchLimit)
     48 {
     49 }
     50 
     51 /**
     52  * Destructor
     53  */
     54 StringMatcher::~StringMatcher() {
     55 }
     56 
     57 /**
     58  * Implement UnicodeFunctor
     59  */
     60 UnicodeFunctor* StringMatcher::clone() const {
     61     return new StringMatcher(*this);
     62 }
     63 
     64 /**
     65  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     66  * and return the pointer.
     67  */
     68 UnicodeMatcher* StringMatcher::toMatcher() const {
     69     return (UnicodeMatcher*) this;
     70 }
     71 
     72 /**
     73  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     74  * and return the pointer.
     75  */
     76 UnicodeReplacer* StringMatcher::toReplacer() const {
     77     return (UnicodeReplacer*) this;
     78 }
     79 
     80 /**
     81  * Implement UnicodeMatcher
     82  */
     83 UMatchDegree StringMatcher::matches(const Replaceable& text,
     84                                     int32_t& offset,
     85                                     int32_t limit,
     86                                     UBool incremental) {
     87     int32_t i;
     88     int32_t cursor = offset;
     89     if (limit < cursor) {
     90         // Match in the reverse direction
     91         for (i=pattern.length()-1; i>=0; --i) {
     92             UChar keyChar = pattern.charAt(i);
     93             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
     94             if (subm == 0) {
     95                 if (cursor > limit &&
     96                     keyChar == text.charAt(cursor)) {
     97                     --cursor;
     98                 } else {
     99                     return U_MISMATCH;
    100                 }
    101             } else {
    102                 UMatchDegree m =
    103                     subm->matches(text, cursor, limit, incremental);
    104                 if (m != U_MATCH) {
    105                     return m;
    106                 }
    107             }
    108         }
    109         // Record the match position, but adjust for a normal
    110         // forward start, limit, and only if a prior match does not
    111         // exist -- we want the rightmost match.
    112         if (matchStart < 0) {
    113             matchStart = cursor+1;
    114             matchLimit = offset+1;
    115         }
    116     } else {
    117         for (i=0; i<pattern.length(); ++i) {
    118             if (incremental && cursor == limit) {
    119                 // We've reached the context limit without a mismatch and
    120                 // without completing our match.
    121                 return U_PARTIAL_MATCH;
    122             }
    123             UChar keyChar = pattern.charAt(i);
    124             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    125             if (subm == 0) {
    126                 // Don't need the cursor < limit check if
    127                 // incremental is TRUE (because it's done above); do need
    128                 // it otherwise.
    129                 if (cursor < limit &&
    130                     keyChar == text.charAt(cursor)) {
    131                     ++cursor;
    132                 } else {
    133                     return U_MISMATCH;
    134                 }
    135             } else {
    136                 UMatchDegree m =
    137                     subm->matches(text, cursor, limit, incremental);
    138                 if (m != U_MATCH) {
    139                     return m;
    140                 }
    141             }
    142         }
    143         // Record the match position
    144         matchStart = offset;
    145         matchLimit = cursor;
    146     }
    147 
    148     offset = cursor;
    149     return U_MATCH;
    150 }
    151 
    152 /**
    153  * Implement UnicodeMatcher
    154  */
    155 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
    156                                         UBool escapeUnprintable) const
    157 {
    158     result.truncate(0);
    159     UnicodeString str, quoteBuf;
    160     if (segmentNumber > 0) {
    161         result.append((UChar)40); /*(*/
    162     }
    163     for (int32_t i=0; i<pattern.length(); ++i) {
    164         UChar keyChar = pattern.charAt(i);
    165         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
    166         if (m == 0) {
    167             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
    168         } else {
    169             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
    170                          TRUE, escapeUnprintable, quoteBuf);
    171         }
    172     }
    173     if (segmentNumber > 0) {
    174         result.append((UChar)41); /*)*/
    175     }
    176     // Flush quoteBuf out to result
    177     ICU_Utility::appendToRule(result, -1,
    178                               TRUE, escapeUnprintable, quoteBuf);
    179     return result;
    180 }
    181 
    182 /**
    183  * Implement UnicodeMatcher
    184  */
    185 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    186     if (pattern.length() == 0) {
    187         return TRUE;
    188     }
    189     UChar32 c = pattern.char32At(0);
    190     const UnicodeMatcher *m = data->lookupMatcher(c);
    191     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
    192 }
    193 
    194 /**
    195  * Implement UnicodeMatcher
    196  */
    197 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    198     UChar32 ch;
    199     for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
    200         ch = pattern.char32At(i);
    201         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
    202         if (matcher == NULL) {
    203             toUnionTo.add(ch);
    204         } else {
    205             matcher->addMatchSetTo(toUnionTo);
    206         }
    207     }
    208 }
    209 
    210 /**
    211  * UnicodeReplacer API
    212  */
    213 int32_t StringMatcher::replace(Replaceable& text,
    214                                int32_t start,
    215                                int32_t limit,
    216                                int32_t& /*cursor*/) {
    217 
    218     int32_t outLen = 0;
    219 
    220     // Copy segment with out-of-band data
    221     int32_t dest = limit;
    222     // If there was no match, that means that a quantifier
    223     // matched zero-length.  E.g., x (a)* y matched "xy".
    224     if (matchStart >= 0) {
    225         if (matchStart != matchLimit) {
    226             text.copy(matchStart, matchLimit, dest);
    227             outLen = matchLimit - matchStart;
    228         }
    229     }
    230 
    231     text.handleReplaceBetween(start, limit, EMPTY); // delete original text
    232 
    233     return outLen;
    234 }
    235 
    236 /**
    237  * UnicodeReplacer API
    238  */
    239 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
    240                                                 UBool /*escapeUnprintable*/) const {
    241     // assert(segmentNumber > 0);
    242     rule.truncate(0);
    243     rule.append((UChar)0x0024 /*$*/);
    244     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    245     return rule;
    246 }
    247 
    248 /**
    249  * Remove any match info.  This must be called before performing a
    250  * set of matches with this segment.
    251  */
    252  void StringMatcher::resetMatch() {
    253     matchStart = matchLimit = -1;
    254 }
    255 
    256 /**
    257  * Union the set of all characters that may output by this object
    258  * into the given set.
    259  * @param toUnionTo the set into which to union the output characters
    260  */
    261 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    262     // The output of this replacer varies; it is the source text between
    263     // matchStart and matchLimit.  Since this varies depending on the
    264     // input text, we can't compute it here.  We can either do nothing
    265     // or we can add ALL characters to the set.  It's probably more useful
    266     // to do nothing.
    267 }
    268 
    269 /**
    270  * Implement UnicodeFunctor
    271  */
    272 void StringMatcher::setData(const TransliterationRuleData* d) {
    273     data = d;
    274     int32_t i = 0;
    275     while (i<pattern.length()) {
    276         UChar32 c = pattern.char32At(i);
    277         UnicodeFunctor* f = data->lookup(c);
    278         if (f != NULL) {
    279             f->setData(data);
    280         }
    281         i += UTF_CHAR_LENGTH(c);
    282     }
    283 }
    284 
    285 U_NAMESPACE_END
    286 
    287 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    288 
    289 //eof
    290