Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2012, International Business Machines Corporation
      6 *   and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   07/23/01    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "strmatch.h"
     18 #include "rbt_data.h"
     19 #include "util.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/utf16.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
     26 
     27 StringMatcher::StringMatcher(const UnicodeString& theString,
     28                              int32_t start,
     29                              int32_t limit,
     30                              int32_t segmentNum,
     31                              const TransliterationRuleData& theData) :
     32     data(&theData),
     33     segmentNumber(segmentNum),
     34     matchStart(-1),
     35     matchLimit(-1)
     36 {
     37     theString.extractBetween(start, limit, pattern);
     38 }
     39 
     40 StringMatcher::StringMatcher(const StringMatcher& o) :
     41     UnicodeFunctor(o),
     42     UnicodeMatcher(o),
     43     UnicodeReplacer(o),
     44     pattern(o.pattern),
     45     data(o.data),
     46     segmentNumber(o.segmentNumber),
     47     matchStart(o.matchStart),
     48     matchLimit(o.matchLimit)
     49 {
     50 }
     51 
     52 /**
     53  * Destructor
     54  */
     55 StringMatcher::~StringMatcher() {
     56 }
     57 
     58 /**
     59  * Implement UnicodeFunctor
     60  */
     61 UnicodeFunctor* StringMatcher::clone() const {
     62     return new StringMatcher(*this);
     63 }
     64 
     65 /**
     66  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     67  * and return the pointer.
     68  */
     69 UnicodeMatcher* StringMatcher::toMatcher() const {
     70   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     71   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
     72 
     73   return nonconst_base;
     74 }
     75 
     76 /**
     77  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     78  * and return the pointer.
     79  */
     80 UnicodeReplacer* StringMatcher::toReplacer() const {
     81   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     82   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
     83 
     84   return nonconst_base;
     85 }
     86 
     87 /**
     88  * Implement UnicodeMatcher
     89  */
     90 UMatchDegree StringMatcher::matches(const Replaceable& text,
     91                                     int32_t& offset,
     92                                     int32_t limit,
     93                                     UBool incremental) {
     94     int32_t i;
     95     int32_t cursor = offset;
     96     if (limit < cursor) {
     97         // Match in the reverse direction
     98         for (i=pattern.length()-1; i>=0; --i) {
     99             UChar keyChar = pattern.charAt(i);
    100             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    101             if (subm == 0) {
    102                 if (cursor > limit &&
    103                     keyChar == text.charAt(cursor)) {
    104                     --cursor;
    105                 } else {
    106                     return U_MISMATCH;
    107                 }
    108             } else {
    109                 UMatchDegree m =
    110                     subm->matches(text, cursor, limit, incremental);
    111                 if (m != U_MATCH) {
    112                     return m;
    113                 }
    114             }
    115         }
    116         // Record the match position, but adjust for a normal
    117         // forward start, limit, and only if a prior match does not
    118         // exist -- we want the rightmost match.
    119         if (matchStart < 0) {
    120             matchStart = cursor+1;
    121             matchLimit = offset+1;
    122         }
    123     } else {
    124         for (i=0; i<pattern.length(); ++i) {
    125             if (incremental && cursor == limit) {
    126                 // We've reached the context limit without a mismatch and
    127                 // without completing our match.
    128                 return U_PARTIAL_MATCH;
    129             }
    130             UChar keyChar = pattern.charAt(i);
    131             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    132             if (subm == 0) {
    133                 // Don't need the cursor < limit check if
    134                 // incremental is TRUE (because it's done above); do need
    135                 // it otherwise.
    136                 if (cursor < limit &&
    137                     keyChar == text.charAt(cursor)) {
    138                     ++cursor;
    139                 } else {
    140                     return U_MISMATCH;
    141                 }
    142             } else {
    143                 UMatchDegree m =
    144                     subm->matches(text, cursor, limit, incremental);
    145                 if (m != U_MATCH) {
    146                     return m;
    147                 }
    148             }
    149         }
    150         // Record the match position
    151         matchStart = offset;
    152         matchLimit = cursor;
    153     }
    154 
    155     offset = cursor;
    156     return U_MATCH;
    157 }
    158 
    159 /**
    160  * Implement UnicodeMatcher
    161  */
    162 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
    163                                         UBool escapeUnprintable) const
    164 {
    165     result.truncate(0);
    166     UnicodeString str, quoteBuf;
    167     if (segmentNumber > 0) {
    168         result.append((UChar)40); /*(*/
    169     }
    170     for (int32_t i=0; i<pattern.length(); ++i) {
    171         UChar keyChar = pattern.charAt(i);
    172         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
    173         if (m == 0) {
    174             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
    175         } else {
    176             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
    177                          TRUE, escapeUnprintable, quoteBuf);
    178         }
    179     }
    180     if (segmentNumber > 0) {
    181         result.append((UChar)41); /*)*/
    182     }
    183     // Flush quoteBuf out to result
    184     ICU_Utility::appendToRule(result, -1,
    185                               TRUE, escapeUnprintable, quoteBuf);
    186     return result;
    187 }
    188 
    189 /**
    190  * Implement UnicodeMatcher
    191  */
    192 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    193     if (pattern.length() == 0) {
    194         return TRUE;
    195     }
    196     UChar32 c = pattern.char32At(0);
    197     const UnicodeMatcher *m = data->lookupMatcher(c);
    198     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
    199 }
    200 
    201 /**
    202  * Implement UnicodeMatcher
    203  */
    204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    205     UChar32 ch;
    206     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
    207         ch = pattern.char32At(i);
    208         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
    209         if (matcher == NULL) {
    210             toUnionTo.add(ch);
    211         } else {
    212             matcher->addMatchSetTo(toUnionTo);
    213         }
    214     }
    215 }
    216 
    217 /**
    218  * UnicodeReplacer API
    219  */
    220 int32_t StringMatcher::replace(Replaceable& text,
    221                                int32_t start,
    222                                int32_t limit,
    223                                int32_t& /*cursor*/) {
    224 
    225     int32_t outLen = 0;
    226 
    227     // Copy segment with out-of-band data
    228     int32_t dest = limit;
    229     // If there was no match, that means that a quantifier
    230     // matched zero-length.  E.g., x (a)* y matched "xy".
    231     if (matchStart >= 0) {
    232         if (matchStart != matchLimit) {
    233             text.copy(matchStart, matchLimit, dest);
    234             outLen = matchLimit - matchStart;
    235         }
    236     }
    237 
    238     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
    239 
    240     return outLen;
    241 }
    242 
    243 /**
    244  * UnicodeReplacer API
    245  */
    246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
    247                                                 UBool /*escapeUnprintable*/) const {
    248     // assert(segmentNumber > 0);
    249     rule.truncate(0);
    250     rule.append((UChar)0x0024 /*$*/);
    251     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    252     return rule;
    253 }
    254 
    255 /**
    256  * Remove any match info.  This must be called before performing a
    257  * set of matches with this segment.
    258  */
    259  void StringMatcher::resetMatch() {
    260     matchStart = matchLimit = -1;
    261 }
    262 
    263 /**
    264  * Union the set of all characters that may output by this object
    265  * into the given set.
    266  * @param toUnionTo the set into which to union the output characters
    267  */
    268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    269     // The output of this replacer varies; it is the source text between
    270     // matchStart and matchLimit.  Since this varies depending on the
    271     // input text, we can't compute it here.  We can either do nothing
    272     // or we can add ALL characters to the set.  It's probably more useful
    273     // to do nothing.
    274 }
    275 
    276 /**
    277  * Implement UnicodeFunctor
    278  */
    279 void StringMatcher::setData(const TransliterationRuleData* d) {
    280     data = d;
    281     int32_t i = 0;
    282     while (i<pattern.length()) {
    283         UChar32 c = pattern.char32At(i);
    284         UnicodeFunctor* f = data->lookup(c);
    285         if (f != NULL) {
    286             f->setData(data);
    287         }
    288         i += U16_LENGTH(c);
    289     }
    290 }
    291 
    292 U_NAMESPACE_END
    293 
    294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    295 
    296 //eof
    297