Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2001-2004, International Business Machines Corporation and    *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 import android.icu.impl.Utility;
     12 
     13 /**
     14  * An object that matches a fixed input string, implementing the
     15  * UnicodeMatcher API.  This object also implements the
     16  * UnicodeReplacer API, allowing it to emit the matched text as
     17  * output.  Since the match text may contain flexible match elements,
     18  * such as UnicodeSets, the emitted text is not the match pattern, but
     19  * instead a substring of the actual matched text.  Following
     20  * convention, the output text is the leftmost match seen up to this
     21  * point.
     22  *
     23  * A StringMatcher may represent a segment, in which case it has a
     24  * positive segment number.  This affects how the matcher converts
     25  * itself to a pattern but does not otherwise affect its function.
     26  *
     27  * A StringMatcher that is not a segment should not be used as a
     28  * UnicodeReplacer.
     29  */
     30 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
     31 
     32     /**
     33      * The text to be matched.
     34      */
     35     private String pattern;
     36 
     37     /**
     38      * Start offset, in the match text, of the <em>rightmost</em>
     39      * match.
     40      */
     41     private int matchStart;
     42 
     43     /**
     44      * Limit offset, in the match text, of the <em>rightmost</em>
     45      * match.
     46      */
     47     private int matchLimit;
     48 
     49     /**
     50      * The segment number, 1-based, or 0 if not a segment.
     51      */
     52     private int segmentNumber;
     53 
     54     /**
     55      * Context object that maps stand-ins to matcher and replacer
     56      * objects.
     57      */
     58     private final RuleBasedTransliterator.Data data;
     59 
     60     /**
     61      * Construct a matcher that matches the given pattern string.
     62      * @param theString the pattern to be matched, possibly containing
     63      * stand-ins that represent nested UnicodeMatcher objects.
     64      * @param segmentNum the segment number from 1..n, or 0 if this is
     65      * not a segment.
     66      * @param theData context object mapping stand-ins to
     67      * UnicodeMatcher objects.
     68      */
     69     public StringMatcher(String theString,
     70                          int segmentNum,
     71                          RuleBasedTransliterator.Data theData) {
     72         data = theData;
     73         pattern = theString;
     74         matchStart = matchLimit = -1;
     75         segmentNumber = segmentNum;
     76     }
     77 
     78     /**
     79      * Construct a matcher that matches a substring of the given
     80      * pattern string.
     81      * @param theString the pattern to be matched, possibly containing
     82      * stand-ins that represent nested UnicodeMatcher objects.
     83      * @param start first character of theString to be matched
     84      * @param limit index after the last character of theString to be
     85      * matched.
     86      * @param segmentNum the segment number from 1..n, or 0 if this is
     87      * not a segment.
     88      * @param theData context object mapping stand-ins to
     89      * UnicodeMatcher objects.
     90      */
     91     public StringMatcher(String theString,
     92                          int start,
     93                          int limit,
     94                          int segmentNum,
     95                          RuleBasedTransliterator.Data theData) {
     96         this(theString.substring(start, limit), segmentNum, theData);
     97     }
     98 
     99     /**
    100      * Implement UnicodeMatcher
    101      */
    102     @Override
    103     public int matches(Replaceable text,
    104                        int[] offset,
    105                        int limit,
    106                        boolean incremental) {
    107         // Note (1): We process text in 16-bit code units, rather than
    108         // 32-bit code points.  This works because stand-ins are
    109         // always in the BMP and because we are doing a literal match
    110         // operation, which can be done 16-bits at a time.
    111         int i;
    112         int[] cursor = new int[] { offset[0] };
    113         if (limit < cursor[0]) {
    114             // Match in the reverse direction
    115             for (i=pattern.length()-1; i>=0; --i) {
    116                 char keyChar = pattern.charAt(i); // OK; see note (1) above
    117                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
    118                 if (subm == null) {
    119                     if (cursor[0] > limit &&
    120                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
    121                         --cursor[0];
    122                     } else {
    123                         return U_MISMATCH;
    124                     }
    125                 } else {
    126                     int m =
    127                         subm.matches(text, cursor, limit, incremental);
    128                     if (m != U_MATCH) {
    129                         return m;
    130                     }
    131                 }
    132             }
    133             // Record the match position, but adjust for a normal
    134             // forward start, limit, and only if a prior match does not
    135             // exist -- we want the rightmost match.
    136             if (matchStart < 0) {
    137                 matchStart = cursor[0]+1;
    138                 matchLimit = offset[0]+1;
    139             }
    140         } else {
    141             for (i=0; i<pattern.length(); ++i) {
    142                 if (incremental && cursor[0] == limit) {
    143                     // We've reached the context limit without a mismatch and
    144                     // without completing our match.
    145                     return U_PARTIAL_MATCH;
    146                 }
    147                 char keyChar = pattern.charAt(i); // OK; see note (1) above
    148                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
    149                 if (subm == null) {
    150                     // Don't need the cursor < limit check if
    151                     // incremental is true (because it's done above); do need
    152                     // it otherwise.
    153                     if (cursor[0] < limit &&
    154                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
    155                         ++cursor[0];
    156                     } else {
    157                         return U_MISMATCH;
    158                     }
    159                 } else {
    160                     int m =
    161                         subm.matches(text, cursor, limit, incremental);
    162                     if (m != U_MATCH) {
    163                         return m;
    164                     }
    165                 }
    166             }
    167             // Record the match position
    168             matchStart = offset[0];
    169             matchLimit = cursor[0];
    170         }
    171 
    172         offset[0] = cursor[0];
    173         return U_MATCH;
    174     }
    175 
    176     /**
    177      * Implement UnicodeMatcher
    178      */
    179     @Override
    180     public String toPattern(boolean escapeUnprintable) {
    181         StringBuffer result = new StringBuffer();
    182         StringBuffer quoteBuf = new StringBuffer();
    183         if (segmentNumber > 0) { // i.e., if this is a segment
    184             result.append('(');
    185         }
    186         for (int i=0; i<pattern.length(); ++i) {
    187             char keyChar = pattern.charAt(i); // OK; see note (1) above
    188             UnicodeMatcher m = data.lookupMatcher(keyChar);
    189             if (m == null) {
    190                 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
    191             } else {
    192                 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
    193                                      true, escapeUnprintable, quoteBuf);
    194             }
    195         }
    196         if (segmentNumber > 0) { // i.e., if this is a segment
    197             result.append(')');
    198         }
    199         // Flush quoteBuf out to result
    200         Utility.appendToRule(result, -1,
    201                              true, escapeUnprintable, quoteBuf);
    202         return result.toString();
    203     }
    204 
    205     /**
    206      * Implement UnicodeMatcher
    207      */
    208     @Override
    209     public boolean matchesIndexValue(int v) {
    210         if (pattern.length() == 0) {
    211             return true;
    212         }
    213         int c = UTF16.charAt(pattern, 0);
    214         UnicodeMatcher m = data.lookupMatcher(c);
    215         return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
    216     }
    217 
    218     /**
    219      * Implementation of UnicodeMatcher API.  Union the set of all
    220      * characters that may be matched by this object into the given
    221      * set.
    222      * @param toUnionTo the set into which to union the source characters
    223      */
    224     @Override
    225     public void addMatchSetTo(UnicodeSet toUnionTo) {
    226         int ch;
    227         for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
    228             ch = UTF16.charAt(pattern, i);
    229             UnicodeMatcher matcher = data.lookupMatcher(ch);
    230             if (matcher == null) {
    231                 toUnionTo.add(ch);
    232             } else {
    233                 matcher.addMatchSetTo(toUnionTo);
    234             }
    235         }
    236     }
    237 
    238     /**
    239      * UnicodeReplacer API
    240      */
    241     @Override
    242     public int replace(Replaceable text,
    243                        int start,
    244                        int limit,
    245                        int[] cursor) {
    246 
    247         int outLen = 0;
    248 
    249         // Copy segment with out-of-band data
    250         int dest = limit;
    251         // If there was no match, that means that a quantifier
    252         // matched zero-length.  E.g., x (a)* y matched "xy".
    253         if (matchStart >= 0) {
    254             if (matchStart != matchLimit) {
    255                 text.copy(matchStart, matchLimit, dest);
    256                 outLen = matchLimit - matchStart;
    257             }
    258         }
    259 
    260         text.replace(start, limit, ""); // delete original text
    261 
    262         return outLen;
    263     }
    264 
    265     /**
    266      * UnicodeReplacer API
    267      */
    268     @Override
    269     public String toReplacerPattern(boolean escapeUnprintable) {
    270         // assert(segmentNumber > 0);
    271         StringBuffer rule = new StringBuffer("$");
    272         Utility.appendNumber(rule, segmentNumber, 10, 1);
    273         return rule.toString();
    274     }
    275 
    276     /**
    277      * Remove any match data.  This must be called before performing a
    278      * set of matches with this segment.
    279      */
    280     public void resetMatch() {
    281         matchStart = matchLimit = -1;
    282     }
    283 
    284     /**
    285      * Union the set of all characters that may output by this object
    286      * into the given set.
    287      * @param toUnionTo the set into which to union the output characters
    288      */
    289     @Override
    290     public void addReplacementSetTo(UnicodeSet toUnionTo) {
    291         // The output of this replacer varies; it is the source text between
    292         // matchStart and matchLimit.  Since this varies depending on the
    293         // input text, we can't compute it here.  We can either do nothing
    294         // or we can add ALL characters to the set.  It's probably more useful
    295         // to do nothing.
    296     }
    297 }
    298 
    299 //eof
    300