/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
     16 
     17 package java.util.regex;
     18 
     19 /**
     20  * The result of applying a {@code Pattern} to a given input. See {@link Pattern} for
     21  * example uses.
     22  */
     23 public final class Matcher implements MatchResult {
     24 
     25     /**
     26      * Holds the pattern, that is, the compiled regular expression.
     27      */
     28     private Pattern pattern;
     29 
     30     /**
     31      * Holds the handle for the native version of the pattern.
     32      */
     33     private int address;
     34 
     35     /**
     36      * Holds the input text.
     37      */
     38     private String input;
     39 
     40     /**
     41      * Holds the start of the region, or 0 if the matching should start at the
     42      * beginning of the text.
     43      */
     44     private int regionStart;
     45 
     46     /**
     47      * Holds the end of the region, or input.length() if the matching should
     48      * go until the end of the input.
     49      */
     50     private int regionEnd;
     51 
     52     /**
     53      * Holds the position where the next find operation will take place.
     54      */
     55     private int findPos;
     56 
     57     /**
     58      * Holds the position where the next append operation will take place.
     59      */
     60     private int appendPos;
     61 
     62     /**
     63      * Reflects whether a match has been found during the most recent find
     64      * operation.
     65      */
     66     private boolean matchFound;
     67 
     68     /**
     69      * Holds the offsets for the most recent match.
     70      */
     71     private int[] matchOffsets;
     72 
     73     /**
     74      * Reflects whether the bounds of the region are anchoring.
     75      */
     76     private boolean anchoringBounds = true;
     77 
     78     /**
     79      * Reflects whether the bounds of the region are transparent.
     80      */
     81     private boolean transparentBounds;
     82 
     83     /**
     84      * Creates a matcher for a given combination of pattern and input. Both
     85      * elements can be changed later on.
     86      *
     87      * @param pattern
     88      *            the pattern to use.
     89      * @param input
     90      *            the input to use.
     91      */
     92     Matcher(Pattern pattern, CharSequence input) {
     93         usePattern(pattern);
     94         reset(input);
     95     }
     96 
     97     /**
     98      * Appends a literal part of the input plus a replacement for the current
     99      * match to a given {@link StringBuffer}. The literal part is exactly the
    100      * part of the input between the previous match and the current match. The
    101      * method can be used in conjunction with {@link #find()} and
    102      * {@link #appendTail(StringBuffer)} to walk through the input and replace
    103      * all occurrences of the {@code Pattern} with something else.
    104      *
    105      * @param buffer
    106      *            the {@code StringBuffer} to append to.
    107      * @param replacement
    108      *            the replacement text.
    109      * @return the {@code Matcher} itself.
    110      * @throws IllegalStateException
    111      *             if no successful match has been made.
    112      */
    113     public Matcher appendReplacement(StringBuffer buffer, String replacement) {
    114         buffer.append(input.substring(appendPos, start()));
    115         appendEvaluated(buffer, replacement);
    116         appendPos = end();
    117 
    118         return this;
    119     }
    120 
    121     /**
    122      * Internal helper method to append a given string to a given string buffer.
    123      * If the string contains any references to groups, these are replaced by
    124      * the corresponding group's contents.
    125      *
    126      * @param buffer
    127      *            the string buffer.
    128      * @param s
    129      *            the string to append.
    130      */
    131     private void appendEvaluated(StringBuffer buffer, String s) {
    132         boolean escape = false;
    133         boolean dollar = false;
    134 
    135         for (int i = 0; i < s.length(); i++) {
    136             char c = s.charAt(i);
    137             if (c == '\\' && !escape) {
    138                 escape = true;
    139             } else if (c == '$' && !escape) {
    140                 dollar = true;
    141             } else if (c >= '0' && c <= '9' && dollar) {
    142                 buffer.append(group(c - '0'));
    143                 dollar = false;
    144             } else {
    145                 buffer.append(c);
    146                 dollar = false;
    147                 escape = false;
    148             }
    149         }
    150 
    151         // This seemingly stupid piece of code reproduces a JDK bug.
    152         if (escape) {
    153             throw new ArrayIndexOutOfBoundsException(s.length());
    154         }
    155     }
    156 
    157     /**
    158      * Resets the {@code Matcher}. This results in the region being set to the
    159      * whole input. Results of a previous find get lost. The next attempt to
    160      * find an occurrence of the {@link Pattern} in the string will start at the
    161      * beginning of the input.
    162      *
    163      * @return the {@code Matcher} itself.
    164      */
    165     public Matcher reset() {
    166         return reset(input, 0, input.length());
    167     }
    168 
    169     /**
    170      * Provides a new input and resets the {@code Matcher}. This results in the
    171      * region being set to the whole input. Results of a previous find get lost.
    172      * The next attempt to find an occurrence of the {@link Pattern} in the
    173      * string will start at the beginning of the input.
    174      *
    175      * @param input
    176      *            the new input sequence.
    177      *
    178      * @return the {@code Matcher} itself.
    179      */
    180     public Matcher reset(CharSequence input) {
    181         return reset(input, 0, input.length());
    182     }
    183 
    184     /**
    185      * Resets the Matcher. A new input sequence and a new region can be
    186      * specified. Results of a previous find get lost. The next attempt to find
    187      * an occurrence of the Pattern in the string will start at the beginning of
    188      * the region. This is the internal version of reset() to which the several
    189      * public versions delegate.
    190      *
    191      * @param input
    192      *            the input sequence.
    193      * @param start
    194      *            the start of the region.
    195      * @param end
    196      *            the end of the region.
    197      *
    198      * @return the matcher itself.
    199      */
    200     private Matcher reset(CharSequence input, int start, int end) {
    201         if (input == null) {
    202             throw new IllegalArgumentException();
    203         }
    204 
    205         if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) {
    206             throw new IndexOutOfBoundsException();
    207         }
    208 
    209         this.input = input.toString();
    210         this.regionStart = start;
    211         this.regionEnd = end;
    212         resetForInput();
    213 
    214         matchFound = false;
    215         findPos = regionStart;
    216         appendPos = 0;
    217 
    218         return this;
    219     }
    220 
    221     /**
    222      * Sets a new pattern for the {@code Matcher}. Results of a previous find
    223      * get lost. The next attempt to find an occurrence of the {@link Pattern}
    224      * in the string will start at the beginning of the input.
    225      *
    226      * @param pattern
    227      *            the new {@code Pattern}.
    228      *
    229      * @return the {@code Matcher} itself.
    230      */
    231     public Matcher usePattern(Pattern pattern) {
    232         if (pattern == null) {
    233             throw new IllegalArgumentException();
    234         }
    235 
    236         this.pattern = pattern;
    237 
    238         if (address != 0) {
    239             closeImpl(address);
    240             address = 0;
    241         }
    242         address = openImpl(pattern.address);
    243 
    244         if (input != null) {
    245             resetForInput();
    246         }
    247 
    248         matchOffsets = new int[(groupCount() + 1) * 2];
    249         matchFound = false;
    250         return this;
    251     }
    252 
    253     private void resetForInput() {
    254         setInputImpl(address, input, regionStart, regionEnd);
    255         useAnchoringBoundsImpl(address, anchoringBounds);
    256         useTransparentBoundsImpl(address, transparentBounds);
    257     }
    258 
    259     /**
    260      * Resets this matcher and sets a region. Only characters inside the region
    261      * are considered for a match.
    262      *
    263      * @param start
    264      *            the first character of the region.
    265      * @param end
    266      *            the first character after the end of the region.
    267      * @return the {@code Matcher} itself.
    268      */
    269     public Matcher region(int start, int end) {
    270         return reset(input, start, end);
    271     }
    272 
    273     /**
    274      * Appends the (unmatched) remainder of the input to the given
    275      * {@link StringBuffer}. The method can be used in conjunction with
    276      * {@link #find()} and {@link #appendReplacement(StringBuffer, String)} to
    277      * walk through the input and replace all matches of the {@code Pattern}
    278      * with something else.
    279      *
    280      * @param buffer
    281      *            the {@code StringBuffer} to append to.
    282      * @return the {@code StringBuffer}.
    283      * @throws IllegalStateException
    284      *             if no successful match has been made.
    285      */
    286     public StringBuffer appendTail(StringBuffer buffer) {
    287         if (appendPos < regionEnd) {
    288             buffer.append(input.substring(appendPos, regionEnd));
    289         }
    290         return buffer;
    291     }
    292 
    293     /**
    294      * Replaces the first occurrence of this matcher's pattern in the input with
    295      * a given string.
    296      *
    297      * @param replacement
    298      *            the replacement text.
    299      * @return the modified input string.
    300      */
    301     public String replaceFirst(String replacement) {
    302         reset();
    303         StringBuffer buffer = new StringBuffer(input.length());
    304         if (find()) {
    305             appendReplacement(buffer, replacement);
    306         }
    307         return appendTail(buffer).toString();
    308     }
    309 
    310     /**
    311      * Replaces all occurrences of this matcher's pattern in the input with a
    312      * given string.
    313      *
    314      * @param replacement
    315      *            the replacement text.
    316      * @return the modified input string.
    317      */
    318     public String replaceAll(String replacement) {
    319         reset();
    320         StringBuffer buffer = new StringBuffer(input.length());
    321         while (find()) {
    322             appendReplacement(buffer, replacement);
    323         }
    324         return appendTail(buffer).toString();
    325     }
    326 
    327     /**
    328      * Returns the {@link Pattern} instance used inside this matcher.
    329      *
    330      * @return the {@code Pattern} instance.
    331      */
    332     public Pattern pattern() {
    333         return pattern;
    334     }
    335 
    336     /**
    337      * Returns the text that matched a given group of the regular expression.
    338      * Explicit capturing groups in the pattern are numbered left to right in order
    339      * of their <i>opening</i> parenthesis, starting at 1.
    340      * The special group 0 represents the entire match (as if the entire pattern is surrounded
    341      * by an implicit capturing group).
    342      * For example, "a((b)c)" matching "abc" would give the following groups:
    343      * <pre>
    344      * 0 "abc"
    345      * 1 "bc"
    346      * 2 "b"
    347      * </pre>
    348      *
    349      * <p>An optional capturing group that failed to match as part of an overall
    350      * successful match (for example, "a(b)?c" matching "ac") returns null.
    351      * A capturing group that matched the empty string (for example, "a(b?)c" matching "ac")
    352      * returns the empty string.
    353      *
    354      * @throws IllegalStateException
    355      *             if no successful match has been made.
    356      */
    357     public String group(int group) {
    358         ensureMatch();
    359         int from = matchOffsets[group * 2];
    360         int to = matchOffsets[(group * 2) + 1];
    361         if (from == -1 || to == -1) {
    362             return null;
    363         } else {
    364             return input.substring(from, to);
    365         }
    366     }
    367 
    368     /**
    369      * Returns the text that matched the whole regular expression.
    370      *
    371      * @return the text.
    372      * @throws IllegalStateException
    373      *             if no successful match has been made.
    374      */
    375     public String group() {
    376         return group(0);
    377     }
    378 
    379     /**
    380      * Returns the next occurrence of the {@link Pattern} in the input. The
    381      * method starts the search from the given character in the input.
    382      *
    383      * @param start
    384      *            The index in the input at which the find operation is to
    385      *            begin. If this is less than the start of the region, it is
    386      *            automatically adjusted to that value. If it is beyond the end
    387      *            of the region, the method will fail.
    388      * @return true if (and only if) a match has been found.
    389      */
    390     public boolean find(int start) {
    391         findPos = start;
    392 
    393         if (findPos < regionStart) {
    394             findPos = regionStart;
    395         } else if (findPos >= regionEnd) {
    396             matchFound = false;
    397             return false;
    398         }
    399 
    400         matchFound = findImpl(address, input, findPos, matchOffsets);
    401         if (matchFound) {
    402             findPos = matchOffsets[1];
    403         }
    404         return matchFound;
    405     }
    406 
    407     /**
    408      * Returns the next occurrence of the {@link Pattern} in the input. If a
    409      * previous match was successful, the method continues the search from the
    410      * first character following that match in the input. Otherwise it searches
    411      * either from the region start (if one has been set), or from position 0.
    412      *
    413      * @return true if (and only if) a match has been found.
    414      */
    415     public boolean find() {
    416         matchFound = findNextImpl(address, input, matchOffsets);
    417         if (matchFound) {
    418             findPos = matchOffsets[1];
    419         }
    420         return matchFound;
    421     }
    422 
    423     /**
    424      * Tries to match the {@link Pattern}, starting from the beginning of the
    425      * region (or the beginning of the input, if no region has been set).
    426      * Doesn't require the {@code Pattern} to match against the whole region.
    427      *
    428      * @return true if (and only if) the {@code Pattern} matches.
    429      */
    430     public boolean lookingAt() {
    431         matchFound = lookingAtImpl(address, input, matchOffsets);
    432         if (matchFound) {
    433             findPos = matchOffsets[1];
    434         }
    435         return matchFound;
    436     }
    437 
    438     /**
    439      * Tries to match the {@link Pattern} against the entire region (or the
    440      * entire input, if no region has been set).
    441      *
    442      * @return true if (and only if) the {@code Pattern} matches the entire
    443      *         region.
    444      */
    445     public boolean matches() {
    446         matchFound = matchesImpl(address, input, matchOffsets);
    447         if (matchFound) {
    448             findPos = matchOffsets[1];
    449         }
    450         return matchFound;
    451     }
    452 
    453     /**
    454      * Returns the index of the first character of the text that matched a given
    455      * group.
    456      *
    457      * @param group
    458      *            the group, ranging from 0 to groupCount() - 1, with 0
    459      *            representing the whole pattern.
    460      * @return the character index.
    461      * @throws IllegalStateException
    462      *             if no successful match has been made.
    463      */
    464     public int start(int group) throws IllegalStateException {
    465         ensureMatch();
    466         return matchOffsets[group * 2];
    467     }
    468 
    469     /**
    470      * Returns the index of the first character following the text that matched
    471      * a given group.
    472      *
    473      * @param group
    474      *            the group, ranging from 0 to groupCount() - 1, with 0
    475      *            representing the whole pattern.
    476      * @return the character index.
    477      * @throws IllegalStateException
    478      *             if no successful match has been made.
    479      */
    480     public int end(int group) {
    481         ensureMatch();
    482         return matchOffsets[(group * 2) + 1];
    483     }
    484 
    485     /**
    486      * Returns a replacement string for the given one that has all backslashes
    487      * and dollar signs escaped.
    488      *
    489      * @param s
    490      *            the input string.
    491      * @return the input string, with all backslashes and dollar signs having
    492      *         been escaped.
    493      */
    494     public static String quoteReplacement(String s) {
    495         StringBuilder result = new StringBuilder(s.length());
    496         for (int i = 0; i < s.length(); i++) {
    497             char c = s.charAt(i);
    498             if (c == '\\' || c == '$') {
    499                 result.append('\\');
    500             }
    501             result.append(c);
    502         }
    503         return result.toString();
    504     }
    505 
    506     /**
    507      * Returns the index of the first character of the text that matched the
    508      * whole regular expression.
    509      *
    510      * @return the character index.
    511      * @throws IllegalStateException
    512      *             if no successful match has been made.
    513      */
    514     public int start() {
    515         return start(0);
    516     }
    517 
    518     /**
    519      * Returns the number of groups in the results, which is always equal to
    520      * the number of groups in the original regular expression.
    521      *
    522      * @return the number of groups.
    523      */
    524     public int groupCount() {
    525         return groupCountImpl(address);
    526     }
    527 
    528     /**
    529      * Returns the index of the first character following the text that matched
    530      * the whole regular expression.
    531      *
    532      * @return the character index.
    533      * @throws IllegalStateException
    534      *             if no successful match has been made.
    535      */
    536     public int end() {
    537         return end(0);
    538     }
    539 
    540     /**
    541      * Converts the current match into a separate {@link MatchResult} instance
    542      * that is independent from this matcher. The new object is unaffected when
    543      * the state of this matcher changes.
    544      *
    545      * @return the new {@code MatchResult}.
    546      * @throws IllegalStateException
    547      *             if no successful match has been made.
    548      */
    549     public MatchResult toMatchResult() {
    550         ensureMatch();
    551         return new MatchResultImpl(input, matchOffsets);
    552     }
    553 
    554     /**
    555      * Determines whether this matcher has anchoring bounds enabled or not. When
    556      * anchoring bounds are enabled, the start and end of the input match the
    557      * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled
    558      * by default.
    559      *
    560      * @param value
    561      *            the new value for anchoring bounds.
    562      * @return the {@code Matcher} itself.
    563      */
    564     public Matcher useAnchoringBounds(boolean value) {
    565         anchoringBounds = value;
    566         useAnchoringBoundsImpl(address, value);
    567         return this;
    568     }
    569 
    570     /**
    571      * Indicates whether this matcher has anchoring bounds enabled. When
    572      * anchoring bounds are enabled, the start and end of the input match the
    573      * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled
    574      * by default.
    575      *
    576      * @return true if (and only if) the {@code Matcher} uses anchoring bounds.
    577      */
    578     public boolean hasAnchoringBounds() {
    579         return anchoringBounds;
    580     }
    581 
    582     /**
    583      * Determines whether this matcher has transparent bounds enabled or not.
    584      * When transparent bounds are enabled, the parts of the input outside the
    585      * region are subject to lookahead and lookbehind, otherwise they are not.
    586      * Transparent bounds are disabled by default.
    587      *
    588      * @param value
    589      *            the new value for transparent bounds.
    590      * @return the {@code Matcher} itself.
    591      */
    592     public Matcher useTransparentBounds(boolean value) {
    593         transparentBounds = value;
    594         useTransparentBoundsImpl(address, value);
    595         return this;
    596     }
    597 
    598     /**
    599      * Makes sure that a successful match has been made. Is invoked internally
    600      * from various places in the class.
    601      *
    602      * @throws IllegalStateException
    603      *             if no successful match has been made.
    604      */
    605     private void ensureMatch() {
    606         if (!matchFound) {
    607             throw new IllegalStateException("No successful match so far");
    608         }
    609     }
    610 
    611     /**
    612      * Indicates whether this matcher has transparent bounds enabled. When
    613      * transparent bounds are enabled, the parts of the input outside the region
    614      * are subject to lookahead and lookbehind, otherwise they are not.
    615      * Transparent bounds are disabled by default.
    616      *
    617      * @return true if (and only if) the {@code Matcher} uses anchoring bounds.
    618      */
    619     public boolean hasTransparentBounds() {
    620         return transparentBounds;
    621     }
    622 
    623     /**
    624      * Returns this matcher's region start, that is, the first character that is
    625      * considered for a match.
    626      *
    627      * @return the start of the region.
    628      */
    629     public int regionStart() {
    630         return regionStart;
    631     }
    632 
    633     /**
    634      * Returns this matcher's region end, that is, the first character that is
    635      * not considered for a match.
    636      *
    637      * @return the end of the region.
    638      */
    639     public int regionEnd() {
    640         return regionEnd;
    641     }
    642 
    643     /**
    644      * Indicates whether more input might change a successful match into an
    645      * unsuccessful one.
    646      *
    647      * @return true if (and only if) more input might change a successful match
    648      *         into an unsuccessful one.
    649      */
    650     public boolean requireEnd() {
    651         return requireEndImpl(address);
    652     }
    653 
    654     /**
    655      * Indicates whether the last match hit the end of the input.
    656      *
    657      * @return true if (and only if) the last match hit the end of the input.
    658      */
    659     public boolean hitEnd() {
    660         return hitEndImpl(address);
    661     }
    662 
    663     @Override protected void finalize() throws Throwable {
    664         try {
    665             closeImpl(address);
    666         } finally {
    667             super.finalize();
    668         }
    669     }
    670 
    671     private static native void closeImpl(int addr);
    672     private static native boolean findImpl(int addr, String s, int startIndex, int[] offsets);
    673     private static native boolean findNextImpl(int addr, String s, int[] offsets);
    674     private static native int groupCountImpl(int addr);
    675     private static native boolean hitEndImpl(int addr);
    676     private static native boolean lookingAtImpl(int addr, String s, int[] offsets);
    677     private static native boolean matchesImpl(int addr, String s, int[] offsets);
    678     private static native int openImpl(int patternAddr);
    679     private static native boolean requireEndImpl(int addr);
    680     private static native void setInputImpl(int addr, String s, int start, int end);
    681     private static native void useAnchoringBoundsImpl(int addr, boolean value);
    682     private static native void useTransparentBoundsImpl(int addr, boolean value);
    683 }
    684