Home | History | Annotate | Download | only in regex
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package java.util.regex;
     18 
     19 /**
     20  * The result of applying a {@code Pattern} to a given input. See {@link Pattern} for
     21  * example uses.
     22  */
     23 public final class Matcher implements MatchResult {
     24 
     25     /**
     26      * Holds the pattern, that is, the compiled regular expression.
     27      */
     28     private Pattern pattern;
     29 
     30     /**
     31      * The address of the native peer.
     32      * Uses of this must be manually synchronized to avoid native crashes.
     33      */
     34     private long address;
     35 
     36     /**
     37      * Holds the input text.
     38      */
     39     private String input;
     40 
     41     /**
     42      * Holds the start of the region, or 0 if the matching should start at the
     43      * beginning of the text.
     44      */
     45     private int regionStart;
     46 
     47     /**
     48      * Holds the end of the region, or input.length() if the matching should
     49      * go until the end of the input.
     50      */
     51     private int regionEnd;
     52 
     53     /**
     54      * Holds the position where the next append operation will take place.
     55      */
     56     private int appendPos;
     57 
     58     /**
     59      * Reflects whether a match has been found during the most recent find
     60      * operation.
     61      */
     62     private boolean matchFound;
     63 
     64     /**
     65      * Holds the offsets for the most recent match.
     66      */
     67     private int[] matchOffsets;
     68 
     69     /**
     70      * Reflects whether the bounds of the region are anchoring.
     71      */
     72     private boolean anchoringBounds = true;
     73 
     74     /**
     75      * Reflects whether the bounds of the region are transparent.
     76      */
     77     private boolean transparentBounds;
     78 
     79     /**
     80      * Creates a matcher for a given combination of pattern and input. Both
     81      * elements can be changed later on.
     82      *
     83      * @param pattern
     84      *            the pattern to use.
     85      * @param input
     86      *            the input to use.
     87      */
     88     Matcher(Pattern pattern, CharSequence input) {
     89         usePattern(pattern);
     90         reset(input);
     91     }
     92 
     93     /**
     94      * Appends a literal part of the input plus a replacement for the current
     95      * match to a given {@link StringBuffer}. The literal part is exactly the
     96      * part of the input between the previous match and the current match. The
     97      * method can be used in conjunction with {@link #find()} and
     98      * {@link #appendTail(StringBuffer)} to walk through the input and replace
     99      * all occurrences of the {@code Pattern} with something else.
    100      *
    101      * @param buffer
    102      *            the {@code StringBuffer} to append to.
    103      * @param replacement
    104      *            the replacement text.
    105      * @return the {@code Matcher} itself.
    106      * @throws IllegalStateException
    107      *             if no successful match has been made.
    108      */
    109     public Matcher appendReplacement(StringBuffer buffer, String replacement) {
    110         buffer.append(input.substring(appendPos, start()));
    111         appendEvaluated(buffer, replacement);
    112         appendPos = end();
    113 
    114         return this;
    115     }
    116 
    117     /**
    118      * Internal helper method to append a given string to a given string buffer.
    119      * If the string contains any references to groups, these are replaced by
    120      * the corresponding group's contents.
    121      *
    122      * @param buffer
    123      *            the string buffer.
    124      * @param s
    125      *            the string to append.
    126      */
    127     private void appendEvaluated(StringBuffer buffer, String s) {
    128         boolean escape = false;
    129         boolean dollar = false;
    130 
    131         for (int i = 0; i < s.length(); i++) {
    132             char c = s.charAt(i);
    133             if (c == '\\' && !escape) {
    134                 escape = true;
    135             } else if (c == '$' && !escape) {
    136                 dollar = true;
    137             } else if (c >= '0' && c <= '9' && dollar) {
    138                 buffer.append(group(c - '0'));
    139                 dollar = false;
    140             } else {
    141                 buffer.append(c);
    142                 dollar = false;
    143                 escape = false;
    144             }
    145         }
    146 
    147         // This seemingly stupid piece of code reproduces a JDK bug.
    148         if (escape) {
    149             throw new ArrayIndexOutOfBoundsException(s.length());
    150         }
    151     }
    152 
    153     /**
    154      * Resets the {@code Matcher}. This results in the region being set to the
    155      * whole input. Results of a previous find get lost. The next attempt to
    156      * find an occurrence of the {@link Pattern} in the string will start at the
    157      * beginning of the input.
    158      *
    159      * @return the {@code Matcher} itself.
    160      */
    161     public Matcher reset() {
    162         return reset(input, 0, input.length());
    163     }
    164 
    165     /**
    166      * Provides a new input and resets the {@code Matcher}. This results in the
    167      * region being set to the whole input. Results of a previous find get lost.
    168      * The next attempt to find an occurrence of the {@link Pattern} in the
    169      * string will start at the beginning of the input.
    170      *
    171      * @param input
    172      *            the new input sequence.
    173      *
    174      * @return the {@code Matcher} itself.
    175      */
    176     public Matcher reset(CharSequence input) {
    177         return reset(input, 0, input.length());
    178     }
    179 
    180     /**
    181      * Resets the Matcher. A new input sequence and a new region can be
    182      * specified. Results of a previous find get lost. The next attempt to find
    183      * an occurrence of the Pattern in the string will start at the beginning of
    184      * the region. This is the internal version of reset() to which the several
    185      * public versions delegate.
    186      *
    187      * @param input
    188      *            the input sequence.
    189      * @param start
    190      *            the start of the region.
    191      * @param end
    192      *            the end of the region.
    193      *
    194      * @return the matcher itself.
    195      */
    196     private Matcher reset(CharSequence input, int start, int end) {
    197         if (input == null) {
    198             throw new IllegalArgumentException("input == null");
    199         }
    200 
    201         if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) {
    202             throw new IndexOutOfBoundsException();
    203         }
    204 
    205         this.input = input.toString();
    206         this.regionStart = start;
    207         this.regionEnd = end;
    208         resetForInput();
    209 
    210         matchFound = false;
    211         appendPos = 0;
    212 
    213         return this;
    214     }
    215 
    216     /**
    217      * Sets a new pattern for the {@code Matcher}. Results of a previous find
    218      * get lost. The next attempt to find an occurrence of the {@link Pattern}
    219      * in the string will start at the beginning of the input.
    220      *
    221      * @param pattern
    222      *            the new {@code Pattern}.
    223      *
    224      * @return the {@code Matcher} itself.
    225      */
    226     public Matcher usePattern(Pattern pattern) {
    227         if (pattern == null) {
    228             throw new IllegalArgumentException("pattern == null");
    229         }
    230 
    231         this.pattern = pattern;
    232 
    233         synchronized (this) {
    234             if (address != 0) {
    235                 closeImpl(address);
    236                 address = 0; // In case openImpl throws.
    237             }
    238             address = openImpl(pattern.address);
    239         }
    240 
    241         if (input != null) {
    242             resetForInput();
    243         }
    244 
    245         matchOffsets = new int[(groupCount() + 1) * 2];
    246         matchFound = false;
    247         return this;
    248     }
    249 
    250     private void resetForInput() {
    251         synchronized (this) {
    252             setInputImpl(address, input, regionStart, regionEnd);
    253             useAnchoringBoundsImpl(address, anchoringBounds);
    254             useTransparentBoundsImpl(address, transparentBounds);
    255         }
    256     }
    257 
    258     /**
    259      * Resets this matcher and sets a region. Only characters inside the region
    260      * are considered for a match.
    261      *
    262      * @param start
    263      *            the first character of the region.
    264      * @param end
    265      *            the first character after the end of the region.
    266      * @return the {@code Matcher} itself.
    267      */
    268     public Matcher region(int start, int end) {
    269         return reset(input, start, end);
    270     }
    271 
    272     /**
    273      * Appends the (unmatched) remainder of the input to the given
    274      * {@link StringBuffer}. The method can be used in conjunction with
    275      * {@link #find()} and {@link #appendReplacement(StringBuffer, String)} to
    276      * walk through the input and replace all matches of the {@code Pattern}
    277      * with something else.
    278      *
    279      * @param buffer
    280      *            the {@code StringBuffer} to append to.
    281      * @return the {@code StringBuffer}.
    282      * @throws IllegalStateException
    283      *             if no successful match has been made.
    284      */
    285     public StringBuffer appendTail(StringBuffer buffer) {
    286         if (appendPos < regionEnd) {
    287             buffer.append(input.substring(appendPos, regionEnd));
    288         }
    289         return buffer;
    290     }
    291 
    292     /**
    293      * Replaces the first occurrence of this matcher's pattern in the input with
    294      * a given string.
    295      *
    296      * @param replacement
    297      *            the replacement text.
    298      * @return the modified input string.
    299      */
    300     public String replaceFirst(String replacement) {
    301         reset();
    302         StringBuffer buffer = new StringBuffer(input.length());
    303         if (find()) {
    304             appendReplacement(buffer, replacement);
    305         }
    306         return appendTail(buffer).toString();
    307     }
    308 
    309     /**
    310      * Replaces all occurrences of this matcher's pattern in the input with a
    311      * given string.
    312      *
    313      * @param replacement
    314      *            the replacement text.
    315      * @return the modified input string.
    316      */
    317     public String replaceAll(String replacement) {
    318         reset();
    319         StringBuffer buffer = new StringBuffer(input.length());
    320         while (find()) {
    321             appendReplacement(buffer, replacement);
    322         }
    323         return appendTail(buffer).toString();
    324     }
    325 
    326     /**
    327      * Returns the {@link Pattern} instance used inside this matcher.
    328      *
    329      * @return the {@code Pattern} instance.
    330      */
    331     public Pattern pattern() {
    332         return pattern;
    333     }
    334 
    335     /**
    336      * Returns the text that matched a given group of the regular expression.
    337      * Explicit capturing groups in the pattern are numbered left to right in order
    338      * of their <i>opening</i> parenthesis, starting at 1.
    339      * The special group 0 represents the entire match (as if the entire pattern is surrounded
    340      * by an implicit capturing group).
    341      * For example, "a((b)c)" matching "abc" would give the following groups:
    342      * <pre>
    343      * 0 "abc"
    344      * 1 "bc"
    345      * 2 "b"
    346      * </pre>
    347      *
    348      * <p>An optional capturing group that failed to match as part of an overall
    349      * successful match (for example, "a(b)?c" matching "ac") returns null.
    350      * A capturing group that matched the empty string (for example, "a(b?)c" matching "ac")
    351      * returns the empty string.
    352      *
    353      * @throws IllegalStateException
    354      *             if no successful match has been made.
    355      */
    356     public String group(int group) {
    357         ensureMatch();
    358         int from = matchOffsets[group * 2];
    359         int to = matchOffsets[(group * 2) + 1];
    360         if (from == -1 || to == -1) {
    361             return null;
    362         } else {
    363             return input.substring(from, to);
    364         }
    365     }
    366 
    367     /**
    368      * Returns the text that matched the whole regular expression.
    369      *
    370      * @return the text.
    371      * @throws IllegalStateException
    372      *             if no successful match has been made.
    373      */
    374     public String group() {
    375         return group(0);
    376     }
    377 
    378     /**
    379      * Returns true if there is another match in the input, starting
    380      * from the given position. The region is ignored.
    381      *
    382      * @throws IndexOutOfBoundsException if {@code start < 0 || start > input.length()}
    383      */
    384     public boolean find(int start) {
    385         if (start < 0 || start > input.length()) {
    386             throw new IndexOutOfBoundsException("start=" + start + "; length=" + input.length());
    387         }
    388 
    389         synchronized (this) {
    390             matchFound = findImpl(address, input, start, matchOffsets);
    391         }
    392         return matchFound;
    393     }
    394 
    395     /**
    396      * Returns the next occurrence of the {@link Pattern} in the input. If a
    397      * previous match was successful, the method continues the search from the
    398      * first character following that match in the input. Otherwise it searches
    399      * either from the region start (if one has been set), or from position 0.
    400      *
    401      * @return true if (and only if) a match has been found.
    402      */
    403     public boolean find() {
    404         synchronized (this) {
    405             matchFound = findNextImpl(address, input, matchOffsets);
    406         }
    407         return matchFound;
    408     }
    409 
    410     /**
    411      * Tries to match the {@link Pattern}, starting from the beginning of the
    412      * region (or the beginning of the input, if no region has been set).
    413      * Doesn't require the {@code Pattern} to match against the whole region.
    414      *
    415      * @return true if (and only if) the {@code Pattern} matches.
    416      */
    417     public boolean lookingAt() {
    418         synchronized (this) {
    419             matchFound = lookingAtImpl(address, input, matchOffsets);
    420         }
    421         return matchFound;
    422     }
    423 
    424     /**
    425      * Tries to match the {@link Pattern} against the entire region (or the
    426      * entire input, if no region has been set).
    427      *
    428      * @return true if (and only if) the {@code Pattern} matches the entire
    429      *         region.
    430      */
    431     public boolean matches() {
    432         synchronized (this) {
    433             matchFound = matchesImpl(address, input, matchOffsets);
    434         }
    435         return matchFound;
    436     }
    437 
    438     /**
    439      * Returns the index of the first character of the text that matched a given
    440      * group.
    441      *
    442      * @param group
    443      *            the group, ranging from 0 to groupCount() - 1, with 0
    444      *            representing the whole pattern.
    445      * @return the character index.
    446      * @throws IllegalStateException
    447      *             if no successful match has been made.
    448      */
    449     public int start(int group) throws IllegalStateException {
    450         ensureMatch();
    451         return matchOffsets[group * 2];
    452     }
    453 
    454     /**
    455      * Returns the index of the first character following the text that matched
    456      * a given group.
    457      *
    458      * @param group
    459      *            the group, ranging from 0 to groupCount() - 1, with 0
    460      *            representing the whole pattern.
    461      * @return the character index.
    462      * @throws IllegalStateException
    463      *             if no successful match has been made.
    464      */
    465     public int end(int group) {
    466         ensureMatch();
    467         return matchOffsets[(group * 2) + 1];
    468     }
    469 
    470     /**
    471      * Returns a replacement string for the given one that has all backslashes
    472      * and dollar signs escaped.
    473      *
    474      * @param s
    475      *            the input string.
    476      * @return the input string, with all backslashes and dollar signs having
    477      *         been escaped.
    478      */
    479     public static String quoteReplacement(String s) {
    480         StringBuilder result = new StringBuilder(s.length());
    481         for (int i = 0; i < s.length(); i++) {
    482             char c = s.charAt(i);
    483             if (c == '\\' || c == '$') {
    484                 result.append('\\');
    485             }
    486             result.append(c);
    487         }
    488         return result.toString();
    489     }
    490 
    491     /**
    492      * Returns the index of the first character of the text that matched the
    493      * whole regular expression.
    494      *
    495      * @return the character index.
    496      * @throws IllegalStateException
    497      *             if no successful match has been made.
    498      */
    499     public int start() {
    500         return start(0);
    501     }
    502 
    503     /**
    504      * Returns the number of groups in the results, which is always equal to
    505      * the number of groups in the original regular expression.
    506      *
    507      * @return the number of groups.
    508      */
    509     public int groupCount() {
    510         synchronized (this) {
    511             return groupCountImpl(address);
    512         }
    513     }
    514 
    515     /**
    516      * Returns the index of the first character following the text that matched
    517      * the whole regular expression.
    518      *
    519      * @return the character index.
    520      * @throws IllegalStateException
    521      *             if no successful match has been made.
    522      */
    523     public int end() {
    524         return end(0);
    525     }
    526 
    527     /**
    528      * Converts the current match into a separate {@link MatchResult} instance
    529      * that is independent from this matcher. The new object is unaffected when
    530      * the state of this matcher changes.
    531      *
    532      * @return the new {@code MatchResult}.
    533      * @throws IllegalStateException
    534      *             if no successful match has been made.
    535      */
    536     public MatchResult toMatchResult() {
    537         ensureMatch();
    538         return new MatchResultImpl(input, matchOffsets);
    539     }
    540 
    541     /**
    542      * Determines whether this matcher has anchoring bounds enabled or not. When
    543      * anchoring bounds are enabled, the start and end of the input match the
    544      * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled
    545      * by default.
    546      *
    547      * @param value
    548      *            the new value for anchoring bounds.
    549      * @return the {@code Matcher} itself.
    550      */
    551     public Matcher useAnchoringBounds(boolean value) {
    552         synchronized (this) {
    553             anchoringBounds = value;
    554             useAnchoringBoundsImpl(address, value);
    555         }
    556         return this;
    557     }
    558 
    559     /**
    560      * Indicates whether this matcher has anchoring bounds enabled. When
    561      * anchoring bounds are enabled, the start and end of the input match the
    562      * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled
    563      * by default.
    564      *
    565      * @return true if (and only if) the {@code Matcher} uses anchoring bounds.
    566      */
    567     public boolean hasAnchoringBounds() {
    568         return anchoringBounds;
    569     }
    570 
    571     /**
    572      * Determines whether this matcher has transparent bounds enabled or not.
    573      * When transparent bounds are enabled, the parts of the input outside the
    574      * region are subject to lookahead and lookbehind, otherwise they are not.
    575      * Transparent bounds are disabled by default.
    576      *
    577      * @param value
    578      *            the new value for transparent bounds.
    579      * @return the {@code Matcher} itself.
    580      */
    581     public Matcher useTransparentBounds(boolean value) {
    582         synchronized (this) {
    583             transparentBounds = value;
    584             useTransparentBoundsImpl(address, value);
    585         }
    586         return this;
    587     }
    588 
    589     /**
    590      * Makes sure that a successful match has been made. Is invoked internally
    591      * from various places in the class.
    592      *
    593      * @throws IllegalStateException
    594      *             if no successful match has been made.
    595      */
    596     private void ensureMatch() {
    597         if (!matchFound) {
    598             throw new IllegalStateException("No successful match so far");
    599         }
    600     }
    601 
    602     /**
    603      * Indicates whether this matcher has transparent bounds enabled. When
    604      * transparent bounds are enabled, the parts of the input outside the region
    605      * are subject to lookahead and lookbehind, otherwise they are not.
    606      * Transparent bounds are disabled by default.
    607      *
    608      * @return true if (and only if) the {@code Matcher} uses anchoring bounds.
    609      */
    610     public boolean hasTransparentBounds() {
    611         return transparentBounds;
    612     }
    613 
    614     /**
    615      * Returns this matcher's region start, that is, the index of the first character that is
    616      * considered for a match.
    617      */
    618     public int regionStart() {
    619         return regionStart;
    620     }
    621 
    622     /**
    623      * Returns this matcher's region end, that is, the index of the first character that is
    624      * not considered for a match.
    625      */
    626     public int regionEnd() {
    627         return regionEnd;
    628     }
    629 
    630     /**
    631      * Returns true if and only if more input might change a successful match into an
    632      * unsuccessful one.
    633      */
    634     public boolean requireEnd() {
    635         synchronized (this) {
    636             return requireEndImpl(address);
    637         }
    638     }
    639 
    640     /**
    641      * Returns true if and only if the last match hit the end of the input.
    642      */
    643     public boolean hitEnd() {
    644         synchronized (this) {
    645             return hitEndImpl(address);
    646         }
    647     }
    648 
    649     @Override protected void finalize() throws Throwable {
    650         try {
    651             synchronized (this) {
    652                 closeImpl(address);
    653             }
    654         } finally {
    655             super.finalize();
    656         }
    657     }
    658 
    659     private static native void closeImpl(long addr);
    660     private static native boolean findImpl(long addr, String s, int startIndex, int[] offsets);
    661     private static native boolean findNextImpl(long addr, String s, int[] offsets);
    662     private static native int groupCountImpl(long addr);
    663     private static native boolean hitEndImpl(long addr);
    664     private static native boolean lookingAtImpl(long addr, String s, int[] offsets);
    665     private static native boolean matchesImpl(long addr, String s, int[] offsets);
    666     private static native long openImpl(long patternAddr);
    667     private static native boolean requireEndImpl(long addr);
    668     private static native void setInputImpl(long addr, String s, int start, int end);
    669     private static native void useAnchoringBoundsImpl(long addr, boolean value);
    670     private static native void useTransparentBoundsImpl(long addr, boolean value);
    671 }
    672