Home | History | Annotate | Download | only in phonenumbers
      1 /*
      2  * Copyright (C) 2011 The Libphonenumber Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.i18n.phonenumbers;
     18 
     19 import com.android.i18n.phonenumbers.PhoneNumberUtil.Leniency;
     20 import com.android.i18n.phonenumbers.Phonenumber.PhoneNumber;
     21 
     22 import java.lang.Character.UnicodeBlock;
     23 import java.util.Iterator;
     24 import java.util.NoSuchElementException;
     25 import java.util.regex.Matcher;
     26 import java.util.regex.Pattern;
     27 
     28 /**
     29  * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}.
     30  * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in
     31  * {@link PhoneNumberUtil}.
     32  *
     33  * <p>Vanity numbers (phone numbers using alphabetic digits such as <tt>1-800-SIX-FLAGS</tt> are
     34  * not found.
     35  *
     36  * <p>This class is not thread-safe.
     37  *
     38  * @author Tom Hofmann
     39  */
     40 final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
     41   /**
     42    * The phone number pattern used by {@link #find}, similar to
     43    * {@code PhoneNumberUtil.VALID_PHONE_NUMBER}, but with the following differences:
     44    * <ul>
     45    *   <li>All captures are limited in order to place an upper bound to the text matched by the
     46    *       pattern.
     47    * <ul>
     48    *   <li>Leading punctuation / plus signs are limited.
     49    *   <li>Consecutive occurrences of punctuation are limited.
     50    *   <li>Number of digits is limited.
     51    * </ul>
     52    *   <li>No whitespace is allowed at the start or end.
     53    *   <li>No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported.
     54    * </ul>
     55    */
     56   private static final Pattern PATTERN;
     57   /**
     58    * Matches strings that look like publication pages. Example:
     59    * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
     60    * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre>
     61    *
     62    * The string "211-227 (2003)" is not a telephone number.
     63    */
     64   private static final Pattern PUB_PAGES = Pattern.compile("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}");
     65 
     66   /**
     67    * Matches strings that look like dates using "/" as a separator. Examples: 3/10/2011, 31/10/96 or
     68    * 08/31/95.
     69    */
     70   private static final Pattern SLASH_SEPARATED_DATES =
     71       Pattern.compile("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}");
     72 
     73   /**
     74    * Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does not include the
     75    * trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
     76    */
     77   private static final Pattern TIME_STAMPS =
     78       Pattern.compile("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$");
     79   private static final Pattern TIME_STAMPS_SUFFIX = Pattern.compile(":[0-5]\\d");
     80 
     81   /**
     82    * Pattern to check that brackets match. Opening brackets should be closed within a phone number.
     83    * This also checks that there is something inside the brackets. Having no brackets at all is also
     84    * fine.
     85    */
     86   private static final Pattern MATCHING_BRACKETS;
     87 
     88   /**
     89    * Matches white-space, which may indicate the end of a phone number and the start of something
     90    * else (such as a neighbouring zip-code). If white-space is found, continues to match all
     91    * characters that are not typically used to start a phone number.
     92    */
     93   private static final Pattern GROUP_SEPARATOR;
     94 
     95   /**
     96    * Punctuation that may be at the start of a phone number - brackets and plus signs.
     97    */
     98   private static final Pattern LEAD_CLASS;
     99 
    100   static {
    101     /* Builds the MATCHING_BRACKETS and PATTERN regular expressions. The building blocks below exist
    102      * to make the pattern more easily understood. */
    103 
    104     String openingParens = "(\\[\uFF08\uFF3B";
    105     String closingParens = ")\\]\uFF09\uFF3D";
    106     String nonParens = "[^" + openingParens + closingParens + "]";
    107 
    108     /* Limit on the number of pairs of brackets in a phone number. */
    109     String bracketPairLimit = limit(0, 3);
    110     /*
    111      * An opening bracket at the beginning may not be closed, but subsequent ones should be.  It's
    112      * also possible that the leading bracket was dropped, so we shouldn't be surprised if we see a
    113      * closing bracket first. We limit the sets of brackets in a phone number to four.
    114      */
    115     MATCHING_BRACKETS = Pattern.compile(
    116         "(?:[" + openingParens + "])?" + "(?:" + nonParens + "+" + "[" + closingParens + "])?" +
    117         nonParens + "+" +
    118         "(?:[" + openingParens + "]" + nonParens + "+[" + closingParens + "])" + bracketPairLimit +
    119         nonParens + "*");
    120 
    121     /* Limit on the number of leading (plus) characters. */
    122     String leadLimit = limit(0, 2);
    123     /* Limit on the number of consecutive punctuation characters. */
    124     String punctuationLimit = limit(0, 4);
    125     /* The maximum number of digits allowed in a digit-separated block. As we allow all digits in a
    126      * single block, set high enough to accommodate the entire national number and the international
    127      * country code. */
    128     int digitBlockLimit =
    129         PhoneNumberUtil.MAX_LENGTH_FOR_NSN + PhoneNumberUtil.MAX_LENGTH_COUNTRY_CODE;
    130     /* Limit on the number of blocks separated by punctuation. Uses digitBlockLimit since some
    131      * formats use spaces to separate each digit. */
    132     String blockLimit = limit(0, digitBlockLimit);
    133 
    134     /* A punctuation sequence allowing white space. */
    135     String punctuation = "[" + PhoneNumberUtil.VALID_PUNCTUATION + "]" + punctuationLimit;
    136     /* A digits block without punctuation. */
    137     String digitSequence = "\\p{Nd}" + limit(1, digitBlockLimit);
    138 
    139     String leadClassChars = openingParens + PhoneNumberUtil.PLUS_CHARS;
    140     String leadClass = "[" + leadClassChars + "]";
    141     LEAD_CLASS = Pattern.compile(leadClass);
    142     GROUP_SEPARATOR = Pattern.compile("\\p{Z}" + "[^" + leadClassChars  + "\\p{Nd}]*");
    143 
    144     /* Phone number pattern allowing optional punctuation. */
    145     PATTERN = Pattern.compile(
    146         "(?:" + leadClass + punctuation + ")" + leadLimit +
    147         digitSequence + "(?:" + punctuation + digitSequence + ")" + blockLimit +
    148         "(?:" + PhoneNumberUtil.EXTN_PATTERNS_FOR_MATCHING + ")?",
    149         PhoneNumberUtil.REGEX_FLAGS);
    150   }
    151 
    152   /** Returns a regular expression quantifier with an upper and lower limit. */
    153   private static String limit(int lower, int upper) {
    154     if ((lower < 0) || (upper <= 0) || (upper < lower)) {
    155       throw new IllegalArgumentException();
    156     }
    157     return "{" + lower + "," + upper + "}";
    158   }
    159 
    160   /** The potential states of a PhoneNumberMatcher. */
    161   private enum State {
    162     NOT_READY, READY, DONE
    163   }
    164 
    165   /** The phone number utility. */
    166   private final PhoneNumberUtil phoneUtil;
    167   /** The text searched for phone numbers. */
    168   private final CharSequence text;
    169   /**
    170    * The region (country) to assume for phone numbers without an international prefix, possibly
    171    * null.
    172    */
    173   private final String preferredRegion;
    174   /** The degree of validation requested. */
    175   private final Leniency leniency;
    176   /** The maximum number of retries after matching an invalid number. */
    177   private long maxTries;
    178 
    179   /** The iteration tristate. */
    180   private State state = State.NOT_READY;
    181   /** The last successful match, null unless in {@link State#READY}. */
    182   private PhoneNumberMatch lastMatch = null;
    183   /** The next index to start searching at. Undefined in {@link State#DONE}. */
    184   private int searchIndex = 0;
    185 
    186   /**
    187    * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a
    188    * new instance.
    189    *
    190    * @param util      the phone number util to use
    191    * @param text      the character sequence that we will search, null for no text
    192    * @param country   the country to assume for phone numbers not written in international format
    193    *                  (with a leading plus, or with the international dialing prefix of the
    194    *                  specified region). May be null or "ZZ" if only numbers with a
    195    *                  leading plus should be considered.
    196    * @param leniency  the leniency to use when evaluating candidate phone numbers
    197    * @param maxTries  the maximum number of invalid numbers to try before giving up on the text.
    198    *                  This is to cover degenerate cases where the text has a lot of false positives
    199    *                  in it. Must be {@code >= 0}.
    200    */
    201   PhoneNumberMatcher(PhoneNumberUtil util, CharSequence text, String country, Leniency leniency,
    202       long maxTries) {
    203 
    204     if ((util == null) || (leniency == null)) {
    205       throw new NullPointerException();
    206     }
    207     if (maxTries < 0) {
    208       throw new IllegalArgumentException();
    209     }
    210     this.phoneUtil = util;
    211     this.text = (text != null) ? text : "";
    212     this.preferredRegion = country;
    213     this.leniency = leniency;
    214     this.maxTries = maxTries;
    215   }
    216 
    217   public boolean hasNext() {
    218     if (state == State.NOT_READY) {
    219       lastMatch = find(searchIndex);
    220       if (lastMatch == null) {
    221         state = State.DONE;
    222       } else {
    223         searchIndex = lastMatch.end();
    224         state = State.READY;
    225       }
    226     }
    227     return state == State.READY;
    228   }
    229 
    230   public PhoneNumberMatch next() {
    231     // Check the state and find the next match as a side-effect if necessary.
    232     if (!hasNext()) {
    233       throw new NoSuchElementException();
    234     }
    235 
    236     // Don't retain that memory any longer than necessary.
    237     PhoneNumberMatch result = lastMatch;
    238     lastMatch = null;
    239     state = State.NOT_READY;
    240     return result;
    241   }
    242 
    243   /**
    244    * Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex}
    245    * that represents a phone number. Returns the next match, null if none was found.
    246    *
    247    * @param index  the search index to start searching at
    248    * @return  the phone number match found, null if none can be found
    249    */
    250   private PhoneNumberMatch find(int index) {
    251     Matcher matcher = PATTERN.matcher(text);
    252     while ((maxTries > 0) && matcher.find(index)) {
    253       int start = matcher.start();
    254       CharSequence candidate = text.subSequence(start, matcher.end());
    255 
    256       // Check for extra numbers at the end.
    257       // TODO: This is the place to start when trying to support extraction of multiple phone number
    258       // from split notations (+41 79 123 45 67 / 68).
    259       candidate = trimAfterFirstMatch(PhoneNumberUtil.SECOND_NUMBER_START_PATTERN, candidate);
    260 
    261       PhoneNumberMatch match = extractMatch(candidate, start);
    262       if (match != null) {
    263         return match;
    264       }
    265 
    266       index = start + candidate.length();
    267       maxTries--;
    268     }
    269 
    270     return null;
    271   }
    272 
    273   /**
    274    * Trims away any characters after the first match of {@code pattern} in {@code candidate},
    275    * returning the trimmed version.
    276    */
    277   private static CharSequence trimAfterFirstMatch(Pattern pattern, CharSequence candidate) {
    278     Matcher trailingCharsMatcher = pattern.matcher(candidate);
    279     if (trailingCharsMatcher.find()) {
    280       candidate = candidate.subSequence(0, trailingCharsMatcher.start());
    281     }
    282     return candidate;
    283   }
    284 
    285   /**
    286    * Helper method to determine if a character is a Latin-script letter or not. For our purposes,
    287    * combining marks should also return true since we assume they have been added to a preceding
    288    * Latin character.
    289    */
    290   // @VisibleForTesting
    291   static boolean isLatinLetter(char letter) {
    292     // Combining marks are a subset of non-spacing-mark.
    293     if (!Character.isLetter(letter) && Character.getType(letter) != Character.NON_SPACING_MARK) {
    294       return false;
    295     }
    296     UnicodeBlock block = UnicodeBlock.of(letter);
    297     return block.equals(UnicodeBlock.BASIC_LATIN) ||
    298         block.equals(UnicodeBlock.LATIN_1_SUPPLEMENT) ||
    299         block.equals(UnicodeBlock.LATIN_EXTENDED_A) ||
    300         block.equals(UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) ||
    301         block.equals(UnicodeBlock.LATIN_EXTENDED_B) ||
    302         block.equals(UnicodeBlock.COMBINING_DIACRITICAL_MARKS);
    303   }
    304 
    305   private static boolean isInvalidPunctuationSymbol(char character) {
    306     return character == '%' || Character.getType(character) == Character.CURRENCY_SYMBOL;
    307   }
    308 
    309   /**
    310    * Attempts to extract a match from a {@code candidate} character sequence.
    311    *
    312    * @param candidate  the candidate text that might contain a phone number
    313    * @param offset  the offset of {@code candidate} within {@link #text}
    314    * @return  the match found, null if none can be found
    315    */
    316   private PhoneNumberMatch extractMatch(CharSequence candidate, int offset) {
    317     // Skip a match that is more likely a publication page reference or a date.
    318     if (PUB_PAGES.matcher(candidate).find() || SLASH_SEPARATED_DATES.matcher(candidate).find()) {
    319       return null;
    320     }
    321     // Skip potential time-stamps.
    322     if (TIME_STAMPS.matcher(candidate).find()) {
    323       String followingText = text.toString().substring(offset + candidate.length());
    324       if (TIME_STAMPS_SUFFIX.matcher(followingText).lookingAt()) {
    325         return null;
    326       }
    327     }
    328 
    329     // Try to come up with a valid match given the entire candidate.
    330     String rawString = candidate.toString();
    331     PhoneNumberMatch match = parseAndVerify(rawString, offset);
    332     if (match != null) {
    333       return match;
    334     }
    335 
    336     // If that failed, try to find an "inner match" - there might be a phone number within this
    337     // candidate.
    338     return extractInnerMatch(rawString, offset);
    339   }
    340 
    341   /**
    342    * Attempts to extract a match from {@code candidate} if the whole candidate does not qualify as a
    343    * match.
    344    *
    345    * @param candidate  the candidate text that might contain a phone number
    346    * @param offset  the current offset of {@code candidate} within {@link #text}
    347    * @return  the match found, null if none can be found
    348    */
    349   private PhoneNumberMatch extractInnerMatch(String candidate, int offset) {
    350     // Try removing either the first or last "group" in the number and see if this gives a result.
    351     // We consider white space to be a possible indication of the start or end of the phone number.
    352     Matcher groupMatcher = GROUP_SEPARATOR.matcher(candidate);
    353 
    354     if (groupMatcher.find()) {
    355       // Try the first group by itself.
    356       CharSequence firstGroupOnly = candidate.substring(0, groupMatcher.start());
    357       firstGroupOnly = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
    358                                            firstGroupOnly);
    359       PhoneNumberMatch match = parseAndVerify(firstGroupOnly.toString(), offset);
    360       if (match != null) {
    361         return match;
    362       }
    363       maxTries--;
    364 
    365       int withoutFirstGroupStart = groupMatcher.end();
    366       // Try the rest of the candidate without the first group.
    367       CharSequence withoutFirstGroup = candidate.substring(withoutFirstGroupStart);
    368       withoutFirstGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
    369                                               withoutFirstGroup);
    370       match = parseAndVerify(withoutFirstGroup.toString(), offset + withoutFirstGroupStart);
    371       if (match != null) {
    372         return match;
    373       }
    374       maxTries--;
    375 
    376       if (maxTries > 0) {
    377         int lastGroupStart = withoutFirstGroupStart;
    378         while (groupMatcher.find()) {
    379           // Find the last group.
    380           lastGroupStart = groupMatcher.start();
    381         }
    382         CharSequence withoutLastGroup = candidate.substring(0, lastGroupStart);
    383         withoutLastGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
    384                                                withoutLastGroup);
    385         if (withoutLastGroup.equals(firstGroupOnly)) {
    386           // If there are only two groups, then the group "without the last group" is the same as
    387           // the first group. In these cases, we don't want to re-check the number group, so we exit
    388           // already.
    389           return null;
    390         }
    391         match = parseAndVerify(withoutLastGroup.toString(), offset);
    392         if (match != null) {
    393           return match;
    394         }
    395         maxTries--;
    396       }
    397     }
    398     return null;
    399   }
    400 
    401   /**
    402    * Parses a phone number from the {@code candidate} using {@link PhoneNumberUtil#parse} and
    403    * verifies it matches the requested {@link #leniency}. If parsing and verification succeed, a
    404    * corresponding {@link PhoneNumberMatch} is returned, otherwise this method returns null.
    405    *
    406    * @param candidate  the candidate match
    407    * @param offset  the offset of {@code candidate} within {@link #text}
    408    * @return  the parsed and validated phone number match, or null
    409    */
    410   private PhoneNumberMatch parseAndVerify(String candidate, int offset) {
    411     try {
    412       // Check the candidate doesn't contain any formatting which would indicate that it really
    413       // isn't a phone number.
    414       if (!MATCHING_BRACKETS.matcher(candidate).matches()) {
    415         return null;
    416       }
    417 
    418       // If leniency is set to VALID or stricter, we also want to skip numbers that are surrounded
    419       // by Latin alphabetic characters, to skip cases like abc8005001234 or 8005001234def.
    420       if (leniency.compareTo(Leniency.VALID) >= 0) {
    421         // If the candidate is not at the start of the text, and does not start with phone-number
    422         // punctuation, check the previous character.
    423         if (offset > 0 && !LEAD_CLASS.matcher(candidate).lookingAt()) {
    424           char previousChar = text.charAt(offset - 1);
    425           // We return null if it is a latin letter or an invalid punctuation symbol.
    426           if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
    427             return null;
    428           }
    429         }
    430         int lastCharIndex = offset + candidate.length();
    431         if (lastCharIndex < text.length()) {
    432           char nextChar = text.charAt(lastCharIndex);
    433           if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
    434             return null;
    435           }
    436         }
    437       }
    438 
    439       PhoneNumber number = phoneUtil.parseAndKeepRawInput(candidate, preferredRegion);
    440       if (leniency.verify(number, candidate, phoneUtil)) {
    441         // We used parseAndKeepRawInput to create this number, but for now we don't return the extra
    442         // values parsed. TODO: stop clearing all values here and switch all users over
    443         // to using rawInput() rather than the rawString() of PhoneNumberMatch.
    444         number.clearCountryCodeSource();
    445         number.clearRawInput();
    446         number.clearPreferredDomesticCarrierCode();
    447         return new PhoneNumberMatch(offset, candidate, number);
    448       }
    449     } catch (NumberParseException e) {
    450       // ignore and continue
    451     }
    452     return null;
    453   }
    454 
    455   /**
    456    * Always throws {@link UnsupportedOperationException} as removal is not supported.
    457    */
    458   public void remove() {
    459     throw new UnsupportedOperationException();
    460   }
    461 }
    462